rewrite a href URLs for outline pages

This commit is contained in:
Kevin Pham
2023-11-30 19:45:51 -06:00
parent cd891d88a8
commit 0043b67d20
2 changed files with 58 additions and 45 deletions

View File

@@ -103,7 +103,7 @@ type ProxyChain struct {
Ruleset *ruleset.RuleSet Ruleset *ruleset.RuleSet
debugMode bool debugMode bool
abortErr error abortErr error
_apiPrefix string APIPrefix string
} }
// a ProxyStrategy is a pre-built proxychain with purpose-built defaults // a ProxyStrategy is a pre-built proxychain with purpose-built defaults
@@ -172,9 +172,8 @@ func (chain *ProxyChain) AddResponseModifications(mods ...ResponseModification)
// WithAPIPath trims the path during URL extraction. // WithAPIPath trims the path during URL extraction.
// example: using path = "api/outline/", a path like "http://localhost:8080/api/outline/https://example.com" becomes "https://example.com" // example: using path = "api/outline/", a path like "http://localhost:8080/api/outline/https://example.com" becomes "https://example.com"
func (chain *ProxyChain) WithAPIPath(path string) *ProxyChain { func (chain *ProxyChain) WithAPIPath(path string) *ProxyChain {
fmt.Println("===================") chain.APIPrefix = path
fmt.Printf("set path %s\n", path) chain.APIPrefix = strings.TrimSuffix(chain.APIPrefix, "*")
chain._apiPrefix = path
return chain return chain
} }
@@ -268,9 +267,9 @@ func (chain *ProxyChain) extractURL() (*url.URL, error) {
fmt.Println("XXXXXXXXXXXXXXXX") fmt.Println("XXXXXXXXXXXXXXXX")
fmt.Println(reqURL) fmt.Println(reqURL)
fmt.Println(chain._apiPrefix) fmt.Println(chain.APIPrefix)
reqURL = strings.TrimPrefix(reqURL, chain._apiPrefix) reqURL = strings.TrimPrefix(reqURL, chain.APIPrefix)
// sometimes client requests doubleroot '//' // sometimes client requests doubleroot '//'
// there is a bug somewhere else, but this is a workaround until we find it // there is a bug somewhere else, but this is a workaround until we find it
@@ -507,43 +506,12 @@ func (chain *ProxyChain) Execute() error {
} }
// in case api user did not set or forward content-type, we do it for them // in case api user did not set or forward content-type, we do it for them
/* if chain.Context.Get("content-type") == "" {
if chain.Context.Get("content-type") == "" { chain.Context.Set("content-type", chain.Response.Header.Get("content-type"))
chain.Context.Set("content-type", chain.Response.Header.Get("content-type")) }
}
*/
// Return request back to client // Return request back to client
return chain.Context.SendStream(body) return chain.Context.SendStream(body)
// return chain.Context.SendStream(body) // return chain.Context.SendStream(body)
} }
// ExecuteForOutline runs the chain through _execute and returns the complete
// response body as a string.
//
// chain._reset is deferred, so it runs on every return path. Errors from
// _execute and from reading the body are logged and then returned unchanged;
// a nil chain.Context produces a "no context set" error (not logged).
func (chain *ProxyChain) ExecuteForOutline() (string, error) {
	defer chain._reset()

	// logged writes err to the standard logger and returns it with an empty result.
	logged := func(err error) (string, error) {
		log.Println(err)
		return "", err
	}

	body, err := chain._execute()
	if err != nil {
		return logged(err)
	}
	if chain.Context == nil {
		return "", errors.New("no context set")
	}

	// Content-type forwarding is commented out in this function:
	//
	//	if chain.Context.Get("content-type") == "" {
	//		chain.Context.Set("content-type", chain.Response.Header.Get("content-type"))
	//	}

	// Drain the body into memory so it can be handed back as a single string.
	raw, err := io.ReadAll(body)
	if err != nil {
		return logged(err)
	}
	return string(raw), nil
}

View File

@@ -4,12 +4,15 @@ import (
"bytes" "bytes"
"embed" "embed"
"fmt" "fmt"
"golang.org/x/net/html"
"golang.org/x/net/html/atom"
"html/template" "html/template"
"io" "io"
"ladder/proxychain" "ladder/proxychain"
"log" "log"
"net/url"
"strings"
"golang.org/x/net/html"
"golang.org/x/net/html/atom"
//"github.com/go-shiori/dom" //"github.com/go-shiori/dom"
"github.com/markusmobius/go-trafilatura" "github.com/markusmobius/go-trafilatura"
@@ -35,7 +38,7 @@ func GenerateReadableOutline() proxychain.ResponseModification {
// 1. extract dom contents using reading mode algo // 1. extract dom contents using reading mode algo
// =========================================================== // ===========================================================
opts := trafilatura.Options{ opts := trafilatura.Options{
IncludeImages: true, IncludeImages: false,
IncludeLinks: true, IncludeLinks: true,
FavorRecall: true, FavorRecall: true,
Deduplicate: true, Deduplicate: true,
@@ -55,6 +58,8 @@ func GenerateReadableOutline() proxychain.ResponseModification {
// render DOM to string without H1 title // render DOM to string without H1 title
removeFirstH1(extract.ContentNode) removeFirstH1(extract.ContentNode)
// rewrite all links to stay on /outline/ path
rewriteHrefLinks(extract.ContentNode, chain.Context.BaseURL(), chain.APIPrefix)
var b bytes.Buffer var b bytes.Buffer
html.Render(&b, extract.ContentNode) html.Render(&b, extract.ContentNode)
distilledHTML := b.String() distilledHTML := b.String()
@@ -62,11 +67,10 @@ func GenerateReadableOutline() proxychain.ResponseModification {
// populate template parameters // populate template parameters
data := map[string]interface{}{ data := map[string]interface{}{
"Success": true, "Success": true,
"Footer": extract.Metadata.License,
"Image": extract.Metadata.Image, "Image": extract.Metadata.Image,
"Description": extract.Metadata.Description, "Description": extract.Metadata.Description,
"Hostname": extract.Metadata.Hostname, "Hostname": extract.Metadata.Hostname,
"Url": chain.Request.URL, "Url": "/" + chain.Request.URL.String(),
"Title": extract.Metadata.Title, // todo: modify CreateReadableDocument so we don't have <h1> titles duplicated? "Title": extract.Metadata.Title, // todo: modify CreateReadableDocument so we don't have <h1> titles duplicated?
"Date": extract.Metadata.Date.String(), "Date": extract.Metadata.Date.String(),
"Author": extract.Metadata.Author, "Author": extract.Metadata.Author,
@@ -96,6 +100,10 @@ func GenerateReadableOutline() proxychain.ResponseModification {
} }
} }
// =============================================
// DOM Rendering helpers
// =============================================
func removeFirstH1(n *html.Node) { func removeFirstH1(n *html.Node) {
var recurse func(*html.Node) bool var recurse func(*html.Node) bool
recurse = func(n *html.Node) bool { recurse = func(n *html.Node) bool {
@@ -112,3 +120,40 @@ func removeFirstH1(n *html.Node) {
} }
recurse(n) recurse(n)
} }
// rewriteHrefLinks walks the DOM tree rooted at n and rewrites the href
// attribute of every <a> element so that links stay under apiPath.
//
// Two rewrites are applied to each href, in this order:
//
//  1. The first occurrence of the proxy origin (scheme://host taken from
//     baseURL) is replaced with origin + "/" + apiPath, e.g.
//     http://localhost:8080/https://example.com ->
//     http://localhost:8080/outline/https://example.com
//  2. If the resulting value starts with "/", "/" + apiPath is prepended.
//
// apiPath is trimmed of leading and trailing slashes before use. If it is
// empty after trimming there is nothing to insert and the tree is left
// untouched. If baseURL cannot be parsed, or has no scheme or host, the
// problem is logged where applicable and only rewrite 2 is performed.
func rewriteHrefLinks(n *html.Node, baseURL string, apiPath string) {
	apiPath = strings.Trim(apiPath, "/")
	if n == nil || apiPath == "" {
		// An empty prefix would turn "/foo" into "//foo" (protocol-relative).
		return
	}

	u, err := url.Parse(baseURL)
	if err != nil {
		log.Printf("GenerateReadableOutline :: rewriteHrefLinks error - %s\n", err)
	}

	// Only rewrite absolute proxy links when a usable origin is available.
	// Without this guard a nil u would panic, and an empty scheme/host would
	// make proxyURL "://" and corrupt every absolute link.
	var proxyURL, newProxyURL string
	if err == nil && u.Scheme != "" && u.Host != "" {
		proxyURL = u.Scheme + "://" + u.Host
		newProxyURL = proxyURL + "/" + apiPath
	}

	var walk func(*html.Node)
	walk = func(node *html.Node) {
		if node.Type == html.ElementNode && node.DataAtom == atom.A {
			for i := range node.Attr {
				attr := &node.Attr[i]
				if attr.Key != "href" {
					continue
				}
				// rewrite url on a.href so it points at the apiPath route
				if proxyURL != "" {
					attr.Val = strings.Replace(attr.Val, proxyURL, newProxyURL, 1)
				}
				// rewrite relative URLs too
				if strings.HasPrefix(attr.Val, "/") {
					attr.Val = "/" + apiPath + attr.Val
				}
			}
		}
		for c := node.FirstChild; c != nil; c = c.NextSibling {
			walk(c)
		}
	}
	walk(n)
}