From 0043b67d2007963b2dde7873c81263aaa639ee73 Mon Sep 17 00:00:00 2001
From: Kevin Pham
Date: Thu, 30 Nov 2023 19:45:51 -0600
Subject: [PATCH] rewrite a href URLs for outline pages

Keep links inside a rendered outline page pointing back at the outline
endpoint instead of at the plain proxy route.

proxychain:
- export ProxyChain.APIPrefix (previously _apiPrefix) so response
  modifiers can read it; WithAPIPath now trims a trailing "*" from the
  prefix and no longer prints debug output
- Execute sets the content-type from the upstream response again when
  the context has none
- drop ExecuteForOutline

responsemodifers/GenerateReadableOutline:
- add rewriteHrefLinks, which walks the extracted DOM and prefixes the
  API path onto <a href> values that start with the proxy base URL or
  with "/"; it bails out when the proxy base URL cannot be parsed
- stop including images in the extraction, drop the "Footer" template
  parameter and pass "Url" as "/" + the request URL string

---
 proxychain/proxychain.go                      | 48 +++-------------
 .../generate_readable_outline.go              | 55 +++++++++++++++++--
 2 files changed, 58 insertions(+), 45 deletions(-)

diff --git a/proxychain/proxychain.go b/proxychain/proxychain.go
index 1b2053b..f4f36da 100644
--- a/proxychain/proxychain.go
+++ b/proxychain/proxychain.go
@@ -103,7 +103,7 @@ type ProxyChain struct {
 	Ruleset    *ruleset.RuleSet
 	debugMode  bool
 	abortErr   error
-	_apiPrefix string
+	APIPrefix  string
 }
 
 // a ProxyStrategy is a pre-built proxychain with purpose-built defaults
@@ -172,9 +172,8 @@ func (chain *ProxyChain) AddResponseModifications(mods ...ResponseModification)
 // WithAPIPath trims the path during URL extraction.
 // example: using path = "api/outline/", a path like "http://localhost:8080/api/outline/https://example.com" becomes "https://example.com"
 func (chain *ProxyChain) WithAPIPath(path string) *ProxyChain {
-	fmt.Println("===================")
-	fmt.Printf("set path %s\n", path)
-	chain._apiPrefix = path
+	chain.APIPrefix = path
+	chain.APIPrefix = strings.TrimSuffix(chain.APIPrefix, "*")
 	return chain
 }
 
@@ -268,9 +267,9 @@ func (chain *ProxyChain) extractURL() (*url.URL, error) {
 
 	fmt.Println("XXXXXXXXXXXXXXXX")
 	fmt.Println(reqURL)
-	fmt.Println(chain._apiPrefix)
+	fmt.Println(chain.APIPrefix)
 
-	reqURL = strings.TrimPrefix(reqURL, chain._apiPrefix)
+	reqURL = strings.TrimPrefix(reqURL, chain.APIPrefix)
 
 	// sometimes client requests doubleroot '//'
 	// there is a bug somewhere else, but this is a workaround until we find it
@@ -507,43 +506,12 @@ func (chain *ProxyChain) Execute() error {
 	}
 
 	// in case api user did not set or forward content-type, we do it for them
-	/*
-		if chain.Context.Get("content-type") == "" {
-			chain.Context.Set("content-type", chain.Response.Header.Get("content-type"))
-		}
-	*/
+	if chain.Context.Get("content-type") == "" {
+		chain.Context.Set("content-type", chain.Response.Header.Get("content-type"))
+	}
 
 	// Return request back to client
 	return chain.Context.SendStream(body)
 
 	// return chain.Context.SendStream(body)
 }
-
-func (chain *ProxyChain) ExecuteForOutline() (string, error) {
-	defer chain._reset()
-	body, err := chain._execute()
-	if err != nil {
-		log.Println(err)
-		return "", err
-	}
-	if chain.Context == nil {
-		return "", errors.New("no context set")
-	}
-
-	// in case api user did not set or forward content-type, we do it for them
-	/*
-		if chain.Context.Get("content-type") == "" {
-			chain.Context.Set("content-type", chain.Response.Header.Get("content-type"))
-		}
-	*/
-
-	// Capture the HTML content in a variable
-	htmlContent, err := io.ReadAll(body)
-	if err != nil {
-		log.Println(err)
-		return "", err
-	}
-
-	// Return the HTML content to the client
-	return string(htmlContent), nil
-}
diff --git a/proxychain/responsemodifers/generate_readable_outline.go b/proxychain/responsemodifers/generate_readable_outline.go
index dc621fa..a73e963 100644
--- a/proxychain/responsemodifers/generate_readable_outline.go
+++ b/proxychain/responsemodifers/generate_readable_outline.go
@@ -4,12 +4,15 @@ import (
 	"bytes"
 	"embed"
 	"fmt"
-	"golang.org/x/net/html"
-	"golang.org/x/net/html/atom"
 	"html/template"
 	"io"
 	"ladder/proxychain"
 	"log"
+	"net/url"
+	"strings"
+
+	"golang.org/x/net/html"
+	"golang.org/x/net/html/atom"
 
 	//"github.com/go-shiori/dom"
 	"github.com/markusmobius/go-trafilatura"
@@ -35,7 +38,7 @@ func GenerateReadableOutline() proxychain.ResponseModification {
 		// 1. extract dom contents using reading mode algo
 		// ===========================================================
 		opts := trafilatura.Options{
-			IncludeImages: true,
+			IncludeImages: false,
 			IncludeLinks:  true,
 			FavorRecall:   true,
 			Deduplicate:   true,
@@ -55,6 +58,8 @@ func GenerateReadableOutline() proxychain.ResponseModification {
 
 		// render DOM to string without H1 title
 		removeFirstH1(extract.ContentNode)
+		// rewrite all links to stay on /outline/ path
+		rewriteHrefLinks(extract.ContentNode, chain.Context.BaseURL(), chain.APIPrefix)
 		var b bytes.Buffer
 		html.Render(&b, extract.ContentNode)
 		distilledHTML := b.String()
@@ -62,11 +67,10 @@ func GenerateReadableOutline() proxychain.ResponseModification {
 		// populate template parameters
 		data := map[string]interface{}{
 			"Success":     true,
-			"Footer":      extract.Metadata.License,
 			"Image":       extract.Metadata.Image,
 			"Description": extract.Metadata.Description,
 			"Hostname":    extract.Metadata.Hostname,
-			"Url":         chain.Request.URL,
+			"Url":         "/" + chain.Request.URL.String(),
 			"Title":       extract.Metadata.Title, // todo: modify CreateReadableDocument so we don't have titles duplicated?
 			"Date":        extract.Metadata.Date.String(),
 			"Author":      extract.Metadata.Author,
@@ -96,6 +100,10 @@ func GenerateReadableOutline() proxychain.ResponseModification {
 	}
 }
 
+// =============================================
+// DOM Rendering helpers
+// =============================================
+
 func removeFirstH1(n *html.Node) {
 	var recurse func(*html.Node) bool
 	recurse = func(n *html.Node) bool {
@@ -112,3 +120,40 @@ func removeFirstH1(n *html.Node) {
 	}
 	recurse(n)
 }
+
+func rewriteHrefLinks(n *html.Node, baseURL string, apiPath string) {
+	u, err := url.Parse(baseURL)
+	if err != nil {
+		log.Printf("GenerateReadableOutline :: rewriteHrefLinks error - %s\n", err)
+		return // url.Parse yields a nil *URL on error; dereferencing it below would panic
+	}
+	apiPath = strings.Trim(apiPath, "/")
+	proxyURL := fmt.Sprintf("%s://%s", u.Scheme, u.Host)
+	newProxyURL := fmt.Sprintf("%s/%s", proxyURL, apiPath)
+
+	var recurse func(*html.Node) bool
+	recurse = func(n *html.Node) bool {
+		if n.Type == html.ElementNode && n.DataAtom == atom.A {
+			for i := range n.Attr {
+				attr := n.Attr[i]
+				if attr.Key != "href" {
+					continue
+				}
+				// rewrite url on a.href: http://localhost:8080/https://example.com -> http://localhost:8080/outline/https://example.com
+				attr.Val = strings.Replace(attr.Val, proxyURL, newProxyURL, 1)
+				// rewrite relative URLs too
+				if strings.HasPrefix(attr.Val, "/") {
+					attr.Val = fmt.Sprintf("/%s%s", apiPath, attr.Val)
+				}
+				n.Attr[i].Val = attr.Val
+				log.Println(attr.Val)
+			}
+		}
+
+		for c := n.FirstChild; c != nil; c = c.NextSibling {
+			recurse(c)
+		}
+		return false
+	}
+	recurse(n)
+}