rewrite a href URLs for outline pages

2023-11-30 19:45:51 -06:00
parent cd891d88a8
commit 0043b67d20
2 changed files with 58 additions and 45 deletions
--- a/proxychain/proxychain.go
+++ b/proxychain/proxychain.go
@@ -103,7 +103,7 @@ type ProxyChain struct {
 	Ruleset                   *ruleset.RuleSet
 	debugMode                 bool
 	abortErr                  error
-	_apiPrefix                string
+	APIPrefix                 string
 }
 // a ProxyStrategy is a pre-built proxychain with purpose-built defaults
@@ -172,9 +172,8 @@ func (chain *ProxyChain) AddResponseModifications(mods ...ResponseModification)
 // WithAPIPath trims the path during URL extraction.
 // example: using path = "api/outline/", a path like "http://localhost:8080/api/outline/https://example.com" becomes "https://example.com"
 func (chain *ProxyChain) WithAPIPath(path string) *ProxyChain {
-	fmt.Println("===================")
+	chain.APIPrefix = path
-	fmt.Printf("set path %s\n", path)
+	chain.APIPrefix = strings.TrimSuffix(chain.APIPrefix, "*")
 	chain._apiPrefix = path
 	return chain
 }
@@ -268,9 +267,9 @@ func (chain *ProxyChain) extractURL() (*url.URL, error) {
 	fmt.Println("XXXXXXXXXXXXXXXX")
 	fmt.Println(reqURL)
-	fmt.Println(chain._apiPrefix)
+	fmt.Println(chain.APIPrefix)
-	reqURL = strings.TrimPrefix(reqURL, chain._apiPrefix)
+	reqURL = strings.TrimPrefix(reqURL, chain.APIPrefix)
 	// sometimes client requests doubleroot '//'
 	// there is a bug somewhere else, but this is a workaround until we find it
@@ -507,43 +506,12 @@ func (chain *ProxyChain) Execute() error {
 	}
 	// in case api user did not set or forward content-type, we do it for them
-	/*
+	if chain.Context.Get("content-type") == "" {
-		if chain.Context.Get("content-type") == "" {
+		chain.Context.Set("content-type", chain.Response.Header.Get("content-type"))
-			chain.Context.Set("content-type", chain.Response.Header.Get("content-type"))
+	}
 		}
 	*/
 	// Return request back to client
 	return chain.Context.SendStream(body)
 	// return chain.Context.SendStream(body)
 }
 // ExecuteForOutline runs the chain via _execute and returns the entire
 // response body as a string instead of streaming it to the client.
 //
 // chain._reset is deferred, so it runs on every return path. An error from
 // _execute or from reading the body is logged and returned with an empty
 // string. If the chain has no Context after execution, a "no context set"
 // error is returned.
 func (chain *ProxyChain) ExecuteForOutline() (string, error) {
 	defer chain._reset()

 	body, err := chain._execute()
 	switch {
 	case err != nil:
 		log.Println(err)
 		return "", err
 	case chain.Context == nil:
 		return "", errors.New("no context set")
 	}

 	// NOTE: copying the upstream content-type header onto the context is
 	// currently disabled here (that code was commented out):
 	//
 	//	if chain.Context.Get("content-type") == "" {
 	//		chain.Context.Set("content-type", chain.Response.Header.Get("content-type"))
 	//	}

 	// Buffer the whole body so the caller receives the HTML as one string.
 	raw, err := io.ReadAll(body)
 	if err == nil {
 		return string(raw), nil
 	}
 	log.Println(err)
 	return "", err
 }
--- a/proxychain/responsemodifers/generate_readable_outline.go
+++ b/proxychain/responsemodifers/generate_readable_outline.go
@@ -4,12 +4,15 @@ import (
 	"bytes"
 	"embed"
 	"fmt"
 	"golang.org/x/net/html"
 	"golang.org/x/net/html/atom"
 	"html/template"
 	"io"
 	"ladder/proxychain"
 	"log"
 	"net/url"
 	"strings"
 	"golang.org/x/net/html"
 	"golang.org/x/net/html/atom"
 	//"github.com/go-shiori/dom"
 	"github.com/markusmobius/go-trafilatura"
@@ -35,7 +38,7 @@ func GenerateReadableOutline() proxychain.ResponseModification {
 		// 1. extract dom contents using reading mode algo
 		// ===========================================================
 		opts := trafilatura.Options{
-			IncludeImages:      true,
+			IncludeImages:      false,
 			IncludeLinks:       true,
 			FavorRecall:        true,
 			Deduplicate:        true,
@@ -55,6 +58,8 @@ func GenerateReadableOutline() proxychain.ResponseModification {
 		// render DOM to string without H1 title
 		removeFirstH1(extract.ContentNode)
 		// rewrite all links to stay on /outline/ path
 		rewriteHrefLinks(extract.ContentNode, chain.Context.BaseURL(), chain.APIPrefix)
 		var b bytes.Buffer
 		html.Render(&b, extract.ContentNode)
 		distilledHTML := b.String()
@@ -62,11 +67,10 @@ func GenerateReadableOutline() proxychain.ResponseModification {
 		// populate template parameters
 		data := map[string]interface{}{
 			"Success":     true,
 			"Footer":      extract.Metadata.License,
 			"Image":       extract.Metadata.Image,
 			"Description": extract.Metadata.Description,
 			"Hostname":    extract.Metadata.Hostname,
-			"Url":         chain.Request.URL,
+			"Url":         "/" + chain.Request.URL.String(),
 			"Title":       extract.Metadata.Title, // todo: modify CreateReadableDocument so we don't have <h1> titles duplicated?
 			"Date":        extract.Metadata.Date.String(),
 			"Author":      extract.Metadata.Author,
@@ -96,6 +100,10 @@ func GenerateReadableOutline() proxychain.ResponseModification {
 	}
 }
 // =============================================
 // DOM Rendering helpers
 // =============================================
 func removeFirstH1(n *html.Node) {
 	var recurse func(*html.Node) bool
 	recurse = func(n *html.Node) bool {
@@ -112,3 +120,40 @@ func removeFirstH1(n *html.Node) {
 	}
 	recurse(n)
 }
 // rewriteHrefLinks walks the DOM rooted at n and rewrites the href attribute
 // of every <a> element so that followed links stay under the proxy's API path.
 // The tree is modified in place.
 //
 // baseURL is the proxy's own base URL, e.g. "http://localhost:8080"; only its
 // scheme and host are used. apiPath is the route prefix to insert, e.g.
 // "outline"; leading and trailing slashes are ignored.
 //
 // Two rewrites are applied to each href, in order:
 //  1. the first occurrence of "<scheme>://<host>" becomes "<scheme>://<host>/<apiPath>"
 //  2. a value starting with "/" gets "/<apiPath>" prepended
 //
 // The tree is left untouched when n is nil, when baseURL cannot be parsed
 // (the error is logged), or when apiPath is empty after trimming.
 func rewriteHrefLinks(n *html.Node, baseURL string, apiPath string) {
 	if n == nil {
 		return
 	}

 	u, err := url.Parse(baseURL)
 	if err != nil {
 		// u is nil on failure; stop here instead of dereferencing it below.
 		log.Printf("GenerateReadableOutline :: rewriteHrefLinks error - %s\n", err)
 		return
 	}

 	apiPath = strings.Trim(apiPath, "/")
 	if apiPath == "" {
 		// With no prefix to insert, the rewrites below would turn "/x" into the
 		// protocol-relative "//x" and add a stray "/" after the proxy origin.
 		return
 	}

 	proxyURL := fmt.Sprintf("%s://%s", u.Scheme, u.Host)
 	newProxyURL := fmt.Sprintf("%s/%s", proxyURL, apiPath)
 	relativePrefix := "/" + apiPath

 	var recurse func(*html.Node)
 	recurse = func(n *html.Node) {
 		if n.Type == html.ElementNode && n.DataAtom == atom.A {
 			for i := range n.Attr {
 				// take a pointer so the edit lands on the node's own attribute
 				attr := &n.Attr[i]
 				if attr.Key != "href" {
 					continue
 				}
 				// rewrite url on a.href: http://localhost:8080/https://example.com -> http://localhost:8080/outline/https://example.com
 				attr.Val = strings.Replace(attr.Val, proxyURL, newProxyURL, 1)
 				// rewrite relative URLs too
 				if strings.HasPrefix(attr.Val, "/") {
 					attr.Val = relativePrefix + attr.Val
 				}
 				log.Println(attr.Val)
 			}
 		}
 		for c := n.FirstChild; c != nil; c = c.NextSibling {
 			recurse(c)
 		}
 	}
 	recurse(n)
 }