Rewrite `<a href>` URLs for outline pages
This commit is contained in:
@@ -103,7 +103,7 @@ type ProxyChain struct {
|
|||||||
Ruleset *ruleset.RuleSet
|
Ruleset *ruleset.RuleSet
|
||||||
debugMode bool
|
debugMode bool
|
||||||
abortErr error
|
abortErr error
|
||||||
_apiPrefix string
|
APIPrefix string
|
||||||
}
|
}
|
||||||
|
|
||||||
// a ProxyStrategy is a pre-built proxychain with purpose-built defaults
|
// a ProxyStrategy is a pre-built proxychain with purpose-built defaults
|
||||||
@@ -172,9 +172,8 @@ func (chain *ProxyChain) AddResponseModifications(mods ...ResponseModification)
|
|||||||
// WithAPIPath trims the path during URL extraction.
|
// WithAPIPath trims the path during URL extraction.
|
||||||
// example: using path = "api/outline/", a path like "http://localhost:8080/api/outline/https://example.com" becomes "https://example.com"
|
// example: using path = "api/outline/", a path like "http://localhost:8080/api/outline/https://example.com" becomes "https://example.com"
|
||||||
func (chain *ProxyChain) WithAPIPath(path string) *ProxyChain {
|
func (chain *ProxyChain) WithAPIPath(path string) *ProxyChain {
|
||||||
fmt.Println("===================")
|
chain.APIPrefix = path
|
||||||
fmt.Printf("set path %s\n", path)
|
chain.APIPrefix = strings.TrimSuffix(chain.APIPrefix, "*")
|
||||||
chain._apiPrefix = path
|
|
||||||
return chain
|
return chain
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -268,9 +267,9 @@ func (chain *ProxyChain) extractURL() (*url.URL, error) {
|
|||||||
|
|
||||||
fmt.Println("XXXXXXXXXXXXXXXX")
|
fmt.Println("XXXXXXXXXXXXXXXX")
|
||||||
fmt.Println(reqURL)
|
fmt.Println(reqURL)
|
||||||
fmt.Println(chain._apiPrefix)
|
fmt.Println(chain.APIPrefix)
|
||||||
|
|
||||||
reqURL = strings.TrimPrefix(reqURL, chain._apiPrefix)
|
reqURL = strings.TrimPrefix(reqURL, chain.APIPrefix)
|
||||||
|
|
||||||
// sometimes client requests doubleroot '//'
|
// sometimes client requests doubleroot '//'
|
||||||
// there is a bug somewhere else, but this is a workaround until we find it
|
// there is a bug somewhere else, but this is a workaround until we find it
|
||||||
@@ -507,43 +506,12 @@ func (chain *ProxyChain) Execute() error {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// in case api user did not set or forward content-type, we do it for them
|
// in case api user did not set or forward content-type, we do it for them
|
||||||
/*
|
|
||||||
if chain.Context.Get("content-type") == "" {
|
if chain.Context.Get("content-type") == "" {
|
||||||
chain.Context.Set("content-type", chain.Response.Header.Get("content-type"))
|
chain.Context.Set("content-type", chain.Response.Header.Get("content-type"))
|
||||||
}
|
}
|
||||||
*/
|
|
||||||
|
|
||||||
// Return request back to client
|
// Return request back to client
|
||||||
return chain.Context.SendStream(body)
|
return chain.Context.SendStream(body)
|
||||||
|
|
||||||
// return chain.Context.SendStream(body)
|
// return chain.Context.SendStream(body)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (chain *ProxyChain) ExecuteForOutline() (string, error) {
|
|
||||||
defer chain._reset()
|
|
||||||
body, err := chain._execute()
|
|
||||||
if err != nil {
|
|
||||||
log.Println(err)
|
|
||||||
return "", err
|
|
||||||
}
|
|
||||||
if chain.Context == nil {
|
|
||||||
return "", errors.New("no context set")
|
|
||||||
}
|
|
||||||
|
|
||||||
// in case api user did not set or forward content-type, we do it for them
|
|
||||||
/*
|
|
||||||
if chain.Context.Get("content-type") == "" {
|
|
||||||
chain.Context.Set("content-type", chain.Response.Header.Get("content-type"))
|
|
||||||
}
|
|
||||||
*/
|
|
||||||
|
|
||||||
// Capture the HTML content in a variable
|
|
||||||
htmlContent, err := io.ReadAll(body)
|
|
||||||
if err != nil {
|
|
||||||
log.Println(err)
|
|
||||||
return "", err
|
|
||||||
}
|
|
||||||
|
|
||||||
// Return the HTML content to the client
|
|
||||||
return string(htmlContent), nil
|
|
||||||
}
|
|
||||||
|
|||||||
@@ -4,12 +4,15 @@ import (
|
|||||||
"bytes"
|
"bytes"
|
||||||
"embed"
|
"embed"
|
||||||
"fmt"
|
"fmt"
|
||||||
"golang.org/x/net/html"
|
|
||||||
"golang.org/x/net/html/atom"
|
|
||||||
"html/template"
|
"html/template"
|
||||||
"io"
|
"io"
|
||||||
"ladder/proxychain"
|
"ladder/proxychain"
|
||||||
"log"
|
"log"
|
||||||
|
"net/url"
|
||||||
|
"strings"
|
||||||
|
|
||||||
|
"golang.org/x/net/html"
|
||||||
|
"golang.org/x/net/html/atom"
|
||||||
|
|
||||||
//"github.com/go-shiori/dom"
|
//"github.com/go-shiori/dom"
|
||||||
"github.com/markusmobius/go-trafilatura"
|
"github.com/markusmobius/go-trafilatura"
|
||||||
@@ -35,7 +38,7 @@ func GenerateReadableOutline() proxychain.ResponseModification {
|
|||||||
// 1. extract dom contents using reading mode algo
|
// 1. extract dom contents using reading mode algo
|
||||||
// ===========================================================
|
// ===========================================================
|
||||||
opts := trafilatura.Options{
|
opts := trafilatura.Options{
|
||||||
IncludeImages: true,
|
IncludeImages: false,
|
||||||
IncludeLinks: true,
|
IncludeLinks: true,
|
||||||
FavorRecall: true,
|
FavorRecall: true,
|
||||||
Deduplicate: true,
|
Deduplicate: true,
|
||||||
@@ -55,6 +58,8 @@ func GenerateReadableOutline() proxychain.ResponseModification {
|
|||||||
|
|
||||||
// render DOM to string without H1 title
|
// render DOM to string without H1 title
|
||||||
removeFirstH1(extract.ContentNode)
|
removeFirstH1(extract.ContentNode)
|
||||||
|
// rewrite all links to stay on /outline/ path
|
||||||
|
rewriteHrefLinks(extract.ContentNode, chain.Context.BaseURL(), chain.APIPrefix)
|
||||||
var b bytes.Buffer
|
var b bytes.Buffer
|
||||||
html.Render(&b, extract.ContentNode)
|
html.Render(&b, extract.ContentNode)
|
||||||
distilledHTML := b.String()
|
distilledHTML := b.String()
|
||||||
@@ -62,11 +67,10 @@ func GenerateReadableOutline() proxychain.ResponseModification {
|
|||||||
// populate template parameters
|
// populate template parameters
|
||||||
data := map[string]interface{}{
|
data := map[string]interface{}{
|
||||||
"Success": true,
|
"Success": true,
|
||||||
"Footer": extract.Metadata.License,
|
|
||||||
"Image": extract.Metadata.Image,
|
"Image": extract.Metadata.Image,
|
||||||
"Description": extract.Metadata.Description,
|
"Description": extract.Metadata.Description,
|
||||||
"Hostname": extract.Metadata.Hostname,
|
"Hostname": extract.Metadata.Hostname,
|
||||||
"Url": chain.Request.URL,
|
"Url": "/" + chain.Request.URL.String(),
|
||||||
"Title": extract.Metadata.Title, // todo: modify CreateReadableDocument so we don't have <h1> titles duplicated?
|
"Title": extract.Metadata.Title, // todo: modify CreateReadableDocument so we don't have <h1> titles duplicated?
|
||||||
"Date": extract.Metadata.Date.String(),
|
"Date": extract.Metadata.Date.String(),
|
||||||
"Author": extract.Metadata.Author,
|
"Author": extract.Metadata.Author,
|
||||||
@@ -96,6 +100,10 @@ func GenerateReadableOutline() proxychain.ResponseModification {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// =============================================
|
||||||
|
// DOM Rendering helpers
|
||||||
|
// =============================================
|
||||||
|
|
||||||
func removeFirstH1(n *html.Node) {
|
func removeFirstH1(n *html.Node) {
|
||||||
var recurse func(*html.Node) bool
|
var recurse func(*html.Node) bool
|
||||||
recurse = func(n *html.Node) bool {
|
recurse = func(n *html.Node) bool {
|
||||||
@@ -112,3 +120,40 @@ func removeFirstH1(n *html.Node) {
|
|||||||
}
|
}
|
||||||
recurse(n)
|
recurse(n)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func rewriteHrefLinks(n *html.Node, baseURL string, apiPath string) {
|
||||||
|
u, err := url.Parse(baseURL)
|
||||||
|
if err != nil {
|
||||||
|
log.Printf("GenerateReadableOutline :: rewriteHrefLinks error - %s\n", err)
|
||||||
|
}
|
||||||
|
apiPath = strings.Trim(apiPath, "/")
|
||||||
|
proxyURL := fmt.Sprintf("%s://%s", u.Scheme, u.Host)
|
||||||
|
newProxyURL := fmt.Sprintf("%s/%s", proxyURL, apiPath)
|
||||||
|
|
||||||
|
var recurse func(*html.Node) bool
|
||||||
|
recurse = func(n *html.Node) bool {
|
||||||
|
|
||||||
|
if n.Type == html.ElementNode && n.DataAtom == atom.A {
|
||||||
|
for i := range n.Attr {
|
||||||
|
attr := n.Attr[i]
|
||||||
|
if attr.Key != "href" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
// rewrite url on a.href: http://localhost:8080/https://example.com -> http://localhost:8080/outline/https://example.com
|
||||||
|
attr.Val = strings.Replace(attr.Val, proxyURL, newProxyURL, 1)
|
||||||
|
// rewrite relative URLs too
|
||||||
|
if strings.HasPrefix(attr.Val, "/") {
|
||||||
|
attr.Val = fmt.Sprintf("/%s%s", apiPath, attr.Val)
|
||||||
|
}
|
||||||
|
n.Attr[i].Val = attr.Val
|
||||||
|
log.Println(attr.Val)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for c := n.FirstChild; c != nil; c = c.NextSibling {
|
||||||
|
recurse(c)
|
||||||
|
}
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
recurse(n)
|
||||||
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user