From cd891d88a81f488e6944326e36bd04c92fb29d59 Mon Sep 17 00:00:00 2001 From: Kevin Pham Date: Thu, 30 Nov 2023 19:06:33 -0600 Subject: [PATCH] tweak outline template rendering --- .../generate_readable_outline.go | 58 ++++++++++++++----- .../generate_readable_outline.html | 23 ++++---- 2 files changed, 54 insertions(+), 27 deletions(-) diff --git a/proxychain/responsemodifers/generate_readable_outline.go b/proxychain/responsemodifers/generate_readable_outline.go index 67dae53..dc621fa 100644 --- a/proxychain/responsemodifers/generate_readable_outline.go +++ b/proxychain/responsemodifers/generate_readable_outline.go @@ -1,14 +1,17 @@ package responsemodifers import ( + "bytes" "embed" "fmt" + "golang.org/x/net/html" + "golang.org/x/net/html/atom" "html/template" "io" "ladder/proxychain" "log" - "github.com/go-shiori/dom" + //"github.com/go-shiori/dom" "github.com/markusmobius/go-trafilatura" ) @@ -32,32 +35,42 @@ func GenerateReadableOutline() proxychain.ResponseModification { // 1. extract dom contents using reading mode algo // =========================================================== opts := trafilatura.Options{ - IncludeImages: true, - IncludeLinks: true, - //FavorPrecision: true, + IncludeImages: true, + IncludeLinks: true, + FavorRecall: true, + Deduplicate: true, FallbackCandidates: nil, // TODO: https://github.com/markusmobius/go-trafilatura/blob/main/examples/chained/main.go // implement fallbacks from "github.com/markusmobius/go-domdistiller" and "github.com/go-shiori/go-readability" OriginalURL: chain.Request.URL, } - result, err := trafilatura.Extract(chain.Response.Body, opts) + extract, err := trafilatura.Extract(chain.Response.Body, opts) if err != nil { return err } - doc := trafilatura.CreateReadableDocument(result) - distilledHTML := dom.OuterHTML(doc) - // ============================================================================ // 2. 
render generate_readable_outline.html template using metadata from step 1 // ============================================================================ + + // render DOM to string without H1 title + removeFirstH1(extract.ContentNode) + var b bytes.Buffer + html.Render(&b, extract.ContentNode) + distilledHTML := b.String() + + // populate template parameters data := map[string]interface{}{ - "Success": true, - "Params": chain.Request.URL, - //"Title": result.Metadata.Title, // todo: modify CreateReadableDocument so we don't have

titles duplicated? - "Date": result.Metadata.Date.String(), - "Author": result.Metadata.Author, - "Body": distilledHTML, + "Success": true, + "Footer": extract.Metadata.License, + "Image": extract.Metadata.Image, + "Description": extract.Metadata.Description, + "Hostname": extract.Metadata.Hostname, + "Url": chain.Request.URL, + "Title": extract.Metadata.Title, // todo: modify CreateReadableDocument so we don't have

titles duplicated? + "Date": extract.Metadata.Date.String(), + "Author": extract.Metadata.Author, + "Body": distilledHTML, } // ============================================================================ @@ -82,3 +95,29 @@ func GenerateReadableOutline() proxychain.ResponseModification { return nil } }
+
+// removeFirstH1 detaches the first <h1> element below n, searching
+// depth-first in document order, and leaves any later <h1> in place.
+// A nil n is a no-op; if n itself is an <h1>, nothing is removed because n
+// has no parent within this subtree to detach it from.
+func removeFirstH1(n *html.Node) {
+	if n == nil || (n.Type == html.ElementNode && n.DataAtom == atom.H1) {
+		return
+	}
+	// recurse reports whether an <h1> was removed beneath node, so every
+	// enclosing call stops scanning instead of moving on to later siblings.
+	var recurse func(node *html.Node) bool
+	recurse = func(node *html.Node) bool {
+		for c := node.FirstChild; c != nil; c = c.NextSibling {
+			if c.Type == html.ElementNode && c.DataAtom == atom.H1 {
+				node.RemoveChild(c)
+				return true
+			}
+			if recurse(c) {
+				return true
+			}
+		}
+		return false
+	}
+	recurse(n)
+}
diff --git a/proxychain/responsemodifers/generate_readable_outline.html b/proxychain/responsemodifers/generate_readable_outline.html index ff4b034..c849c9c 100644 --- a/proxychain/responsemodifers/generate_readable_outline.html +++ b/proxychain/responsemodifers/generate_readable_outline.html @@ -4,6 +4,7 @@ +