tweak outline template rendering
This commit is contained in:
@@ -1,14 +1,17 @@
|
||||
package responsemodifers
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"embed"
|
||||
"fmt"
|
||||
"golang.org/x/net/html"
|
||||
"golang.org/x/net/html/atom"
|
||||
"html/template"
|
||||
"io"
|
||||
"ladder/proxychain"
|
||||
"log"
|
||||
|
||||
"github.com/go-shiori/dom"
|
||||
//"github.com/go-shiori/dom"
|
||||
"github.com/markusmobius/go-trafilatura"
|
||||
)
|
||||
|
||||
@@ -32,32 +35,42 @@ func GenerateReadableOutline() proxychain.ResponseModification {
|
||||
// 1. extract dom contents using reading mode algo
|
||||
// ===========================================================
|
||||
opts := trafilatura.Options{
|
||||
IncludeImages: true,
|
||||
IncludeLinks: true,
|
||||
//FavorPrecision: true,
|
||||
IncludeImages: true,
|
||||
IncludeLinks: true,
|
||||
FavorRecall: true,
|
||||
Deduplicate: true,
|
||||
FallbackCandidates: nil, // TODO: https://github.com/markusmobius/go-trafilatura/blob/main/examples/chained/main.go
|
||||
// implement fallbacks from "github.com/markusmobius/go-domdistiller" and "github.com/go-shiori/go-readability"
|
||||
OriginalURL: chain.Request.URL,
|
||||
}
|
||||
|
||||
result, err := trafilatura.Extract(chain.Response.Body, opts)
|
||||
extract, err := trafilatura.Extract(chain.Response.Body, opts)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
doc := trafilatura.CreateReadableDocument(result)
|
||||
distilledHTML := dom.OuterHTML(doc)
|
||||
|
||||
// ============================================================================
|
||||
// 2. render generate_readable_outline.html template using metadata from step 1
|
||||
// ============================================================================
|
||||
|
||||
// render DOM to string without H1 title
|
||||
removeFirstH1(extract.ContentNode)
|
||||
var b bytes.Buffer
|
||||
html.Render(&b, extract.ContentNode)
|
||||
distilledHTML := b.String()
|
||||
|
||||
// populate template parameters
|
||||
data := map[string]interface{}{
|
||||
"Success": true,
|
||||
"Params": chain.Request.URL,
|
||||
//"Title": result.Metadata.Title, // todo: modify CreateReadableDocument so we don't have <h1> titles duplicated?
|
||||
"Date": result.Metadata.Date.String(),
|
||||
"Author": result.Metadata.Author,
|
||||
"Body": distilledHTML,
|
||||
"Success": true,
|
||||
"Footer": extract.Metadata.License,
|
||||
"Image": extract.Metadata.Image,
|
||||
"Description": extract.Metadata.Description,
|
||||
"Hostname": extract.Metadata.Hostname,
|
||||
"Url": chain.Request.URL,
|
||||
"Title": extract.Metadata.Title, // todo: modify CreateReadableDocument so we don't have <h1> titles duplicated?
|
||||
"Date": extract.Metadata.Date.String(),
|
||||
"Author": extract.Metadata.Author,
|
||||
"Body": distilledHTML,
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
@@ -82,3 +95,20 @@ func GenerateReadableOutline() proxychain.ResponseModification {
|
||||
return nil
|
||||
}
|
||||
}
|
||||
|
||||
func removeFirstH1(n *html.Node) {
|
||||
var recurse func(*html.Node) bool
|
||||
recurse = func(n *html.Node) bool {
|
||||
if n.Type == html.ElementNode && n.DataAtom == atom.H1 {
|
||||
return true // Found the first H1, return true to stop
|
||||
}
|
||||
for c := n.FirstChild; c != nil; c = c.NextSibling {
|
||||
if recurse(c) {
|
||||
n.RemoveChild(c)
|
||||
return false // Removed first H1, no need to continue
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
recurse(n)
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user