structured article json first draft

This commit is contained in:
Kevin Pham
2023-11-29 00:33:46 -06:00
parent b7a012d75b
commit e5bd5df2c0
3 changed files with 153 additions and 10 deletions

View File

@@ -89,11 +89,13 @@ func masqueradeAsTrustedBot(botUA string, botIP string, ja3 string) proxychain.R
DeleteRequestHeader("origin"),
)
/*
if ja3 != "" {
chain.AddOnceRequestModifications(
SpoofJA3fingerprint(ja3, botUA),
)
}
*/
return nil
}

View File

@@ -8,7 +8,7 @@ import (
// from enforcing any CORS restrictions. This should run at the end of the chain.
func BypassCORS() proxychain.ResponseModification {
return func(chain *proxychain.ProxyChain) error {
chain.AddResponseModifications(
chain.AddOnceResponseModifications(
SetResponseHeader("Access-Control-Allow-Origin", "*"),
SetResponseHeader("Access-Control-Expose-Headers", "*"),
SetResponseHeader("Access-Control-Allow-Credentials", "true"),

View File

@@ -1,11 +1,14 @@
package responsemodifers
import (
"bytes"
"encoding/json"
"github.com/go-shiori/dom"
"github.com/markusmobius/go-trafilatura"
"golang.org/x/net/html"
"io"
"ladder/proxychain"
"strings"
//"strings"
)
// Outline creates an JSON representation of the article
@@ -26,10 +29,148 @@ func Outline() proxychain.ResponseModification {
return err
}
doc := trafilatura.CreateReadableDocument(result)
reader := io.NopCloser(strings.NewReader(dom.OuterHTML(doc)))
chain.Response.Body = reader
doc := createJSONDocument(result)
jsonData, err := json.MarshalIndent(doc, "", " ")
if err != nil {
return err
}
buf := bytes.NewBuffer(jsonData)
//doc := trafilatura.CreateReadableDocument(result)
//reader := io.NopCloser(strings.NewReader(dom.OuterHTML(doc)))
chain.Response.Body = io.NopCloser(buf)
return nil
}
}
// =======================================================================================
// credit @joncrangle https://github.com/everywall/ladder/issues/38#issuecomment-1831252934
type ImageContent struct {
Type string `json:"type"`
URL string `json:"url"`
Alt string `json:"alt"`
Caption string `json:"caption"`
}
type LinkContent struct {
Type string `json:"type"`
Href string `json:"href"`
Data string `json:"data"`
}
type TextContent struct {
Type string `json:"type"`
Data string `json:"data"`
}
type JSONDocument struct {
Success bool `json:"success"`
Error struct {
Message string `json:"message"`
Type string `json:"type"`
Cause string `json:"cause"`
} `json:"error"`
Metadata struct {
Title string `json:"title"`
Author string `json:"author"`
URL string `json:"url"`
Hostname string `json:"hostname"`
Description string `json:"description"`
Sitename string `json:"sitename"`
Date string `json:"date"`
Categories []string `json:"categories"`
Tags []string `json:"tags"`
License string `json:"license"`
} `json:"metadata"`
Content []interface{} `json:"content"`
Comments string `json:"comments"`
}
func createJSONDocument(extract *trafilatura.ExtractResult) *JSONDocument {
jsonDoc := &JSONDocument{}
// Populate success
jsonDoc.Success = true
// Populate metadata
jsonDoc.Metadata.Title = extract.Metadata.Title
jsonDoc.Metadata.Author = extract.Metadata.Author
jsonDoc.Metadata.URL = extract.Metadata.URL
jsonDoc.Metadata.Hostname = extract.Metadata.Hostname
jsonDoc.Metadata.Description = extract.Metadata.Description
jsonDoc.Metadata.Sitename = extract.Metadata.Sitename
jsonDoc.Metadata.Date = extract.Metadata.Date.Format("2006-01-02")
jsonDoc.Metadata.Categories = extract.Metadata.Categories
jsonDoc.Metadata.Tags = extract.Metadata.Tags
jsonDoc.Metadata.License = extract.Metadata.License
// Populate content
if extract.ContentNode != nil {
jsonDoc.Content = parseContent(extract.ContentNode)
}
// Populate comments
if extract.CommentsNode != nil {
jsonDoc.Comments = dom.OuterHTML(extract.CommentsNode)
}
return jsonDoc
}
func parseContent(node *html.Node) []interface{} {
var content []interface{}
for child := node.FirstChild; child != nil; child = child.NextSibling {
switch child.Data {
case "img":
image := ImageContent{
Type: "img",
URL: dom.GetAttribute(child, "src"),
Alt: dom.GetAttribute(child, "alt"),
Caption: dom.GetAttribute(child, "caption"),
}
content = append(content, image)
case "a":
link := LinkContent{
Type: "a",
Href: dom.GetAttribute(child, "href"),
Data: dom.InnerText(child),
}
content = append(content, link)
case "h1":
text := TextContent{
Type: "h1",
Data: dom.InnerText(child),
}
content = append(content, text)
case "h2":
text := TextContent{
Type: "h2",
Data: dom.InnerText(child),
}
content = append(content, text)
case "h3":
text := TextContent{
Type: "h3",
Data: dom.InnerText(child),
}
content = append(content, text)
// continue with other tags
default:
text := TextContent{
Type: "p",
Data: dom.InnerText(child),
}
content = append(content, text)
}
}
return content
}