improve /api/content metadata and tag handling

This commit is contained in:
Kevin Pham
2023-11-30 22:47:15 -06:00
parent ef50c81aa6
commit 140a38fd39
3 changed files with 85 additions and 2 deletions

View File

@@ -183,7 +183,7 @@ func main() {
RulesetPath: *ruleset, RulesetPath: *ruleset,
} }
app.Get("api/content/*", handlers.NewAPIOutlineHandler("api/outline/*", proxyOpts)) app.Get("api/content/*", handlers.NewAPIContentHandler("api/outline/*", proxyOpts))
app.Get("outline/*", handlers.NewOutlineHandler("outline/*", proxyOpts)) app.Get("outline/*", handlers.NewOutlineHandler("outline/*", proxyOpts))

44
handlers/api_content.go Normal file
View File

@@ -0,0 +1,44 @@
package handlers
import (
"ladder/proxychain"
rx "ladder/proxychain/requestmodifers"
tx "ladder/proxychain/responsemodifers"
"github.com/gofiber/fiber/v2"
)
// NewAPIContentHandler returns a fiber handler that builds a proxy chain for
// each request, executes it, and returns the chain's result.
//
// path is handed to the chain through WithAPIPath. opts.Verbose controls the
// chain's debug logging. opts.RulesetPath is not consulted yet; see the TODO
// below.
func NewAPIContentHandler(path string, opts *ProxyOptions) fiber.Handler {
	// TODO: implement ruleset logic
	/*
		var rs ruleset.RuleSet
		if opts.RulesetPath != "" {
			r, err := ruleset.NewRuleset(opts.RulesetPath)
			if err != nil {
				panic(err)
			}
			rs = r
		}
	*/

	return func(c *fiber.Ctx) error {
		// The chain's result is returned directly so that no local variable
		// shadows the imported proxychain package.
		return proxychain.
			NewProxyChain().
			WithAPIPath(path).
			SetDebugLogging(opts.Verbose).
			SetRequestModifications(
				rx.MasqueradeAsGoogleBot(),
				rx.ForwardRequestHeaders(),
				rx.SpoofReferrerFromGoogleSearch(),
			).
			AddResponseModifications(
				tx.DeleteIncomingCookies(),
				tx.RewriteHTMLResourceURLs(),
				// NOTE(review): presumably converts the response body into the
				// API content document — confirm against tx.APIContent.
				tx.APIContent(),
			).
			SetFiberCtx(c).
			Execute()
	}
}

View File

@@ -27,6 +27,15 @@ type TextContent struct {
Data string `json:"data"` Data string `json:"data"`
} }
// Content nodes used to represent HTML lists in the API response.
type (
	// ListContent is a list content node. parseContent sets Type to the list
	// element's tag name ("ul" or "ol") and fills ListItems from its "li"
	// children.
	ListContent struct {
		Type      string            `json:"type"`
		ListItems []ListItemContent `json:"listItems"`
	}

	// ListItemContent is a single entry of a ListContent; Data carries the
	// inner text of the list item.
	ListItemContent struct {
		Data string `json:"data"`
	}
)
type JSONDocument struct { type JSONDocument struct {
Success bool `json:"success"` Success bool `json:"success"`
Error ErrorDetails `json:"error"` Error ErrorDetails `json:"error"`
@@ -35,6 +44,7 @@ type JSONDocument struct {
Author string `json:"author"` Author string `json:"author"`
URL string `json:"url"` URL string `json:"url"`
Hostname string `json:"hostname"` Hostname string `json:"hostname"`
Image string `json:"image"`
Description string `json:"description"` Description string `json:"description"`
Sitename string `json:"sitename"` Sitename string `json:"sitename"`
Date string `json:"date"` Date string `json:"date"`
@@ -58,11 +68,13 @@ func ExtractResultToAPIResponse(extract *trafilatura.ExtractResult) *JSONDocumen
jsonDoc.Metadata.URL = extract.Metadata.URL jsonDoc.Metadata.URL = extract.Metadata.URL
jsonDoc.Metadata.Hostname = extract.Metadata.Hostname jsonDoc.Metadata.Hostname = extract.Metadata.Hostname
jsonDoc.Metadata.Description = extract.Metadata.Description jsonDoc.Metadata.Description = extract.Metadata.Description
jsonDoc.Metadata.Image = extract.Metadata.Image
jsonDoc.Metadata.Sitename = extract.Metadata.Sitename jsonDoc.Metadata.Sitename = extract.Metadata.Sitename
jsonDoc.Metadata.Date = extract.Metadata.Date.Format("2006-01-02") jsonDoc.Metadata.Date = extract.Metadata.Date.Format("2006-01-02")
jsonDoc.Metadata.Categories = extract.Metadata.Categories jsonDoc.Metadata.Categories = extract.Metadata.Categories
jsonDoc.Metadata.Tags = extract.Metadata.Tags jsonDoc.Metadata.Tags = extract.Metadata.Tags
jsonDoc.Metadata.License = extract.Metadata.License jsonDoc.Metadata.License = extract.Metadata.License
jsonDoc.Metadata.Hostname = extract.Metadata.Hostname
// Populate content // Populate content
if extract.ContentNode != nil { if extract.ContentNode != nil {
@@ -120,7 +132,34 @@ func parseContent(node *html.Node) []interface{} {
} }
content = append(content, text) content = append(content, text)
// continue with other tags case "h4":
text := TextContent{
Type: "h4",
Data: dom.InnerText(child),
}
content = append(content, text)
case "h5":
text := TextContent{
Type: "h5",
Data: dom.InnerText(child),
}
content = append(content, text)
case "ul", "ol":
list := ListContent{
Type: child.Data,
ListItems: []ListItemContent{},
}
for listItem := child.FirstChild; listItem != nil; listItem = listItem.NextSibling {
if listItem.Data == "li" {
listItemContent := ListItemContent{
Data: dom.InnerText(listItem),
}
list.ListItems = append(list.ListItems, listItemContent)
}
}
content = append(content, list)
default: default:
text := TextContent{ text := TextContent{