From 140a38fd39213c76d0d803de3635a3732ad94a67 Mon Sep 17 00:00:00 2001 From: Kevin Pham Date: Thu, 30 Nov 2023 22:47:15 -0600 Subject: [PATCH] improve /api/content metadata and tag handling --- cmd/main.go | 2 +- handlers/api_content.go | 44 +++++++++++++++++++ .../responsemodifers/api/outline_api.go | 41 ++++++++++++++++- 3 files changed, 85 insertions(+), 2 deletions(-) create mode 100644 handlers/api_content.go diff --git a/cmd/main.go b/cmd/main.go index a7840f0..e6aaf1f 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -183,7 +183,7 @@ func main() { RulesetPath: *ruleset, } - app.Get("api/content/*", handlers.NewAPIOutlineHandler("api/outline/*", proxyOpts)) + app.Get("api/content/*", handlers.NewAPIContentHandler("api/outline/*", proxyOpts)) app.Get("outline/*", handlers.NewOutlineHandler("outline/*", proxyOpts)) diff --git a/handlers/api_content.go b/handlers/api_content.go new file mode 100644 index 0000000..71608e5 --- /dev/null +++ b/handlers/api_content.go @@ -0,0 +1,44 @@ +package handlers + +import ( + "ladder/proxychain" + rx "ladder/proxychain/requestmodifers" + tx "ladder/proxychain/responsemodifers" + + "github.com/gofiber/fiber/v2" +) + +func NewAPIContentHandler(path string, opts *ProxyOptions) fiber.Handler { + // TODO: implement ruleset logic + /* + var rs ruleset.RuleSet + if opts.RulesetPath != "" { + r, err := ruleset.NewRuleset(opts.RulesetPath) + if err != nil { + panic(err) + } + rs = r + } + */ + + return func(c *fiber.Ctx) error { + proxychain := proxychain. + NewProxyChain(). + WithAPIPath(path). + SetDebugLogging(opts.Verbose). + SetRequestModifications( + rx.MasqueradeAsGoogleBot(), + rx.ForwardRequestHeaders(), + rx.SpoofReferrerFromGoogleSearch(), + ). + AddResponseModifications( + tx.DeleteIncomingCookies(), + tx.RewriteHTMLResourceURLs(), + tx.APIContent(), + ). + SetFiberCtx(c). + Execute() + + return proxychain + } +} diff --git a/proxychain/responsemodifers/api/outline_api.go b/proxychain/responsemodifers/api/outline_api.go index d96b05f..f17f42e 100644 --- a/proxychain/responsemodifers/api/outline_api.go +++ b/proxychain/responsemodifers/api/outline_api.go @@ -27,6 +27,15 @@ type TextContent struct { Data string `json:"data"` } +type ListContent struct { + Type string `json:"type"` + ListItems []ListItemContent `json:"listItems"` +} + +type ListItemContent struct { + Data string `json:"data"` +} + type JSONDocument struct { Success bool `json:"success"` Error ErrorDetails `json:"error"` @@ -35,6 +44,7 @@ type JSONDocument struct { Author string `json:"author"` URL string `json:"url"` Hostname string `json:"hostname"` + Image string `json:"image"` Description string `json:"description"` Sitename string `json:"sitename"` Date string `json:"date"` @@ -58,11 +68,13 @@ func ExtractResultToAPIResponse(extract *trafilatura.ExtractResult) *JSONDocumen jsonDoc.Metadata.URL = extract.Metadata.URL jsonDoc.Metadata.Hostname = extract.Metadata.Hostname jsonDoc.Metadata.Description = extract.Metadata.Description + jsonDoc.Metadata.Image = extract.Metadata.Image jsonDoc.Metadata.Sitename = extract.Metadata.Sitename jsonDoc.Metadata.Date = extract.Metadata.Date.Format("2006-01-02") jsonDoc.Metadata.Categories = extract.Metadata.Categories jsonDoc.Metadata.Tags = extract.Metadata.Tags jsonDoc.Metadata.License = extract.Metadata.License + jsonDoc.Metadata.Hostname = extract.Metadata.Hostname // Populate content if extract.ContentNode != nil { @@ -120,7 +132,34 @@ func parseContent(node *html.Node) []interface{} { } content = append(content, text) - // continue with other tags + case "h4": + text := TextContent{ + Type: "h4", + Data: dom.InnerText(child), + } + content = append(content, text) + + case "h5": + text := TextContent{ + Type: "h5", + Data: dom.InnerText(child), + } + content = append(content, text) + + case "ul", "ol": + list := ListContent{ + Type: child.Data, + ListItems: []ListItemContent{}, + } + for listItem := child.FirstChild; listItem != nil; listItem = listItem.NextSibling { + if listItem.Data == "li" { + listItemContent := ListItemContent{ + Data: dom.InnerText(listItem), + } + list.ListItems = append(list.ListItems, listItemContent) + } + } + content = append(content, list) default: text := TextContent{