From 082868af2db76430e2b625dc7f5db41fd52fa3d2 Mon Sep 17 00:00:00 2001 From: Kevin Pham Date: Sun, 12 Nov 2023 10:30:06 -0600 Subject: [PATCH 1/4] Add feature to modify URLs in ruleset --- README.md | 18 ++++++++++++++++-- handlers/proxy.go | 48 ++++++++++++++++++++++++++++++++++++++++------- handlers/types.go | 13 ++++++++++++- ruleset.yaml | 10 +++++++++- 4 files changed, 78 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index ca348f5..5e13b87 100644 --- a/README.md +++ b/README.md @@ -17,7 +17,7 @@ Freedom of information is an essential pillar of democracy and informed decision ### Features - [x] Bypass Paywalls - [x] Remove CORS headers from responses, assets, and images ... -- [x] Apply domain based ruleset/code to modify response +- [x] Apply domain based ruleset/code to modify response / requested URL - [x] Keep site browsable - [x] API - [x] Fetch RAW HTML @@ -115,7 +115,7 @@ http://localhost:8080/ruleset ### Ruleset -It is possible to apply custom rules to modify the response. This can be used to remove unwanted or modify elements from the page. The ruleset is a YAML file that contains a list of rules for each domain and is loaded on startup +It is possible to apply custom rules to modify the response or the requested URL. This can be used to remove unwanted or modify elements from the page. The ruleset is a YAML file that contains a list of rules for each domain and is loaded on startup See in [ruleset.yaml](ruleset.yaml) for an example. @@ -155,4 +155,18 @@ See in [ruleset.yaml](ruleset.yaml) for an example. - position: .left-content article # Position where to inject the code into DOM prepend: |

Suptitle

+- domain: tagesspiegel.de + headers: + content-security-policy: script-src 'self'; + user-agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 + urlMods: # Modify the URL + query: + - key: amp # (this will append ?amp=1 to the URL) + value: 1 + domain: + - match: www # regex to match part of domain + replace: amp # (this would modify the domain from www.tagesspiegel.de to amp.tagesspiegel.de) + path: + - match: ^ # regex to match part of path + replace: /amp/ # (modify the url from https://www.tagesspiegel.de/internationales/ to https://www.tagesspiegel.de/amp/internationales/) ``` diff --git a/handlers/proxy.go b/handlers/proxy.go index 2f89037..e9c3859 100644 --- a/handlers/proxy.go +++ b/handlers/proxy.go @@ -40,6 +40,41 @@ func ProxySite(c *fiber.Ctx) error { return c.SendString(body) } +func modifyURL(uri string, rule Rule) (string, error) { + newUrl, err := url.Parse(uri) + if err != nil { + return "", err + } + + for _, urlMod := range rule.UrlMods.Domain { + re := regexp.MustCompile(urlMod.Match) + newUrl.Host = re.ReplaceAllString(newUrl.Host, urlMod.Replace) + } + + for _, urlMod := range rule.UrlMods.Path { + re := regexp.MustCompile(urlMod.Match) + newUrl.Path = re.ReplaceAllString(newUrl.Path, urlMod.Replace) + } + + v := newUrl.Query() + for _, query := range rule.UrlMods.Query { + if query.Value == "" { + v.Del(query.Key) + continue + } + v.Set(query.Key, query.Value) + } + newUrl.RawQuery = v.Encode() + + if rule.GoogleCache { + newUrl, err = url.Parse("https://webcache.googleusercontent.com/search?q=cache:" + newUrl.String()) + if err != nil { + return "", err + } + } + return newUrl.String(), nil +} + func fetchSite(urlpath string, queries map[string]string) (string, *http.Request, *http.Response, error) { urlQuery := "?" 
if len(queries) > 0 { @@ -63,18 +98,17 @@ func fetchSite(urlpath string, queries map[string]string) (string, *http.Request log.Println(u.String() + urlQuery) } + // Modify the URI according to ruleset rule := fetchRule(u.Host, u.Path) - - if rule.GoogleCache { - u, err = url.Parse("https://webcache.googleusercontent.com/search?q=cache:" + u.String()) - if err != nil { - return "", nil, nil, err - } + url, err := modifyURL(u.String()+urlQuery, rule) + if err != nil { + return "", nil, nil, err } + log.Println("fetch URI: %", url) // Fetch the site client := &http.Client{} - req, _ := http.NewRequest("GET", u.String()+urlQuery, nil) + req, _ := http.NewRequest("GET", url, nil) if rule.Headers.UserAgent != "" { req.Header.Set("User-Agent", rule.Headers.UserAgent) diff --git a/handlers/types.go b/handlers/types.go index 16e4c49..6c02086 100644 --- a/handlers/types.go +++ b/handlers/types.go @@ -4,6 +4,10 @@ type Regex struct { Match string `yaml:"match"` Replace string `yaml:"replace"` } +type KV struct { + Key string `yaml:"key"` + Value string `yaml:"value"` +} type RuleSet []Rule @@ -20,7 +24,14 @@ type Rule struct { } `yaml:"headers,omitempty"` GoogleCache bool `yaml:"googleCache,omitempty"` RegexRules []Regex `yaml:"regexRules"` - Injections []struct { + + UrlMods struct { + Domain []Regex `yaml:"domain"` + Path []Regex `yaml:"path"` + Query []KV `yaml:"query"` + } `yaml:"urlMods"` + + Injections []struct { Position string `yaml:"position"` Append string `yaml:"append"` Prepend string `yaml:"prepend"` diff --git a/ruleset.yaml b/ruleset.yaml index d1c1e9f..834b606 100644 --- a/ruleset.yaml +++ b/ruleset.yaml @@ -162,4 +162,12 @@ x-forwarded-for: none user-agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 content-security-policy: script-src 'self'; - cookie: \ No newline at end of file + cookie: +- domain: tagesspiegel.de + headers: + content-security-policy: script-src 'self'; + user-agent: 
Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 + urlMods: + query: + - key: amp + value: 1 From fbc9567820be0f24cd9f11200debe8bc7c3018f2 Mon Sep 17 00:00:00 2001 From: Kevin Pham Date: Sun, 12 Nov 2023 11:48:47 -0600 Subject: [PATCH 2/4] Handle relative URLs when using proxy --- handlers/proxy.go | 50 +++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 48 insertions(+), 2 deletions(-) diff --git a/handlers/proxy.go b/handlers/proxy.go index e9c3859..26505e8 100644 --- a/handlers/proxy.go +++ b/handlers/proxy.go @@ -22,9 +22,56 @@ var ( allowedDomains = strings.Split(os.Getenv("ALLOWED_DOMAINS"), ",") ) +// extracts a URL from the request ctx. If the URL in the request +// is a relative path, it reconstructs the full URL using the referer header. +func extractUrl(c *fiber.Ctx) (string, error) { + reqUrl := c.Params("*") + + // Extract the actual path from req ctx + urlQuery, err := url.Parse(reqUrl) + if err != nil { + return "", fmt.Errorf("error parsing request URL '%s': %v", reqUrl, err) + } + + isRelativePath := urlQuery.Scheme == "" + + // eg: https://localhost:8080/images/foobar.jpg -> https://realsite.com/images/foobar.jpg + if isRelativePath { + // Parse the referer URL from the request header. 
+ refererUrl, err := url.Parse(c.Get("referer")) + if err != nil { + return "", fmt.Errorf("error parsing referer URL from req: '%s': %v", reqUrl, err) + } + + // Extract the real url from referer path + realUrl, err := url.Parse(strings.TrimPrefix(refererUrl.Path, "/")) + if err != nil { + return "", fmt.Errorf("error parsing real URL from referer '%s': %v", refererUrl.Path, err) + } + + // reconstruct the full URL using the referer's scheme, host, and the relative path / queries + fullUrl := &url.URL{ + Scheme: realUrl.Scheme, + Host: realUrl.Host, + Path: urlQuery.Path, + RawQuery: urlQuery.RawQuery, + } + log.Println(fullUrl.String()) + return fullUrl.String(), nil + } + + // default behavior: + // eg: https://localhost:8080/https://realsite.com/images/foobar.jpg -> https://realsite.com/images/foobar.jpg + return urlQuery.String(), nil + +} + func ProxySite(c *fiber.Ctx) error { // Get the url from the URL - url := c.Params("*") + url, err := extractUrl(c) + if err != nil { + log.Println("ERROR In URL extraction:", err) + } queries := c.Queries() body, _, resp, err := fetchSite(url, queries) @@ -104,7 +151,6 @@ func fetchSite(urlpath string, queries map[string]string) (string, *http.Request if err != nil { return "", nil, nil, err } - log.Println("fetch URI: %", url) // Fetch the site client := &http.Client{} From c8b94dc702249e0cc33fd7b22a56fcefb9aa8b28 Mon Sep 17 00:00:00 2001 From: Kevin Pham Date: Sun, 12 Nov 2023 11:52:43 -0600 Subject: [PATCH 3/4] remove debug logging messages --- handlers/proxy.go | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/handlers/proxy.go b/handlers/proxy.go index 26505e8..d58a635 100644 --- a/handlers/proxy.go +++ b/handlers/proxy.go @@ -56,8 +56,12 @@ func extractUrl(c *fiber.Ctx) (string, error) { Path: urlQuery.Path, RawQuery: urlQuery.RawQuery, } - log.Println(fullUrl.String()) + + if os.Getenv("LOG_URLS") == "true" { + log.Printf("modified relative URL: '%s' -> '%s'", reqUrl, fullUrl.String()) + } 
return fullUrl.String(), nil + } // default behavior: @@ -119,6 +123,7 @@ func modifyURL(uri string, rule Rule) (string, error) { return "", err } } + return newUrl.String(), nil } From 30a6ab501d2e76096b0aada6bc53ad3e9856d891 Mon Sep 17 00:00:00 2001 From: Kevin Pham Date: Sun, 12 Nov 2023 17:02:32 -0600 Subject: [PATCH 4/4] handle URL encoded URLs in proxy for other app integrations --- handlers/proxy.go | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/handlers/proxy.go b/handlers/proxy.go index d58a635..886431a 100644 --- a/handlers/proxy.go +++ b/handlers/proxy.go @@ -25,7 +25,12 @@ var ( // extracts a URL from the request ctx. If the URL in the request // is a relative path, it reconstructs the full URL using the referer header. func extractUrl(c *fiber.Ctx) (string, error) { - reqUrl := c.Params("*") + // try to extract url-encoded + reqUrl, err := url.QueryUnescape(c.Params("*")) + if err != nil { + // fallback + reqUrl = c.Params("*") + } // Extract the actual path from req ctx urlQuery, err := url.Parse(reqUrl) @@ -199,6 +204,7 @@ func fetchSite(urlpath string, queries map[string]string) (string, *http.Request } if rule.Headers.CSP != "" { + log.Println(rule.Headers.CSP) resp.Header.Set("Content-Security-Policy", rule.Headers.CSP) }