diff --git a/README.md b/README.md index 48c8783..7d5e14a 100644 --- a/README.md +++ b/README.md @@ -17,7 +17,7 @@ Freedom of information is an essential pillar of democracy and informed decision ### Features - [x] Bypass Paywalls - [x] Remove CORS headers from responses, assets, and images ... -- [x] Apply domain based ruleset/code to modify response +- [x] Apply domain based ruleset/code to modify the response or the requested URL - [x] Keep site browsable - [x] API - [x] Fetch RAW HTML @@ -115,7 +115,7 @@ http://localhost:8080/ruleset ### Ruleset -It is possible to apply custom rules to modify the response. This can be used to remove unwanted or modify elements from the page. The ruleset is a YAML file that contains a list of rules for each domain and is loaded on startup +It is possible to apply custom rules to modify the response or the requested URL. This can be used to remove or modify unwanted elements on the page. The ruleset is a YAML file that contains a list of rules for each domain and is loaded on startup. See in [ruleset.yaml](ruleset.yaml) for an example. @@ -155,4 +155,18 @@ See in [ruleset.yaml](ruleset.yaml) for an example. - position: .left-content article # Position where to inject the code into DOM prepend: |

Subtitle

+- domain: demo.com + headers: + content-security-policy: script-src 'self'; + user-agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 + urlMods: # Modify the URL + query: + - key: amp # (this will append ?amp=1 to the URL) + value: 1 + domain: + - match: www # regex to match part of domain + replace: amp # (this would modify the domain from www.demo.com to amp.demo.com) + path: + - match: ^ # regex to match part of path + replace: /amp/ # (modify the url from https://www.demo.com/article/ to https://www.demo.com/amp/article/) ``` diff --git a/cmd/main.go b/cmd/main.go index 256c52e..54e7a86 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -80,12 +80,8 @@ func main() { } app.Get("/", handlers.Form) - app.Get("ruleset", handlers.Ruleset) - app.Get("raw/*", handlers.Raw) app.Get("api/*", handlers.Api) - app.Get("ruleset", handlers.Raw) app.Get("/*", handlers.ProxySite(*ruleset)) - log.Fatal(app.Listen(":" + *port)) } diff --git a/handlers/proxy.go b/handlers/proxy.go index 18a6f41..10106f1 100644 --- a/handlers/proxy.go +++ b/handlers/proxy.go @@ -30,6 +30,59 @@ func init() { } } +// extracts a URL from the request ctx. If the URL in the request +// is a relative path, it reconstructs the full URL using the referer header. +func extractUrl(c *fiber.Ctx) (string, error) { + // try to extract url-encoded + reqUrl, err := url.QueryUnescape(c.Params("*")) + if err != nil { + // fallback + reqUrl = c.Params("*") + } + + // Extract the actual path from req ctx + urlQuery, err := url.Parse(reqUrl) + if err != nil { + return "", fmt.Errorf("error parsing request URL '%s': %v", reqUrl, err) + } + + isRelativePath := urlQuery.Scheme == "" + + // eg: https://localhost:8080/images/foobar.jpg -> https://realsite.com/images/foobar.jpg + if isRelativePath { + // Parse the referer URL from the request header.
+ refererUrl, err := url.Parse(c.Get("referer")) + if err != nil { + return "", fmt.Errorf("error parsing referer URL from req: '%s': %v", reqUrl, err) + } + + // Extract the real url from referer path + realUrl, err := url.Parse(strings.TrimPrefix(refererUrl.Path, "/")) + if err != nil { + return "", fmt.Errorf("error parsing real URL from referer '%s': %v", refererUrl.Path, err) + } + + // reconstruct the full URL using the referer's scheme, host, and the relative path / queries + fullUrl := &url.URL{ + Scheme: realUrl.Scheme, + Host: realUrl.Host, + Path: urlQuery.Path, + RawQuery: urlQuery.RawQuery, + } + + if os.Getenv("LOG_URLS") == "true" { + log.Printf("modified relative URL: '%s' -> '%s'", reqUrl, fullUrl.String()) + } + return fullUrl.String(), nil + + } + + // default behavior: + // eg: https://localhost:8080/https://realsite.com/images/foobar.jpg -> https://realsite.com/images/foobar.jpg + return urlQuery.String(), nil + +} + func ProxySite(rulesetPath string) fiber.Handler { if rulesetPath != "" { rs, err := ruleset.NewRuleset(rulesetPath) @@ -41,7 +94,10 @@ func ProxySite(rulesetPath string) fiber.Handler { return func(c *fiber.Ctx) error { // Get the url from the URL - url := c.Params("*") + url, err := extractUrl(c) + if err != nil { + log.Println("ERROR In URL extraction:", err) + } queries := c.Queries() body, _, resp, err := fetchSite(url, queries) @@ -58,6 +114,42 @@ func ProxySite(rulesetPath string) fiber.Handler { } } +func modifyURL(uri string, rule ruleset.Rule) (string, error) { + newUrl, err := url.Parse(uri) + if err != nil { + return "", err + } + + for _, urlMod := range rule.UrlMods.Domain { + re := regexp.MustCompile(urlMod.Match) + newUrl.Host = re.ReplaceAllString(newUrl.Host, urlMod.Replace) + } + + for _, urlMod := range rule.UrlMods.Path { + re := regexp.MustCompile(urlMod.Match) + newUrl.Path = re.ReplaceAllString(newUrl.Path, urlMod.Replace) + } + + v := newUrl.Query() + for _, query := range rule.UrlMods.Query { + if 
query.Value == "" { + v.Del(query.Key) + continue + } + v.Set(query.Key, query.Value) + } + newUrl.RawQuery = v.Encode() + + if rule.GoogleCache { + newUrl, err = url.Parse("https://webcache.googleusercontent.com/search?q=cache:" + newUrl.String()) + if err != nil { + return "", err + } + } + + return newUrl.String(), nil +} + func fetchSite(urlpath string, queries map[string]string) (string, *http.Request, *http.Response, error) { urlQuery := "?" if len(queries) > 0 { @@ -81,18 +173,16 @@ func fetchSite(urlpath string, queries map[string]string) (string, *http.Request log.Println(u.String() + urlQuery) } + // Modify the URI according to ruleset rule := fetchRule(u.Host, u.Path) - - if rule.GoogleCache { - u, err = url.Parse("https://webcache.googleusercontent.com/search?q=cache:" + u.String()) - if err != nil { - return "", nil, nil, err - } + url, err := modifyURL(u.String()+urlQuery, rule) + if err != nil { + return "", nil, nil, err } // Fetch the site client := &http.Client{} - req, _ := http.NewRequest("GET", u.String()+urlQuery, nil) + req, _ := http.NewRequest("GET", url, nil) if rule.Headers.UserAgent != "" { req.Header.Set("User-Agent", rule.Headers.UserAgent) @@ -132,6 +222,7 @@ func fetchSite(urlpath string, queries map[string]string) (string, *http.Request } if rule.Headers.CSP != "" { + //log.Println(rule.Headers.CSP) resp.Header.Set("Content-Security-Policy", rule.Headers.CSP) } diff --git a/handlers/types.go b/handlers/types.go deleted file mode 100644 index 16e4c49..0000000 --- a/handlers/types.go +++ /dev/null @@ -1,29 +0,0 @@ -package handlers - -type Regex struct { - Match string `yaml:"match"` - Replace string `yaml:"replace"` -} - -type RuleSet []Rule - -type Rule struct { - Domain string `yaml:"domain,omitempty"` - Domains []string `yaml:"domains,omitempty"` - Paths []string `yaml:"paths,omitempty"` - Headers struct { - UserAgent string `yaml:"user-agent,omitempty"` - XForwardedFor string `yaml:"x-forwarded-for,omitempty"` - Referer string 
`yaml:"referer,omitempty"` - Cookie string `yaml:"cookie,omitempty"` - CSP string `yaml:"content-security-policy,omitempty"` - } `yaml:"headers,omitempty"` - GoogleCache bool `yaml:"googleCache,omitempty"` - RegexRules []Regex `yaml:"regexRules"` - Injections []struct { - Position string `yaml:"position"` - Append string `yaml:"append"` - Prepend string `yaml:"prepend"` - Replace string `yaml:"replace"` - } `yaml:"injections"` -} diff --git a/pkg/ruleset/ruleset.go b/pkg/ruleset/ruleset.go index a4efd3d..9029a65 100644 --- a/pkg/ruleset/ruleset.go +++ b/pkg/ruleset/ruleset.go @@ -20,6 +20,10 @@ type Regex struct { Match string `yaml:"match"` Replace string `yaml:"replace"` } +type KV struct { + Key string `yaml:"key"` + Value string `yaml:"value"` +} type RuleSet []Rule @@ -36,7 +40,14 @@ type Rule struct { } `yaml:"headers,omitempty"` GoogleCache bool `yaml:"googleCache,omitempty"` RegexRules []Regex `yaml:"regexRules"` - Injections []struct { + + UrlMods struct { + Domain []Regex `yaml:"domain"` + Path []Regex `yaml:"path"` + Query []KV `yaml:"query"` + } `yaml:"urlMods"` + + Injections []struct { Position string `yaml:"position"` Append string `yaml:"append"` Prepend string `yaml:"prepend"` diff --git a/ruleset.yaml b/ruleset.yaml index 2953736..572b980 100644 --- a/ruleset.yaml +++ b/ruleset.yaml @@ -163,6 +163,14 @@ user-agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 content-security-policy: script-src 'self'; cookie: +- domain: tagesspiegel.de + headers: + content-security-policy: script-src 'self'; + user-agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 + urlMods: + query: + - key: amp + value: 1 - domain: www.ft.com headers: referer: https://t.co/x?amp=1 @@ -182,4 +190,5 @@ cookie.forEach(el => { el.remove(); }); }, 1000); }) - \ No newline at end of file + +