diff --git a/README.md b/README.md index ca348f5..5e13b87 100644 --- a/README.md +++ b/README.md @@ -17,7 +17,7 @@ Freedom of information is an essential pillar of democracy and informed decision ### Features - [x] Bypass Paywalls - [x] Remove CORS headers from responses, assets, and images ... -- [x] Apply domain based ruleset/code to modify response +- [x] Apply domain based ruleset/code to modify response / requested URL - [x] Keep site browsable - [x] API - [x] Fetch RAW HTML @@ -115,7 +115,7 @@ http://localhost:8080/ruleset ### Ruleset -It is possible to apply custom rules to modify the response. This can be used to remove unwanted or modify elements from the page. The ruleset is a YAML file that contains a list of rules for each domain and is loaded on startup +It is possible to apply custom rules to modify the response or the requested URL. This can be used to remove unwanted elements from the page or to modify existing ones. The ruleset is a YAML file that contains a list of rules for each domain and is loaded on startup. See in [ruleset.yaml](ruleset.yaml) for an example. @@ -155,4 +155,18 @@ See in [ruleset.yaml](ruleset.yaml) for an example. - position: .left-content article # Position where to inject the code into DOM prepend: |

Suptitle

+- domain: tagesspiegel.de + headers: + content-security-policy: script-src 'self'; + user-agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 + urlMods: # Modify the URL + query: + - key: amp # (this will append ?amp=1 to the URL) + value: 1 + domain: + - match: www # regex to match part of domain + replace: amp # (this would modify the domain from www.tagesspiegel.de to amp.tagesspiegel.de) + path: + - match: ^ # regex to match part of path + replace: /amp/ # (modify the url from https://www.tagesspiegel.de/internationales/ to https://www.tagesspiegel.de/amp/internationales/) ``` diff --git a/handlers/proxy.go b/handlers/proxy.go index 2f89037..e9c3859 100644 --- a/handlers/proxy.go +++ b/handlers/proxy.go @@ -40,6 +40,41 @@ func ProxySite(c *fiber.Ctx) error { return c.SendString(body) } +func modifyURL(uri string, rule Rule) (string, error) { + newUrl, err := url.Parse(uri) + if err != nil { + return "", err + } + + for _, urlMod := range rule.UrlMods.Domain { + re := regexp.MustCompile(urlMod.Match) + newUrl.Host = re.ReplaceAllString(newUrl.Host, urlMod.Replace) + } + + for _, urlMod := range rule.UrlMods.Path { + re := regexp.MustCompile(urlMod.Match) + newUrl.Path = re.ReplaceAllString(newUrl.Path, urlMod.Replace) + } + + v := newUrl.Query() + for _, query := range rule.UrlMods.Query { + if query.Value == "" { + v.Del(query.Key) + continue + } + v.Set(query.Key, query.Value) + } + newUrl.RawQuery = v.Encode() + + if rule.GoogleCache { + newUrl, err = url.Parse("https://webcache.googleusercontent.com/search?q=cache:" + newUrl.String()) + if err != nil { + return "", err + } + } + return newUrl.String(), nil +} + func fetchSite(urlpath string, queries map[string]string) (string, *http.Request, *http.Response, error) { urlQuery := "?" 
if len(queries) > 0 { @@ -63,18 +98,17 @@ func fetchSite(urlpath string, queries map[string]string) (string, *http.Request log.Println(u.String() + urlQuery) } + // Modify the URI according to ruleset rule := fetchRule(u.Host, u.Path) - - if rule.GoogleCache { - u, err = url.Parse("https://webcache.googleusercontent.com/search?q=cache:" + u.String()) - if err != nil { - return "", nil, nil, err - } + url, err := modifyURL(u.String()+urlQuery, rule) + if err != nil { + return "", nil, nil, err } + log.Println("fetch URI: %", url) // Fetch the site client := &http.Client{} - req, _ := http.NewRequest("GET", u.String()+urlQuery, nil) + req, _ := http.NewRequest("GET", url, nil) if rule.Headers.UserAgent != "" { req.Header.Set("User-Agent", rule.Headers.UserAgent) diff --git a/handlers/types.go b/handlers/types.go index 16e4c49..6c02086 100644 --- a/handlers/types.go +++ b/handlers/types.go @@ -4,6 +4,10 @@ type Regex struct { Match string `yaml:"match"` Replace string `yaml:"replace"` } +type KV struct { + Key string `yaml:"key"` + Value string `yaml:"value"` +} type RuleSet []Rule @@ -20,7 +24,14 @@ type Rule struct { } `yaml:"headers,omitempty"` GoogleCache bool `yaml:"googleCache,omitempty"` RegexRules []Regex `yaml:"regexRules"` - Injections []struct { + + UrlMods struct { + Domain []Regex `yaml:"domain"` + Path []Regex `yaml:"path"` + Query []KV `yaml:"query"` + } `yaml:"urlMods"` + + Injections []struct { Position string `yaml:"position"` Append string `yaml:"append"` Prepend string `yaml:"prepend"` diff --git a/ruleset.yaml b/ruleset.yaml index d1c1e9f..834b606 100644 --- a/ruleset.yaml +++ b/ruleset.yaml @@ -162,4 +162,12 @@ x-forwarded-for: none user-agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 content-security-policy: script-src 'self'; - cookie: \ No newline at end of file + cookie: +- domain: tagesspiegel.de + headers: + content-security-policy: script-src 'self'; + user-agent: 
Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 + urlMods: + query: + - key: amp + value: 1