add feature to load ruleset from directory or gzip file on http server, refactor ruleset loading logic

Kevin Pham
2023-11-14 15:57:39 -06:00
6 changed files with 137 additions and 45 deletions

View File

@@ -17,7 +17,7 @@ Freedom of information is an essential pillar of democracy and informed decision
 ### Features
 - [x] Bypass Paywalls
 - [x] Remove CORS headers from responses, assets, and images ...
-- [x] Apply domain based ruleset/code to modify response
+- [x] Apply domain based ruleset/code to modify response / requested URL
 - [x] Keep site browsable
 - [x] API
 - [x] Fetch RAW HTML
@@ -115,7 +115,7 @@ http://localhost:8080/ruleset
 ### Ruleset
-It is possible to apply custom rules to modify the response. This can be used to remove unwanted or modify elements from the page. The ruleset is a YAML file that contains a list of rules for each domain and is loaded on startup
+It is possible to apply custom rules to modify the response or the requested URL. This can be used to remove unwanted elements from the page or modify them. The ruleset is a YAML file that contains a list of rules for each domain and is loaded on startup.

 See in [ruleset.yaml](ruleset.yaml) for an example.
@@ -155,4 +155,18 @@ See in [ruleset.yaml](ruleset.yaml) for an example.
     - position: .left-content article # Position where to inject the code into DOM
       prepend: |
         <h2>Subtitle</h2>
+- domain: demo.com
+  headers:
+    content-security-policy: script-src 'self';
+    user-agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36
+  urlMods: # Modify the URL
+    query:
+      - key: amp     # (this will append ?amp=1 to the URL)
+        value: 1
+    domain:
+      - match: www   # regex to match part of the domain
+        replace: amp # (this would modify the domain from www.demo.com to amp.demo.com)
+    path:
+      - match: ^/    # regex to match part of the path
+        replace: /amp/ # (this would modify the URL from https://www.demo.com/article/ to https://amp.demo.com/amp/article/)
 ```
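
For illustration only (not part of this commit's diff), here is a minimal standalone Go sketch of what the `urlMods` rule above does to a sample URL, using the same regex/replace pairs and query key as the README example:

```go
package main

import (
    "fmt"
    "net/url"
    "regexp"
)

func main() {
    u, _ := url.Parse("https://www.demo.com/article/")

    // domain: match "www", replace with "amp"  -> amp.demo.com
    u.Host = regexp.MustCompile("www").ReplaceAllString(u.Host, "amp")

    // path: match "^/", replace with "/amp/"   -> /amp/article/
    u.Path = regexp.MustCompile("^/").ReplaceAllString(u.Path, "/amp/")

    // query: key "amp", value "1"               -> ?amp=1
    q := u.Query()
    q.Set("amp", "1")
    u.RawQuery = q.Encode()

    fmt.Println(u.String()) // https://amp.demo.com/amp/article/?amp=1
}
```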

View File

@@ -80,12 +80,8 @@ func main() {
 	}
 	app.Get("/", handlers.Form)
+	app.Get("ruleset", handlers.Ruleset)
 	app.Get("raw/*", handlers.Raw)
 	app.Get("api/*", handlers.Api)
-	app.Get("ruleset", handlers.Raw)
 	app.Get("/*", handlers.ProxySite(*ruleset))
 	log.Fatal(app.Listen(":" + *port))
 }

View File

@@ -30,6 +30,59 @@ func init() {
 	}
 }

+// extracts a URL from the request ctx. If the URL in the request
+// is a relative path, it reconstructs the full URL using the referer header.
+func extractUrl(c *fiber.Ctx) (string, error) {
+	// try to extract url-encoded
+	reqUrl, err := url.QueryUnescape(c.Params("*"))
+	if err != nil {
+		// fallback
+		reqUrl = c.Params("*")
+	}
+
+	// Extract the actual path from req ctx
+	urlQuery, err := url.Parse(reqUrl)
+	if err != nil {
+		return "", fmt.Errorf("error parsing request URL '%s': %v", reqUrl, err)
+	}
+
+	isRelativePath := urlQuery.Scheme == ""
+
+	// eg: https://localhost:8080/images/foobar.jpg -> https://realsite.com/images/foobar.jpg
+	if isRelativePath {
+		// Parse the referer URL from the request header.
+		refererUrl, err := url.Parse(c.Get("referer"))
+		if err != nil {
+			return "", fmt.Errorf("error parsing referer URL from req: '%s': %v", reqUrl, err)
+		}
+
+		// Extract the real url from the referer path
+		realUrl, err := url.Parse(strings.TrimPrefix(refererUrl.Path, "/"))
+		if err != nil {
+			return "", fmt.Errorf("error parsing real URL from referer '%s': %v", refererUrl.Path, err)
+		}
+
+		// reconstruct the full URL using the referer's scheme, host, and the relative path / queries
+		fullUrl := &url.URL{
+			Scheme:   realUrl.Scheme,
+			Host:     realUrl.Host,
+			Path:     urlQuery.Path,
+			RawQuery: urlQuery.RawQuery,
+		}
+
+		if os.Getenv("LOG_URLS") == "true" {
+			log.Printf("modified relative URL: '%s' -> '%s'", reqUrl, fullUrl.String())
+		}
+		return fullUrl.String(), nil
+	}
+
+	// default behavior:
+	// eg: https://localhost:8080/https://realsite.com/images/foobar.jpg -> https://realsite.com/images/foobar.jpg
+	return urlQuery.String(), nil
+}
+
 func ProxySite(rulesetPath string) fiber.Handler {
 	if rulesetPath != "" {
 		rs, err := ruleset.NewRuleset(rulesetPath)
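
As a standalone illustration of the referer-based reconstruction in extractUrl above (hypothetical hostnames, no fiber dependency):

```go
package main

import (
    "fmt"
    "net/url"
    "strings"
)

func main() {
    // Relative asset path as it reaches the proxy (hypothetical example).
    reqUrl, _ := url.Parse("/images/foobar.jpg")

    // Referer sent by the browser: proxy origin with the proxied site's URL in the path.
    refererUrl, _ := url.Parse("http://localhost:8080/https://realsite.com/some/article")

    // The proxied site's URL is the referer path minus the leading "/".
    realUrl, _ := url.Parse(strings.TrimPrefix(refererUrl.Path, "/"))

    // Rebuild the full upstream URL from the real scheme/host plus the relative path and query.
    fullUrl := &url.URL{
        Scheme:   realUrl.Scheme,
        Host:     realUrl.Host,
        Path:     reqUrl.Path,
        RawQuery: reqUrl.RawQuery,
    }
    fmt.Println(fullUrl.String()) // https://realsite.com/images/foobar.jpg
}
```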
@@ -41,7 +94,10 @@ func ProxySite(rulesetPath string) fiber.Handler {
 	return func(c *fiber.Ctx) error {
 		// Get the url from the URL
-		url := c.Params("*")
+		url, err := extractUrl(c)
+		if err != nil {
+			log.Println("ERROR In URL extraction:", err)
+		}
 		queries := c.Queries()

 		body, _, resp, err := fetchSite(url, queries)
@@ -58,6 +114,42 @@ func ProxySite(rulesetPath string) fiber.Handler {
 	}
 }

+func modifyURL(uri string, rule ruleset.Rule) (string, error) {
+	newUrl, err := url.Parse(uri)
+	if err != nil {
+		return "", err
+	}
+
+	for _, urlMod := range rule.UrlMods.Domain {
+		re := regexp.MustCompile(urlMod.Match)
+		newUrl.Host = re.ReplaceAllString(newUrl.Host, urlMod.Replace)
+	}
+
+	for _, urlMod := range rule.UrlMods.Path {
+		re := regexp.MustCompile(urlMod.Match)
+		newUrl.Path = re.ReplaceAllString(newUrl.Path, urlMod.Replace)
+	}
+
+	v := newUrl.Query()
+	for _, query := range rule.UrlMods.Query {
+		if query.Value == "" {
+			v.Del(query.Key)
+			continue
+		}
+		v.Set(query.Key, query.Value)
+	}
+	newUrl.RawQuery = v.Encode()
+
+	if rule.GoogleCache {
+		newUrl, err = url.Parse("https://webcache.googleusercontent.com/search?q=cache:" + newUrl.String())
+		if err != nil {
+			return "", err
+		}
+	}
+
+	return newUrl.String(), nil
+}
+
 func fetchSite(urlpath string, queries map[string]string) (string, *http.Request, *http.Response, error) {
 	urlQuery := "?"
 	if len(queries) > 0 {
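
One detail of modifyURL above worth spelling out: a query entry with an empty value deletes that key instead of setting it. A minimal standalone sketch of that behavior (hypothetical URL):

```go
package main

import (
    "fmt"
    "net/url"
)

func main() {
    u, _ := url.Parse("https://example.com/article?utm_source=feed")
    q := u.Query()

    // Empty value in the rule -> the key is removed from the query.
    q.Del("utm_source")
    // Non-empty value -> the key is set or overwritten.
    q.Set("amp", "1")

    u.RawQuery = q.Encode()
    fmt.Println(u.String()) // https://example.com/article?amp=1
}
```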
@@ -81,18 +173,16 @@ func fetchSite(urlpath string, queries map[string]string) (string, *http.Request, *http.Response, error) {
 		log.Println(u.String() + urlQuery)
 	}

+	// Modify the URI according to ruleset
 	rule := fetchRule(u.Host, u.Path)
-	if rule.GoogleCache {
-		u, err = url.Parse("https://webcache.googleusercontent.com/search?q=cache:" + u.String())
-		if err != nil {
-			return "", nil, nil, err
-		}
+	url, err := modifyURL(u.String()+urlQuery, rule)
+	if err != nil {
+		return "", nil, nil, err
 	}

 	// Fetch the site
 	client := &http.Client{}
-	req, _ := http.NewRequest("GET", u.String()+urlQuery, nil)
+	req, _ := http.NewRequest("GET", url, nil)

 	if rule.Headers.UserAgent != "" {
 		req.Header.Set("User-Agent", rule.Headers.UserAgent)
@@ -132,6 +222,7 @@ func fetchSite(urlpath string, queries map[string]string) (string, *http.Request, *http.Response, error) {
 	}

 	if rule.Headers.CSP != "" {
+		//log.Println(rule.Headers.CSP)
 		resp.Header.Set("Content-Security-Policy", rule.Headers.CSP)
 	}

View File

@@ -1,29 +0,0 @@
-package handlers
-
-type Regex struct {
-	Match   string `yaml:"match"`
-	Replace string `yaml:"replace"`
-}
-
-type RuleSet []Rule
-
-type Rule struct {
-	Domain  string   `yaml:"domain,omitempty"`
-	Domains []string `yaml:"domains,omitempty"`
-	Paths   []string `yaml:"paths,omitempty"`
-	Headers struct {
-		UserAgent     string `yaml:"user-agent,omitempty"`
-		XForwardedFor string `yaml:"x-forwarded-for,omitempty"`
-		Referer       string `yaml:"referer,omitempty"`
-		Cookie        string `yaml:"cookie,omitempty"`
-		CSP           string `yaml:"content-security-policy,omitempty"`
-	} `yaml:"headers,omitempty"`
-	GoogleCache bool    `yaml:"googleCache,omitempty"`
-	RegexRules  []Regex `yaml:"regexRules"`
-	Injections  []struct {
-		Position string `yaml:"position"`
-		Append   string `yaml:"append"`
-		Prepend  string `yaml:"prepend"`
-		Replace  string `yaml:"replace"`
-	} `yaml:"injections"`
-}

View File

@@ -20,6 +20,10 @@ type Regex struct {
 	Match   string `yaml:"match"`
 	Replace string `yaml:"replace"`
 }

+type KV struct {
+	Key   string `yaml:"key"`
+	Value string `yaml:"value"`
+}
+
 type RuleSet []Rule
@@ -36,7 +40,14 @@ type Rule struct {
 	} `yaml:"headers,omitempty"`
 	GoogleCache bool    `yaml:"googleCache,omitempty"`
 	RegexRules  []Regex `yaml:"regexRules"`
-	Injections  []struct {
+	UrlMods     struct {
+		Domain []Regex `yaml:"domain"`
+		Path   []Regex `yaml:"path"`
+		Query  []KV    `yaml:"query"`
+	} `yaml:"urlMods"`
+	Injections []struct {
 		Position string `yaml:"position"`
 		Append   string `yaml:"append"`
 		Prepend  string `yaml:"prepend"`
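
A minimal sketch of how a urlMods entry decodes into the new fields, assuming gopkg.in/yaml.v3 (the YAML library is not shown in this diff) and trimming the Rule struct down to the fields above:

```go
package main

import (
    "fmt"

    "gopkg.in/yaml.v3"
)

type Regex struct {
    Match   string `yaml:"match"`
    Replace string `yaml:"replace"`
}

type KV struct {
    Key   string `yaml:"key"`
    Value string `yaml:"value"`
}

// Trimmed-down Rule with only the fields relevant to urlMods.
type Rule struct {
    Domain  string `yaml:"domain,omitempty"`
    UrlMods struct {
        Domain []Regex `yaml:"domain"`
        Path   []Regex `yaml:"path"`
        Query  []KV    `yaml:"query"`
    } `yaml:"urlMods"`
}

func main() {
    doc := `
domain: demo.com
urlMods:
  query:
    - key: amp
      value: "1"
`
    var r Rule
    if err := yaml.Unmarshal([]byte(doc), &r); err != nil {
        panic(err)
    }
    fmt.Printf("%s %+v\n", r.Domain, r.UrlMods.Query) // demo.com [{Key:amp Value:1}]
}
```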

View File

@@ -163,6 +163,14 @@
     user-agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36
     content-security-policy: script-src 'self';
     cookie:
+- domain: tagesspiegel.de
+  headers:
+    content-security-policy: script-src 'self';
+    user-agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36
+  urlMods:
+    query:
+      - key: amp
+        value: 1
 - domain: www.ft.com
   headers:
     referer: https://t.co/x?amp=1
@@ -182,4 +190,5 @@
           cookie.forEach(el => { el.remove(); });
         }, 1000);
       })
     </script>