diff --git a/README.md b/README.md
index 48c8783..7d5e14a 100644
--- a/README.md
+++ b/README.md
@@ -17,7 +17,7 @@ Freedom of information is an essential pillar of democracy and informed decision
### Features
- [x] Bypass Paywalls
- [x] Remove CORS headers from responses, assets, and images ...
-- [x] Apply domain based ruleset/code to modify response
+- [x] Apply domain based ruleset/code to modify response / requested URL
- [x] Keep site browsable
- [x] API
- [x] Fetch RAW HTML
@@ -115,7 +115,7 @@ http://localhost:8080/ruleset
### Ruleset
-It is possible to apply custom rules to modify the response. This can be used to remove unwanted or modify elements from the page. The ruleset is a YAML file that contains a list of rules for each domain and is loaded on startup
+It is possible to apply custom rules to modify the response or the requested URL. This can be used to remove or modify unwanted elements on the page. The ruleset is a YAML file that contains a list of rules for each domain and is loaded on startup.
See in [ruleset.yaml](ruleset.yaml) for an example.
@@ -155,4 +155,18 @@ See in [ruleset.yaml](ruleset.yaml) for an example.
- position: .left-content article # Position where to inject the code into DOM
prepend: |
Subtitle
+- domain: demo.com
+ headers:
+ content-security-policy: script-src 'self';
+ user-agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36
+ urlMods: # Modify the URL
+ query:
+ - key: amp # (this will append ?amp=1 to the URL)
+ value: 1
+ domain:
+ - match: www # regex to match part of domain
+ replace: amp # (this would modify the domain from www.demo.com to amp.demo.com)
+ path:
+ - match: ^ # regex to match part of path
+ replace: /amp/ # (modify the url from https://www.demo.com/article/ to https://www.demo.com/amp/article/)
```
diff --git a/cmd/main.go b/cmd/main.go
index 256c52e..54e7a86 100644
--- a/cmd/main.go
+++ b/cmd/main.go
@@ -80,12 +80,8 @@ func main() {
}
app.Get("/", handlers.Form)
- app.Get("ruleset", handlers.Ruleset)
-
app.Get("raw/*", handlers.Raw)
app.Get("api/*", handlers.Api)
- app.Get("ruleset", handlers.Raw)
app.Get("/*", handlers.ProxySite(*ruleset))
-
log.Fatal(app.Listen(":" + *port))
}
diff --git a/handlers/proxy.go b/handlers/proxy.go
index 18a6f41..10106f1 100644
--- a/handlers/proxy.go
+++ b/handlers/proxy.go
@@ -30,6 +30,59 @@ func init() {
}
}
+// extractUrl returns the absolute target URL for the request in c.
+// The wildcard route parameter is query-unescaped (raw value on failure) and
+// parsed; a value with a scheme is returned as-is, otherwise it is treated as
+// a relative path and combined with the scheme and host of the proxied URL
+// found in the path of the Referer header.
+func extractUrl(c *fiber.Ctx) (string, error) {
+	// try to extract url-encoded
+	reqUrl, err := url.QueryUnescape(c.Params("*"))
+	if err != nil {
+		// fallback
+		reqUrl = c.Params("*")
+	}
+
+	// Extract the actual path from req ctx
+	urlQuery, err := url.Parse(reqUrl)
+	if err != nil {
+		return "", fmt.Errorf("error parsing request URL '%s': %w", reqUrl, err)
+	}
+
+	// default behavior:
+	// eg: https://localhost:8080/https://realsite.com/images/foobar.jpg -> https://realsite.com/images/foobar.jpg
+	if urlQuery.Scheme != "" {
+		return urlQuery.String(), nil
+	}
+
+	// relative path: rebuild the URL from the referer
+	// eg: https://localhost:8080/images/foobar.jpg -> https://realsite.com/images/foobar.jpg
+	referer := c.Get("referer")
+	refererUrl, err := url.Parse(referer)
+	if err != nil {
+		return "", fmt.Errorf("error parsing referer URL from req: '%s': %w", referer, err)
+	}
+
+	// Extract the real url from referer path
+	realUrl, err := url.Parse(strings.TrimPrefix(refererUrl.Path, "/"))
+	if err != nil {
+		return "", fmt.Errorf("error parsing real URL from referer '%s': %w", refererUrl.Path, err)
+	}
+
+	// reconstruct the full URL using the referer's scheme, host, and the relative path / queries
+	fullUrl := &url.URL{
+		Scheme:   realUrl.Scheme,
+		Host:     realUrl.Host,
+		Path:     urlQuery.Path,
+		RawQuery: urlQuery.RawQuery,
+	}
+
+	if os.Getenv("LOG_URLS") == "true" {
+		log.Printf("modified relative URL: '%s' -> '%s'", reqUrl, fullUrl.String())
+	}
+	return fullUrl.String(), nil
+}
+
func ProxySite(rulesetPath string) fiber.Handler {
if rulesetPath != "" {
rs, err := ruleset.NewRuleset(rulesetPath)
@@ -41,7 +94,10 @@ func ProxySite(rulesetPath string) fiber.Handler {
return func(c *fiber.Ctx) error {
// Get the url from the URL
- url := c.Params("*")
+ url, err := extractUrl(c)
+ if err != nil {
+ log.Println("ERROR In URL extraction:", err)
+ }
queries := c.Queries()
body, _, resp, err := fetchSite(url, queries)
@@ -58,6 +114,42 @@ func ProxySite(rulesetPath string) fiber.Handler {
}
}
+// modifyURL applies the URL modifications of rule to uri and returns the result.
+//
+// Domain regex replacements are applied to the host, then path regex
+// replacements to the path, then each configured query key is set (or deleted
+// when its configured value is empty). When rule.GoogleCache is set, the
+// resulting URL is wrapped in a Google web-cache lookup URL.
+//
+// It returns an error when uri cannot be parsed or when a regex in the rule
+// does not compile.
+func modifyURL(uri string, rule ruleset.Rule) (string, error) {
+	newUrl, err := url.Parse(uri)
+	if err != nil {
+		return "", err
+	}
+
+	for _, urlMod := range rule.UrlMods.Domain {
+		// Patterns come from the ruleset file; report a bad one as an error
+		// rather than panicking inside the request handler.
+		re, err := regexp.Compile(urlMod.Match)
+		if err != nil {
+			return "", fmt.Errorf("invalid domain regex %q: %w", urlMod.Match, err)
+		}
+		newUrl.Host = re.ReplaceAllString(newUrl.Host, urlMod.Replace)
+	}
+
+	for _, urlMod := range rule.UrlMods.Path {
+		re, err := regexp.Compile(urlMod.Match)
+		if err != nil {
+			return "", fmt.Errorf("invalid path regex %q: %w", urlMod.Match, err)
+		}
+		newUrl.Path = re.ReplaceAllString(newUrl.Path, urlMod.Replace)
+	}
+
+	v := newUrl.Query()
+	for _, query := range rule.UrlMods.Query {
+		// An empty value means "remove this key from the query string".
+		if query.Value == "" {
+			v.Del(query.Key)
+			continue
+		}
+		v.Set(query.Key, query.Value)
+	}
+	newUrl.RawQuery = v.Encode()
+
+	if rule.GoogleCache {
+		newUrl, err = url.Parse("https://webcache.googleusercontent.com/search?q=cache:" + newUrl.String())
+		if err != nil {
+			return "", err
+		}
+	}
+
+	return newUrl.String(), nil
+}
+
func fetchSite(urlpath string, queries map[string]string) (string, *http.Request, *http.Response, error) {
urlQuery := "?"
if len(queries) > 0 {
@@ -81,18 +173,16 @@ func fetchSite(urlpath string, queries map[string]string) (string, *http.Request
log.Println(u.String() + urlQuery)
}
+ // Modify the URI according to ruleset
rule := fetchRule(u.Host, u.Path)
-
- if rule.GoogleCache {
- u, err = url.Parse("https://webcache.googleusercontent.com/search?q=cache:" + u.String())
- if err != nil {
- return "", nil, nil, err
- }
+ url, err := modifyURL(u.String()+urlQuery, rule)
+ if err != nil {
+ return "", nil, nil, err
}
// Fetch the site
client := &http.Client{}
- req, _ := http.NewRequest("GET", u.String()+urlQuery, nil)
+ req, _ := http.NewRequest("GET", url, nil)
if rule.Headers.UserAgent != "" {
req.Header.Set("User-Agent", rule.Headers.UserAgent)
@@ -132,6 +222,7 @@ func fetchSite(urlpath string, queries map[string]string) (string, *http.Request
}
if rule.Headers.CSP != "" {
+ //log.Println(rule.Headers.CSP)
resp.Header.Set("Content-Security-Policy", rule.Headers.CSP)
}
diff --git a/handlers/types.go b/handlers/types.go
deleted file mode 100644
index 16e4c49..0000000
--- a/handlers/types.go
+++ /dev/null
@@ -1,29 +0,0 @@
-package handlers
-
-type Regex struct {
- Match string `yaml:"match"`
- Replace string `yaml:"replace"`
-}
-
-type RuleSet []Rule
-
-type Rule struct {
- Domain string `yaml:"domain,omitempty"`
- Domains []string `yaml:"domains,omitempty"`
- Paths []string `yaml:"paths,omitempty"`
- Headers struct {
- UserAgent string `yaml:"user-agent,omitempty"`
- XForwardedFor string `yaml:"x-forwarded-for,omitempty"`
- Referer string `yaml:"referer,omitempty"`
- Cookie string `yaml:"cookie,omitempty"`
- CSP string `yaml:"content-security-policy,omitempty"`
- } `yaml:"headers,omitempty"`
- GoogleCache bool `yaml:"googleCache,omitempty"`
- RegexRules []Regex `yaml:"regexRules"`
- Injections []struct {
- Position string `yaml:"position"`
- Append string `yaml:"append"`
- Prepend string `yaml:"prepend"`
- Replace string `yaml:"replace"`
- } `yaml:"injections"`
-}
diff --git a/pkg/ruleset/ruleset.go b/pkg/ruleset/ruleset.go
index a4efd3d..9029a65 100644
--- a/pkg/ruleset/ruleset.go
+++ b/pkg/ruleset/ruleset.go
@@ -20,6 +20,10 @@ type Regex struct {
Match string `yaml:"match"`
Replace string `yaml:"replace"`
}
+// KV is a key/value pair decoded from the `key` and `value` fields of a
+// ruleset YAML entry.
+type KV struct {
+	Key   string `yaml:"key"`
+	Value string `yaml:"value"`
+}
type RuleSet []Rule
@@ -36,7 +40,14 @@ type Rule struct {
} `yaml:"headers,omitempty"`
GoogleCache bool `yaml:"googleCache,omitempty"`
RegexRules []Regex `yaml:"regexRules"`
- Injections []struct {
+
+ UrlMods struct {
+ Domain []Regex `yaml:"domain"`
+ Path []Regex `yaml:"path"`
+ Query []KV `yaml:"query"`
+ } `yaml:"urlMods"`
+
+ Injections []struct {
Position string `yaml:"position"`
Append string `yaml:"append"`
Prepend string `yaml:"prepend"`
diff --git a/ruleset.yaml b/ruleset.yaml
index 2953736..572b980 100644
--- a/ruleset.yaml
+++ b/ruleset.yaml
@@ -163,6 +163,14 @@
user-agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36
content-security-policy: script-src 'self';
cookie:
+- domain: tagesspiegel.de
+ headers:
+ content-security-policy: script-src 'self';
+ user-agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36
+ urlMods:
+ query:
+ - key: amp
+ value: 1
- domain: www.ft.com
headers:
referer: https://t.co/x?amp=1
@@ -182,4 +190,5 @@
cookie.forEach(el => { el.remove(); });
}, 1000);
})
-
\ No newline at end of file
+
+