Merge pull request #31 from deoxykev/main
Add feature to modify URLs in ruleset | Fix Relative URLs
This commit is contained in:
20
README.md
20
README.md
@@ -17,7 +17,7 @@ Freedom of information is an essential pillar of democracy and informed decision
|
|||||||
### Features
|
### Features
|
||||||
- [x] Bypass Paywalls
|
- [x] Bypass Paywalls
|
||||||
- [x] Remove CORS headers from responses, assets, and images ...
|
- [x] Remove CORS headers from responses, assets, and images ...
|
||||||
- [x] Apply domain based ruleset/code to modify response
|
- [x] Apply domain based ruleset/code to modify response / requested URL
|
||||||
- [x] Keep site browsable
|
- [x] Keep site browsable
|
||||||
- [x] API
|
- [x] API
|
||||||
- [x] Fetch RAW HTML
|
- [x] Fetch RAW HTML
|
||||||
@@ -115,7 +115,7 @@ http://localhost:8080/ruleset
|
|||||||
|
|
||||||
### Ruleset
|
### Ruleset
|
||||||
|
|
||||||
It is possible to apply custom rules to modify the response. This can be used to remove unwanted or modify elements from the page. The ruleset is a YAML file that contains a list of rules for each domain and is loaded on startup
|
It is possible to apply custom rules to modify the response or the requested URL. This can be used to remove unwanted or modify elements from the page. The ruleset is a YAML file that contains a list of rules for each domain and is loaded on startup
|
||||||
|
|
||||||
See in [ruleset.yaml](ruleset.yaml) for an example.
|
See in [ruleset.yaml](ruleset.yaml) for an example.
|
||||||
|
|
||||||
@@ -154,5 +154,19 @@ See in [ruleset.yaml](ruleset.yaml) for an example.
|
|||||||
<h1>My Custom Title</h1>
|
<h1>My Custom Title</h1>
|
||||||
- position: .left-content article # Position where to inject the code into DOM
|
- position: .left-content article # Position where to inject the code into DOM
|
||||||
prepend: |
|
prepend: |
|
||||||
<h2>Subtitle</h2>
|
<h2>Subtitle</h2>
|
||||||
|
- domain: tagesspiegel.de
|
||||||
|
headers:
|
||||||
|
content-security-policy: script-src 'self';
|
||||||
|
user-agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36
|
||||||
|
urlMods: # Modify the URL
|
||||||
|
query:
|
||||||
|
- key: amp # (this will append ?amp=1 to the URL)
|
||||||
|
value: 1
|
||||||
|
domain:
|
||||||
|
- match: www # regex to match part of domain
|
||||||
|
replace: amp # (this would modify the domain from www.tagesspiegel.de to amp.tagesspiegel.de)
|
||||||
|
path:
|
||||||
|
- match: ^ # regex to match part of path
|
||||||
|
replace: /amp/ # (modify the url from https://www.tagesspiegel.de/internationales/ to https://www.tagesspiegel.de/amp/internationales/)
|
||||||
```
|
```
|
||||||
|
|||||||
@@ -22,9 +22,65 @@ var (
|
|||||||
allowedDomains = strings.Split(os.Getenv("ALLOWED_DOMAINS"), ",")
|
allowedDomains = strings.Split(os.Getenv("ALLOWED_DOMAINS"), ",")
|
||||||
)
|
)
|
||||||
|
|
||||||
|
// extracts a URL from the request ctx. If the URL in the request
|
||||||
|
// is a relative path, it reconstructs the full URL using the referer header.
|
||||||
|
func extractUrl(c *fiber.Ctx) (string, error) {
|
||||||
|
// try to extract url-encoded
|
||||||
|
reqUrl, err := url.QueryUnescape(c.Params("*"))
|
||||||
|
if err != nil {
|
||||||
|
// fallback
|
||||||
|
reqUrl = c.Params("*")
|
||||||
|
}
|
||||||
|
|
||||||
|
// Extract the actual path from req ctx
|
||||||
|
urlQuery, err := url.Parse(reqUrl)
|
||||||
|
if err != nil {
|
||||||
|
return "", fmt.Errorf("error parsing request URL '%s': %v", reqUrl, err)
|
||||||
|
}
|
||||||
|
|
||||||
|
isRelativePath := urlQuery.Scheme == ""
|
||||||
|
|
||||||
|
// eg: https://localhost:8080/images/foobar.jpg -> https://realsite.com/images/foobar.jpg
|
||||||
|
if isRelativePath {
|
||||||
|
// Parse the referer URL from the request header.
|
||||||
|
refererUrl, err := url.Parse(c.Get("referer"))
|
||||||
|
if err != nil {
|
||||||
|
return "", fmt.Errorf("error parsing referer URL from req: '%s': %v", reqUrl, err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Extract the real url from referer path
|
||||||
|
realUrl, err := url.Parse(strings.TrimPrefix(refererUrl.Path, "/"))
|
||||||
|
if err != nil {
|
||||||
|
return "", fmt.Errorf("error parsing real URL from referer '%s': %v", refererUrl.Path, err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// reconstruct the full URL using the referer's scheme, host, and the relative path / queries
|
||||||
|
fullUrl := &url.URL{
|
||||||
|
Scheme: realUrl.Scheme,
|
||||||
|
Host: realUrl.Host,
|
||||||
|
Path: urlQuery.Path,
|
||||||
|
RawQuery: urlQuery.RawQuery,
|
||||||
|
}
|
||||||
|
|
||||||
|
if os.Getenv("LOG_URLS") == "true" {
|
||||||
|
log.Printf("modified relative URL: '%s' -> '%s'", reqUrl, fullUrl.String())
|
||||||
|
}
|
||||||
|
return fullUrl.String(), nil
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
// default behavior:
|
||||||
|
// eg: https://localhost:8080/https://realsite.com/images/foobar.jpg -> https://realsite.com/images/foobar.jpg
|
||||||
|
return urlQuery.String(), nil
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
func ProxySite(c *fiber.Ctx) error {
|
func ProxySite(c *fiber.Ctx) error {
|
||||||
// Get the url from the URL
|
// Get the url from the URL
|
||||||
url := c.Params("*")
|
url, err := extractUrl(c)
|
||||||
|
if err != nil {
|
||||||
|
log.Println("ERROR In URL extraction:", err)
|
||||||
|
}
|
||||||
|
|
||||||
queries := c.Queries()
|
queries := c.Queries()
|
||||||
body, _, resp, err := fetchSite(url, queries)
|
body, _, resp, err := fetchSite(url, queries)
|
||||||
@@ -40,6 +96,42 @@ func ProxySite(c *fiber.Ctx) error {
|
|||||||
return c.SendString(body)
|
return c.SendString(body)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func modifyURL(uri string, rule Rule) (string, error) {
|
||||||
|
newUrl, err := url.Parse(uri)
|
||||||
|
if err != nil {
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, urlMod := range rule.UrlMods.Domain {
|
||||||
|
re := regexp.MustCompile(urlMod.Match)
|
||||||
|
newUrl.Host = re.ReplaceAllString(newUrl.Host, urlMod.Replace)
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, urlMod := range rule.UrlMods.Path {
|
||||||
|
re := regexp.MustCompile(urlMod.Match)
|
||||||
|
newUrl.Path = re.ReplaceAllString(newUrl.Path, urlMod.Replace)
|
||||||
|
}
|
||||||
|
|
||||||
|
v := newUrl.Query()
|
||||||
|
for _, query := range rule.UrlMods.Query {
|
||||||
|
if query.Value == "" {
|
||||||
|
v.Del(query.Key)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
v.Set(query.Key, query.Value)
|
||||||
|
}
|
||||||
|
newUrl.RawQuery = v.Encode()
|
||||||
|
|
||||||
|
if rule.GoogleCache {
|
||||||
|
newUrl, err = url.Parse("https://webcache.googleusercontent.com/search?q=cache:" + newUrl.String())
|
||||||
|
if err != nil {
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return newUrl.String(), nil
|
||||||
|
}
|
||||||
|
|
||||||
func fetchSite(urlpath string, queries map[string]string) (string, *http.Request, *http.Response, error) {
|
func fetchSite(urlpath string, queries map[string]string) (string, *http.Request, *http.Response, error) {
|
||||||
urlQuery := "?"
|
urlQuery := "?"
|
||||||
if len(queries) > 0 {
|
if len(queries) > 0 {
|
||||||
@@ -63,18 +155,16 @@ func fetchSite(urlpath string, queries map[string]string) (string, *http.Request
|
|||||||
log.Println(u.String() + urlQuery)
|
log.Println(u.String() + urlQuery)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Modify the URI according to ruleset
|
||||||
rule := fetchRule(u.Host, u.Path)
|
rule := fetchRule(u.Host, u.Path)
|
||||||
|
url, err := modifyURL(u.String()+urlQuery, rule)
|
||||||
if rule.GoogleCache {
|
if err != nil {
|
||||||
u, err = url.Parse("https://webcache.googleusercontent.com/search?q=cache:" + u.String())
|
return "", nil, nil, err
|
||||||
if err != nil {
|
|
||||||
return "", nil, nil, err
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Fetch the site
|
// Fetch the site
|
||||||
client := &http.Client{}
|
client := &http.Client{}
|
||||||
req, _ := http.NewRequest("GET", u.String()+urlQuery, nil)
|
req, _ := http.NewRequest("GET", url, nil)
|
||||||
|
|
||||||
if rule.Headers.UserAgent != "" {
|
if rule.Headers.UserAgent != "" {
|
||||||
req.Header.Set("User-Agent", rule.Headers.UserAgent)
|
req.Header.Set("User-Agent", rule.Headers.UserAgent)
|
||||||
@@ -114,6 +204,7 @@ func fetchSite(urlpath string, queries map[string]string) (string, *http.Request
|
|||||||
}
|
}
|
||||||
|
|
||||||
if rule.Headers.CSP != "" {
|
if rule.Headers.CSP != "" {
|
||||||
|
log.Println(rule.Headers.CSP)
|
||||||
resp.Header.Set("Content-Security-Policy", rule.Headers.CSP)
|
resp.Header.Set("Content-Security-Policy", rule.Headers.CSP)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -4,6 +4,10 @@ type Regex struct {
|
|||||||
Match string `yaml:"match"`
|
Match string `yaml:"match"`
|
||||||
Replace string `yaml:"replace"`
|
Replace string `yaml:"replace"`
|
||||||
}
|
}
|
||||||
|
// KV is a key/value pair loaded from the ruleset YAML; used by urlMods
// to set (or, with an empty value, delete) a query parameter.
type KV struct {
	Key string `yaml:"key"`
	Value string `yaml:"value"`
}
|
||||||
|
|
||||||
type RuleSet []Rule
|
type RuleSet []Rule
|
||||||
|
|
||||||
@@ -20,7 +24,14 @@ type Rule struct {
|
|||||||
} `yaml:"headers,omitempty"`
|
} `yaml:"headers,omitempty"`
|
||||||
GoogleCache bool `yaml:"googleCache,omitempty"`
|
GoogleCache bool `yaml:"googleCache,omitempty"`
|
||||||
RegexRules []Regex `yaml:"regexRules"`
|
RegexRules []Regex `yaml:"regexRules"`
|
||||||
Injections []struct {
|
|
||||||
|
UrlMods struct {
|
||||||
|
Domain []Regex `yaml:"domain"`
|
||||||
|
Path []Regex `yaml:"path"`
|
||||||
|
Query []KV `yaml:"query"`
|
||||||
|
} `yaml:"urlMods"`
|
||||||
|
|
||||||
|
Injections []struct {
|
||||||
Position string `yaml:"position"`
|
Position string `yaml:"position"`
|
||||||
Append string `yaml:"append"`
|
Append string `yaml:"append"`
|
||||||
Prepend string `yaml:"prepend"`
|
Prepend string `yaml:"prepend"`
|
||||||
|
|||||||
11
ruleset.yaml
11
ruleset.yaml
@@ -163,6 +163,14 @@
|
|||||||
user-agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36
|
user-agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36
|
||||||
content-security-policy: script-src 'self';
|
content-security-policy: script-src 'self';
|
||||||
cookie:
|
cookie:
|
||||||
|
- domain: tagesspiegel.de
|
||||||
|
headers:
|
||||||
|
content-security-policy: script-src 'self';
|
||||||
|
user-agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36
|
||||||
|
urlMods:
|
||||||
|
query:
|
||||||
|
- key: amp
|
||||||
|
value: 1
|
||||||
- domain: www.ft.com
|
- domain: www.ft.com
|
||||||
headers:
|
headers:
|
||||||
referer: https://t.co/x?amp=1
|
referer: https://t.co/x?amp=1
|
||||||
@@ -182,4 +190,5 @@
|
|||||||
cookie.forEach(el => { el.remove(); });
|
cookie.forEach(el => { el.remove(); });
|
||||||
}, 1000);
|
}, 1000);
|
||||||
})
|
})
|
||||||
</script>
|
</script>
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user