Add feature to modify URLs in ruleset
This commit is contained in:
18
README.md
18
README.md
@@ -17,7 +17,7 @@ Freedom of information is an essential pillar of democracy and informed decision
|
||||
### Features
|
||||
- [x] Bypass Paywalls
|
||||
- [x] Remove CORS headers from responses, assets, and images ...
|
||||
- [x] Apply domain based ruleset/code to modify response
|
||||
- [x] Apply domain based ruleset/code to modify response / requested URL
|
||||
- [x] Keep site browsable
|
||||
- [x] API
|
||||
- [x] Fetch RAW HTML
|
||||
@@ -115,7 +115,7 @@ http://localhost:8080/ruleset
|
||||
|
||||
### Ruleset
|
||||
|
||||
It is possible to apply custom rules to modify the response. This can be used to remove unwanted or modify elements from the page. The ruleset is a YAML file that contains a list of rules for each domain and is loaded on startup
|
||||
It is possible to apply custom rules to modify the response or the requested URL. This can be used to remove or modify unwanted elements on the page. The ruleset is a YAML file that contains a list of rules for each domain and is loaded on startup.
|
||||
|
||||
See [ruleset.yaml](ruleset.yaml) for an example.
|
||||
|
||||
@@ -155,4 +155,18 @@ See in [ruleset.yaml](ruleset.yaml) for an example.
|
||||
- position: .left-content article # Position where to inject the code into DOM
|
||||
prepend: |
|
||||
<h2>Suptitle</h2>
|
||||
- domain: tagesspiegel.de
|
||||
headers:
|
||||
content-security-policy: script-src 'self';
|
||||
user-agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36
|
||||
urlMods: # Modify the URL
|
||||
query:
|
||||
- key: amp # (this will append ?amp=1 to the URL)
|
||||
value: 1
|
||||
domain:
|
||||
- match: www # regex to match part of domain
|
||||
replace: amp # (this would modify the domain from www.tagesspiegel.de to amp.tagesspiegel.de)
|
||||
path:
|
||||
- match: ^ # regex to match part of path
|
||||
replace: /amp # (modify the url from https://www.tagesspiegel.de/internationales/ to https://www.tagesspiegel.de/amp/internationales/)
|
||||
```
|
||||
|
||||
@@ -40,6 +40,41 @@ func ProxySite(c *fiber.Ctx) error {
|
||||
return c.SendString(body)
|
||||
}
|
||||
|
||||
func modifyURL(uri string, rule Rule) (string, error) {
|
||||
newUrl, err := url.Parse(uri)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
for _, urlMod := range rule.UrlMods.Domain {
|
||||
re := regexp.MustCompile(urlMod.Match)
|
||||
newUrl.Host = re.ReplaceAllString(newUrl.Host, urlMod.Replace)
|
||||
}
|
||||
|
||||
for _, urlMod := range rule.UrlMods.Path {
|
||||
re := regexp.MustCompile(urlMod.Match)
|
||||
newUrl.Path = re.ReplaceAllString(newUrl.Path, urlMod.Replace)
|
||||
}
|
||||
|
||||
v := newUrl.Query()
|
||||
for _, query := range rule.UrlMods.Query {
|
||||
if query.Value == "" {
|
||||
v.Del(query.Key)
|
||||
continue
|
||||
}
|
||||
v.Set(query.Key, query.Value)
|
||||
}
|
||||
newUrl.RawQuery = v.Encode()
|
||||
|
||||
if rule.GoogleCache {
|
||||
newUrl, err = url.Parse("https://webcache.googleusercontent.com/search?q=cache:" + newUrl.String())
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
}
|
||||
return newUrl.String(), nil
|
||||
}
|
||||
|
||||
func fetchSite(urlpath string, queries map[string]string) (string, *http.Request, *http.Response, error) {
|
||||
urlQuery := "?"
|
||||
if len(queries) > 0 {
|
||||
@@ -63,18 +98,17 @@ func fetchSite(urlpath string, queries map[string]string) (string, *http.Request
|
||||
log.Println(u.String() + urlQuery)
|
||||
}
|
||||
|
||||
// Modify the URI according to ruleset
|
||||
rule := fetchRule(u.Host, u.Path)
|
||||
|
||||
if rule.GoogleCache {
|
||||
u, err = url.Parse("https://webcache.googleusercontent.com/search?q=cache:" + u.String())
|
||||
if err != nil {
|
||||
return "", nil, nil, err
|
||||
}
|
||||
url, err := modifyURL(u.String()+urlQuery, rule)
|
||||
if err != nil {
|
||||
return "", nil, nil, err
|
||||
}
|
||||
log.Println("fetch URI: %", url)
|
||||
|
||||
// Fetch the site
|
||||
client := &http.Client{}
|
||||
req, _ := http.NewRequest("GET", u.String()+urlQuery, nil)
|
||||
req, _ := http.NewRequest("GET", url, nil)
|
||||
|
||||
if rule.Headers.UserAgent != "" {
|
||||
req.Header.Set("User-Agent", rule.Headers.UserAgent)
|
||||
|
||||
@@ -4,6 +4,10 @@ type Regex struct {
|
||||
Match string `yaml:"match"`
|
||||
Replace string `yaml:"replace"`
|
||||
}
|
||||
// KV is a key/value pair read from the ruleset YAML. modifyURL uses it to set
// or delete a query parameter of the requested URL.
type KV struct {
|
||||
// Key is the name of the query parameter.
Key string `yaml:"key"`
|
||||
// Value is the value to set; modifyURL deletes the parameter when it is empty.
Value string `yaml:"value"`
|
||||
}
|
||||
|
||||
type RuleSet []Rule
|
||||
|
||||
@@ -20,7 +24,14 @@ type Rule struct {
|
||||
} `yaml:"headers,omitempty"`
|
||||
GoogleCache bool `yaml:"googleCache,omitempty"`
|
||||
RegexRules []Regex `yaml:"regexRules"`
|
||||
Injections []struct {
|
||||
|
||||
UrlMods struct {
|
||||
Domain []Regex `yaml:"domain"`
|
||||
Path []Regex `yaml:"path"`
|
||||
Query []KV `yaml:"query"`
|
||||
} `yaml:"urlMods"`
|
||||
|
||||
Injections []struct {
|
||||
Position string `yaml:"position"`
|
||||
Append string `yaml:"append"`
|
||||
Prepend string `yaml:"prepend"`
|
||||
|
||||
@@ -163,3 +163,11 @@
|
||||
user-agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36
|
||||
content-security-policy: script-src 'self';
|
||||
cookie:
|
||||
- domain: tagesspiegel.de
|
||||
headers:
|
||||
content-security-policy: script-src 'self';
|
||||
user-agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36
|
||||
urlMods:
|
||||
query:
|
||||
- key: amp
|
||||
value: 1
|
||||
|
||||
Reference in New Issue
Block a user