265 lines
8.2 KiB
Go
265 lines
8.2 KiB
Go
package rewriters
|
|
|
|
import (
|
|
_ "embed"
|
|
"fmt"
|
|
"golang.org/x/net/html/atom"
|
|
"log"
|
|
"net/url"
|
|
"regexp"
|
|
"strings"
|
|
|
|
"golang.org/x/net/html"
|
|
)
|
|
|
|
// Lookup tables consulted while rewriting tokens. All three start out nil and
// are populated by init.
var (
	// rewriteAttrs maps a tag name to the set of its attributes that may
	// contain a URL.
	rewriteAttrs map[string]map[string]bool

	// specialRewriteAttrs maps a tag name to the attributes whose URLs are
	// embedded in a larger value and are routed through handleSpecialAttr.
	specialRewriteAttrs map[string]map[string]bool

	// schemeBlacklist is the set of URI schemes that are left untouched.
	schemeBlacklist map[string]bool
)
|
|
|
|
func init() {
|
|
// define all tag/attributes which might contain URLs
|
|
// to attempt to rewrite to point to proxy instead
|
|
rewriteAttrs = map[string]map[string]bool{
|
|
"img": {"src": true, "srcset": true, "longdesc": true, "usemap": true},
|
|
"a": {"href": true},
|
|
"form": {"action": true},
|
|
"link": {"href": true, "manifest": true, "icon": true},
|
|
"script": {"src": true},
|
|
"video": {"src": true, "poster": true},
|
|
"audio": {"src": true},
|
|
"iframe": {"src": true, "longdesc": true},
|
|
"embed": {"src": true},
|
|
"object": {"data": true, "codebase": true},
|
|
"source": {"src": true, "srcset": true},
|
|
"track": {"src": true},
|
|
"area": {"href": true},
|
|
"base": {"href": true},
|
|
"blockquote": {"cite": true},
|
|
"del": {"cite": true},
|
|
"ins": {"cite": true},
|
|
"q": {"cite": true},
|
|
"body": {"background": true},
|
|
"button": {"formaction": true},
|
|
"input": {"src": true, "formaction": true},
|
|
"meta": {"content": true},
|
|
}
|
|
|
|
// might contain URL but requires special handling
|
|
specialRewriteAttrs = map[string]map[string]bool{
|
|
"img": {"srcset": true},
|
|
"source": {"srcset": true},
|
|
"meta": {"content": true},
|
|
}
|
|
|
|
// define URIs to NOT rewrite
|
|
// for example: don't overwrite <img src="data:image/png;base64;iVBORw...">"
|
|
schemeBlacklist = map[string]bool{
|
|
"data": true,
|
|
"tel": true,
|
|
"mailto": true,
|
|
"file": true,
|
|
"blob": true,
|
|
"javascript": true,
|
|
"about": true,
|
|
"magnet": true,
|
|
"ws": true,
|
|
"wss": true,
|
|
"ftp": true,
|
|
}
|
|
|
|
}
|
|
|
|
// HTMLTokenURLRewriter implements HTMLTokenRewriter.
// It rewrites the URLs found in HTML token attributes so that they are
// requested through the proxy, for example:
//
//	<img src='/relative_path'> -> <img src='/https://proxiedsite.com/relative_path'>
type HTMLTokenURLRewriter struct {
	// baseURL supplies the scheme and host used to expand relative URLs.
	baseURL *url.URL
	// proxyURL is the ladder URL, not the proxied site URL; values that
	// already start with it are not rewritten again.
	proxyURL string
}
|
|
|
|
// NewHTMLTokenURLRewriter creates a new instance of HTMLResourceURLRewriter.
|
|
// It initializes the tokenizer with the provided source and sets the proxy URL.
|
|
func NewHTMLTokenURLRewriter(baseURL *url.URL, proxyURL string) *HTMLTokenURLRewriter {
|
|
return &HTMLTokenURLRewriter{
|
|
baseURL: baseURL,
|
|
proxyURL: proxyURL,
|
|
}
|
|
}
|
|
|
|
func (r *HTMLTokenURLRewriter) ShouldModify(token *html.Token) bool {
|
|
attrLen := len(token.Attr)
|
|
if attrLen == 0 {
|
|
return false
|
|
}
|
|
if !(token.Type == html.StartTagToken || token.Type == html.SelfClosingTagToken) {
|
|
return false
|
|
}
|
|
return true
|
|
}
|
|
|
|
func (r *HTMLTokenURLRewriter) ModifyToken(token *html.Token) (string, string) {
|
|
for i := range token.Attr {
|
|
attr := &token.Attr[i]
|
|
switch {
|
|
// don't touch tag/attributes that don't contain URIs
|
|
case !rewriteAttrs[token.Data][attr.Key]:
|
|
continue
|
|
// don't touch attributes with special URIs (like data:)
|
|
case schemeBlacklist[strings.Split(attr.Key, ":")[0]]:
|
|
continue
|
|
// don't double-overwrite the url
|
|
case strings.HasPrefix(attr.Val, r.proxyURL):
|
|
continue
|
|
case strings.HasPrefix(attr.Val, "/http://"):
|
|
continue
|
|
case strings.HasPrefix(attr.Val, "/https://"):
|
|
continue
|
|
// handle special rewrites
|
|
case specialRewriteAttrs[token.Data][attr.Key]:
|
|
r.handleSpecialAttr(token, attr, r.baseURL)
|
|
continue
|
|
default:
|
|
// rewrite url
|
|
handleURLPart(attr, r.baseURL)
|
|
}
|
|
}
|
|
return "", ""
|
|
}
|
|
|
|
// dispatcher for ModifyURL based on URI type
|
|
func handleURLPart(attr *html.Attribute, baseURL *url.URL) {
|
|
switch {
|
|
case strings.HasPrefix(attr.Val, "//"):
|
|
handleProtocolRelativePath(attr, baseURL)
|
|
case strings.HasPrefix(attr.Val, "/"):
|
|
handleRootRelativePath(attr, baseURL)
|
|
case strings.HasPrefix(attr.Val, "https://"):
|
|
handleAbsolutePath(attr, baseURL)
|
|
case strings.HasPrefix(attr.Val, "http://"):
|
|
handleAbsolutePath(attr, baseURL)
|
|
default:
|
|
handleDocumentRelativePath(attr, baseURL)
|
|
}
|
|
}
|
|
|
|
// Protocol-relative URLs: These start with "//" and will use the same protocol (http or https) as the current page.
|
|
func handleProtocolRelativePath(attr *html.Attribute, baseURL *url.URL) {
|
|
attr.Val = strings.TrimPrefix(attr.Val, "/")
|
|
handleRootRelativePath(attr, baseURL)
|
|
log.Printf("proto rel url rewritten-> '%s'='%s'", attr.Key, attr.Val)
|
|
}
|
|
|
|
// Root-relative URLs: These are relative to the root path and start with a "/".
|
|
func handleRootRelativePath(attr *html.Attribute, baseURL *url.URL) {
|
|
// doublecheck this is a valid relative URL
|
|
log.Printf("PROCESSING: key: %s val: %s\n", attr.Key, attr.Val)
|
|
_, err := url.Parse(fmt.Sprintf("http://localhost.com%s", attr.Val))
|
|
if err != nil {
|
|
log.Println(err)
|
|
return
|
|
}
|
|
|
|
//log.Printf("BASEURL patch: %s\n", baseURL)
|
|
|
|
attr.Val = fmt.Sprintf(
|
|
"/%s://%s/%s",
|
|
baseURL.Scheme,
|
|
baseURL.Host,
|
|
strings.TrimPrefix(attr.Val, "/"),
|
|
)
|
|
attr.Val = escape(attr.Val)
|
|
attr.Val = fmt.Sprintf("/%s", attr.Val)
|
|
|
|
log.Printf("root rel url rewritten-> '%s'='%s'", attr.Key, attr.Val)
|
|
}
|
|
|
|
// Document-relative URLs: These are relative to the current document's path and don't start with a "/".
|
|
func handleDocumentRelativePath(attr *html.Attribute, baseURL *url.URL) {
|
|
log.Printf("PROCESSING: key: %s val: %s\n", attr.Key, attr.Val)
|
|
attr.Val = fmt.Sprintf(
|
|
"%s://%s/%s%s",
|
|
baseURL.Scheme,
|
|
strings.Trim(baseURL.Host, "/"),
|
|
strings.Trim(baseURL.RawPath, "/"),
|
|
strings.Trim(attr.Val, "/"),
|
|
)
|
|
attr.Val = escape(attr.Val)
|
|
attr.Val = fmt.Sprintf("/%s", attr.Val)
|
|
log.Printf("doc rel url rewritten-> '%s'='%s'", attr.Key, attr.Val)
|
|
}
|
|
|
|
// full URIs beginning with https?://proxiedsite.com
|
|
func handleAbsolutePath(attr *html.Attribute, baseURL *url.URL) {
|
|
// check if valid URL
|
|
log.Printf("PROCESSING: key: %s val: %s\n", attr.Key, attr.Val)
|
|
u, err := url.Parse(attr.Val)
|
|
if err != nil {
|
|
return
|
|
}
|
|
if !(u.Scheme == "http" || u.Scheme == "https") {
|
|
return
|
|
}
|
|
attr.Val = fmt.Sprintf("/%s", escape(strings.TrimPrefix(attr.Val, "/")))
|
|
log.Printf("abs url rewritten-> '%s'='%s'", attr.Key, attr.Val)
|
|
}
|
|
|
|
// handle edge cases for special attributes
|
|
func (r *HTMLTokenURLRewriter) handleSpecialAttr(token *html.Token, attr *html.Attribute, baseURL *url.URL) {
|
|
switch {
|
|
// srcset attribute doesn't contain a single URL but a comma-separated list of URLs, each potentially followed by a space and a descriptor (like a width, pixel density, or other conditions).
|
|
case token.DataAtom == atom.Img && attr.Key == "srcset":
|
|
handleSrcSet(attr, baseURL)
|
|
case token.DataAtom == atom.Source && attr.Key == "srcset":
|
|
handleSrcSet(attr, baseURL)
|
|
// meta with http-equiv="refresh": The content attribute of a meta tag, when used for a refresh directive, contains a time interval followed by a URL, like content="5;url=http://example.com/".
|
|
case token.DataAtom == atom.Meta && attr.Key == "content" && regexp.MustCompile(`^\d+;url=`).MatchString(attr.Val):
|
|
handleMetaRefresh(attr, baseURL)
|
|
default:
|
|
break
|
|
}
|
|
}
|
|
|
|
func handleMetaRefresh(attr *html.Attribute, baseURL *url.URL) {
|
|
sec := strings.Split(attr.Val, ";url=")[0]
|
|
url := strings.Split(attr.Val, ";url=")[1]
|
|
f := &html.Attribute{Val: url, Key: "src"}
|
|
handleURLPart(f, baseURL)
|
|
attr.Val = fmt.Sprintf("%s;url=%s", sec, url)
|
|
}
|
|
|
|
func handleSrcSet(attr *html.Attribute, baseURL *url.URL) {
|
|
var srcSetBuilder strings.Builder
|
|
srcSetItems := strings.Split(attr.Val, ",")
|
|
|
|
for i, srcItem := range srcSetItems {
|
|
srcParts := strings.Fields(srcItem) // Fields splits around whitespace, trimming them
|
|
|
|
if len(srcParts) == 0 {
|
|
continue // skip empty items
|
|
}
|
|
|
|
// rewrite each URL part by passing in fake attribute
|
|
f := &html.Attribute{Val: srcParts[0], Key: "src"}
|
|
handleURLPart(f, baseURL)
|
|
urlPart := f.Key
|
|
|
|
// First srcset item without a descriptor
|
|
if i == 0 && (len(srcParts) == 1 || !strings.HasSuffix(srcParts[1], "x")) {
|
|
srcSetBuilder.WriteString(urlPart)
|
|
} else {
|
|
srcSetBuilder.WriteString(fmt.Sprintf("%s %s", urlPart, srcParts[1]))
|
|
}
|
|
|
|
if i < len(srcSetItems)-1 {
|
|
srcSetBuilder.WriteString(",") // Add comma for all but last item
|
|
}
|
|
}
|
|
|
|
attr.Val = srcSetBuilder.String()
|
|
log.Printf("srcset url rewritten-> '%s'='%s'", attr.Key, attr.Val)
|
|
}
|
|
|
|
// escape percent-encodes str with url.PathEscape and then turns every encoded
// "%2F" back into "/", so that slashes stay literal in the result.
func escape(str string) string {
	encoded := url.PathEscape(str)
	return strings.ReplaceAll(encoded, "%2F", "/")
}
|