package rewriters

import (
	"bytes"
	_ "embed"
	"fmt"
	"io"
	"log"
	"net/url"
	"strings"

	"golang.org/x/net/html"
)

var attributesToRewrite map[string]bool
var schemeBlacklist map[string]bool

func init() {
	// Define list of HTML attributes to try to rewrite
	attributesToRewrite = map[string]bool{
		"src":         true,
		"href":        true,
		"action":      true,
		"srcset":      true,
		"poster":      true,
		"data":        true,
		"cite":        true,
		"formaction":  true,
		"background":  true,
		"usemap":      true,
		"longdesc":    true,
		"manifest":    true,
		"archive":     true,
		"codebase":    true,
		"icon":        true,
		"pluginspage": true,
	}

	// define URI schemes to NOT rewrite
	// for example: don't rewrite "data:" or "javascript:" URIs
	schemeBlacklist = map[string]bool{
		"data":       true,
		"tel":        true,
		"mailto":     true,
		"file":       true,
		"blob":       true,
		"javascript": true,
		"about":      true,
		"magnet":     true,
		"ws":         true,
		"wss":        true,
		"ftp":        true,
	}
}

// HTMLResourceURLRewriter is a struct that rewrites URLs within HTML resources to use a specified proxy URL.
// It uses an HTML tokenizer to process HTML content and rewrites URLs in src/href attributes.
// For example: src="https://example.com/app.js" -> src="/https%3A%2F%2Fexample.com%2Fapp.js"
type HTMLResourceURLRewriter struct {
	baseURL               *url.URL
	tokenizer             *html.Tokenizer
	currentToken          html.Token
	tokenBuffer           *bytes.Buffer
	scriptContentBuffer   *bytes.Buffer
	insideScript          bool
	currentTokenIndex     int
	currentTokenProcessed bool
	proxyURL              string // ladder URL, not proxied site URL
}

// NewHTMLResourceURLRewriter creates a new instance of HTMLResourceURLRewriter.
// It initializes the tokenizer with the provided source and sets the proxy URL.
func NewHTMLResourceURLRewriter(src io.ReadCloser, baseURL *url.URL, proxyURL string) *HTMLResourceURLRewriter {
	return &HTMLResourceURLRewriter{
		tokenizer:           html.NewTokenizer(src),
		currentToken:        html.Token{},
		currentTokenIndex:   0,
		tokenBuffer:         new(bytes.Buffer),
		scriptContentBuffer: new(bytes.Buffer),
		insideScript:        false,
		baseURL:             baseURL,
		proxyURL:            proxyURL,
	}
}

// Close resets the internal state of HTMLResourceURLRewriter, clearing buffers and token data.
func (r *HTMLResourceURLRewriter) Close() error {
	r.tokenBuffer.Reset()
	r.currentToken = html.Token{}
	r.currentTokenIndex = 0
	r.currentTokenProcessed = false
	return nil
}
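// Minimal usage sketch for this type (illustrative only: the proxy handler
// wiring, the upstreamPage variable, and the response writer "w" below are
// hypothetical and not part of this package). The rewriter wraps the upstream
// response body and streams rewritten HTML to the client:
//
//	resp, err := http.Get(upstreamPage) // fetch the page being proxied
//	if err != nil {
//		return err
//	}
//	defer resp.Body.Close()
//
//	base, err := url.Parse(upstreamPage)
//	if err != nil {
//		return err
//	}
//
//	rewriter := NewHTMLResourceURLRewriter(resp.Body, base, proxyURL)
//	defer rewriter.Close()
//
//	_, err = io.Copy(w, rewriter) // w is any io.Writer, e.g. http.ResponseWriter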
// Read processes the HTML content, rewriting URLs and managing the state of tokens.
// It reads HTML content, token by token, rewriting URLs to route through the specified proxy.
func (r *HTMLResourceURLRewriter) Read(p []byte) (int, error) {
	if r.currentToken.Data == "" || r.currentTokenProcessed {
		tokenType := r.tokenizer.Next()

		// done reading html, close out reader
		if tokenType == html.ErrorToken {
			if r.tokenizer.Err() == io.EOF {
				return 0, io.EOF
			}
			return 0, r.tokenizer.Err()
		}

		// flush the current token into an internal buffer
		// to handle fragmented tokens
		r.currentToken = r.tokenizer.Token()

		// patch tokens with URLs
		isTokenWithAttribute := r.currentToken.Type == html.StartTagToken || r.currentToken.Type == html.SelfClosingTagToken
		if isTokenWithAttribute {
			patchResourceURL(&r.currentToken, r.baseURL, r.proxyURL)
		}

		r.tokenBuffer.Reset()

		// unescape script contents, not sure why tokenizer will escape things
		switch tokenType {
		case html.StartTagToken:
			if r.currentToken.Data == "script" {
				r.insideScript = true
				r.scriptContentBuffer.Reset() // Reset buffer for new script contents
			}
			r.tokenBuffer.WriteString(r.currentToken.String()) // Write the start tag
		case html.EndTagToken:
			if r.currentToken.Data == "script" {
				r.insideScript = false
				modScript := modifyInlineScript(r.scriptContentBuffer)
				r.tokenBuffer.WriteString(modScript)
			}
			r.tokenBuffer.WriteString(r.currentToken.String())
		default:
			if r.insideScript {
				r.scriptContentBuffer.WriteString(r.currentToken.String())
			} else {
				r.tokenBuffer.WriteString(r.currentToken.String())
			}
		}

		// inject additional scripts into the token stream here if needed
		// (see injectScript / injectScriptWithParams below)

		r.currentTokenProcessed = false
	}

	// drain the buffered token into p; a large token may take several reads
	n, err := r.tokenBuffer.Read(p)
	if r.tokenBuffer.Len() == 0 {
		// token fully flushed, advance the tokenizer on the next Read
		r.currentTokenProcessed = true
	}
	if err == io.EOF {
		err = nil
	}
	return n, err
}

func injectScript(tokenBuffer *bytes.Buffer, script string) {
	tokenBuffer.WriteString(
		fmt.Sprintf("\n<script>%s</script>\n", script),
	)
}

func injectScriptWithParams(tokenBuffer *bytes.Buffer, script string, params map[string]string) {
	for placeholder, value := range params {
		script = strings.ReplaceAll(script, placeholder, value)
	}
	tokenBuffer.WriteString(
		fmt.Sprintf("\n<script>%s</script>\n", script),
	)
}

// possible ad-blocking / bypassing opportunity here
func modifyInlineScript(scriptContentBuffer *bytes.Buffer) string {
	return html.UnescapeString(scriptContentBuffer.String())
}

// Root-relative URLs: These are relative to the root path and start with a "/".
func handleRootRelativePath(attr *html.Attribute, baseURL *url.URL) {
	// doublecheck this is a valid relative URL
	_, err := url.Parse(fmt.Sprintf("http://localhost.com%s", attr.Val))
	if err != nil {
		return
	}

	//log.Printf("BASEURL patch: %s\n", baseURL)
	attr.Val = fmt.Sprintf(
		"/%s://%s/%s",
		baseURL.Scheme,
		baseURL.Host,
		strings.TrimPrefix(attr.Val, "/"),
	)
	attr.Val = url.QueryEscape(attr.Val)
	attr.Val = fmt.Sprintf("/%s", attr.Val)
	log.Printf("root rel url rewritten-> '%s'='%s'", attr.Key, attr.Val)
}

// Document-relative URLs: These are relative to the current document's path and don't start with a "/".
func handleDocumentRelativePath(attr *html.Attribute, baseURL *url.URL) {
	basePath := strings.Trim(baseURL.RawPath, "/")
	if basePath != "" {
		basePath += "/"
	}
	attr.Val = fmt.Sprintf(
		"%s://%s/%s%s",
		baseURL.Scheme,
		strings.Trim(baseURL.Host, "/"),
		basePath,
		strings.Trim(attr.Val, "/"),
	)
	attr.Val = url.QueryEscape(attr.Val)
	attr.Val = fmt.Sprintf("/%s", attr.Val)
	log.Printf("doc rel url rewritten-> '%s'='%s'", attr.Key, attr.Val)
}
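// Worked example of the rewriting scheme above (illustrative; it assumes a
// page whose baseURL is https://example.com, which is not taken from this file):
//
//	// a root-relative attribute on that page
//	href="/img/logo.png"
//	// the absolute URL is rebuilt, query-escaped, and prefixed with "/",
//	// presumably so the proxy handler (not in this file) can recover it
//	// with url.QueryUnescape:
//	href="/%2Fhttps%3A%2F%2Fexample.com%2Fimg%2Flogo.png"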
// Protocol-relative URLs: These start with "//" and will use the same protocol (http or https) as the current page.
func handleProtocolRelativePath(attr *html.Attribute, baseURL *url.URL) {
	// "//example.com/x" inherits the current page's scheme, so prepend it
	// and treat the result as an absolute URL
	attr.Val = fmt.Sprintf("%s:%s", baseURL.Scheme, attr.Val)
	handleAbsolutePath(attr, baseURL)
	log.Printf("proto rel url rewritten-> '%s'='%s'", attr.Key, attr.Val)
}

func handleAbsolutePath(attr *html.Attribute, baseURL *url.URL) {
	// check if valid URL
	u, err := url.Parse(attr.Val)
	if err != nil {
		return
	}
	if !(u.Scheme == "http" || u.Scheme == "https") {
		return
	}

	attr.Val = fmt.Sprintf(
		"/%s",
		url.QueryEscape(
			strings.TrimPrefix(attr.Val, "/"),
		),
	)
	log.Printf("abs url rewritten-> '%s'='%s'", attr.Key, attr.Val)
}

func handleSrcSet(attr *html.Attribute, baseURL *url.URL) {
	// srcset is a comma-separated list of "URL [descriptor]" candidates;
	// rewrite the URL of each candidate and keep the descriptors as-is
	for i, src := range strings.Split(attr.Val, ",") {
		src = strings.Trim(src, " ")
		for j, s := range strings.Split(src, " ") {
			s = strings.Trim(s, " ")
			if j == 0 {
				f := &html.Attribute{Val: s, Key: attr.Key}
				switch {
				case strings.HasPrefix(s, "//"):
					handleProtocolRelativePath(f, baseURL)
				case strings.HasPrefix(s, "/"):
					handleRootRelativePath(f, baseURL)
				case strings.HasPrefix(s, "https://") || strings.HasPrefix(s, "http://"):
					handleAbsolutePath(f, baseURL)
				default:
					handleDocumentRelativePath(f, baseURL)
				}
				s = f.Val
			}

			if i == 0 && j == 0 {
				attr.Val = s
				continue
			}
			attr.Val = fmt.Sprintf("%s %s", attr.Val, s)
		}
		attr.Val = fmt.Sprintf("%s,", attr.Val)
	}
	attr.Val = strings.TrimSuffix(attr.Val, ",")
	log.Printf("srcset url rewritten-> '%s'='%s'", attr.Key, attr.Val)
}

func isBlacklistedScheme(rawURL string) bool {
	scheme, _, found := strings.Cut(rawURL, ":")
	if !found {
		return false
	}
	return schemeBlacklist[scheme]
}

func patchResourceURL(token *html.Token, baseURL *url.URL, proxyURL string) {
	for i := range token.Attr {
		attr := &token.Attr[i]
		switch {
		// don't touch attributes except for the ones we defined
		case !attributesToRewrite[attr.Key]:
			continue
		// don't rewrite special URIs that don't make proxyable network requests
		case isBlacklistedScheme(attr.Val):
			continue
		// don't double-rewrite the url
		case strings.HasPrefix(attr.Val, proxyURL):
			continue
		case attr.Key == "srcset":
			handleSrcSet(attr, baseURL)
			continue
		case strings.HasPrefix(attr.Val, "//"):
			handleProtocolRelativePath(attr, baseURL)
			continue
		case strings.HasPrefix(attr.Val, "/"):
			handleRootRelativePath(attr, baseURL)
			continue
		case strings.HasPrefix(attr.Val, "https://") || strings.HasPrefix(attr.Val, "http://"):
			handleAbsolutePath(attr, baseURL)
			continue
		default:
			handleDocumentRelativePath(attr, baseURL)
			continue
		}
	}
}
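// exampleRewrite is a small, self-contained sketch showing how the rewriter
// transforms a fragment of HTML end to end. It is illustrative only and not
// used by the package; the HTML fragment, the base URL, and the proxy URL
// below are made up for this example.
func exampleRewrite() {
	page := `<img src="/img/logo.png" srcset="/a.jpg 1x, https://cdn.example.org/b.jpg 2x">`
	base, _ := url.Parse("https://example.com")

	rewriter := NewHTMLResourceURLRewriter(io.NopCloser(strings.NewReader(page)), base, "https://ladder.example.net")
	defer rewriter.Close()

	var out bytes.Buffer
	if _, err := io.Copy(&out, rewriter); err != nil {
		log.Println(err)
		return
	}

	// src becomes   "/%2Fhttps%3A%2F%2Fexample.com%2Fimg%2Flogo.png"
	// srcset becomes "/%2Fhttps%3A%2F%2Fexample.com%2Fa.jpg 1x, /https%3A%2F%2Fcdn.example.org%2Fb.jpg 2x"
	fmt.Println(out.String())
}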