hadrian/proxychain/responsemodifers/rewriters/html_resource_url_rewriter.go

package rewriters

import (
	"bytes"
	_ "embed"
	"fmt"
	"io"
	"log"
	"net/url"
	"strings"

	"golang.org/x/net/html"
)

var attributesToRewrite map[string]bool
var schemeBlacklist map[string]bool

func init() {
	// Define list of HTML attributes to try to rewrite
	attributesToRewrite = map[string]bool{
		"src":         true,
		"href":        true,
		"action":      true,
		"srcset":      true,
		"poster":      true,
		"data":        true,
		"cite":        true,
		"formaction":  true,
		"background":  true,
		"usemap":      true,
		"longdesc":    true,
		"manifest":    true,
		"archive":     true,
		"codebase":    true,
		"icon":        true,
		"pluginspage": true,
	}

	// define URIs to NOT rewrite
	// for example: don't overwrite <img src="data:image/png;base64;iVBORw...">"
	schemeBlacklist = map[string]bool{
		"data":       true,
		"tel":        true,
		"mailto":     true,
		"file":       true,
		"blob":       true,
		"javascript": true,
		"about":      true,
		"magnet":     true,
		"ws":         true,
		"wss":        true,
		"ftp":        true,
	}
}

// HTMLResourceURLRewriter is a struct that rewrites URLs within HTML resources to use a specified proxy URL.
// It uses an HTML tokenizer to process HTML content and rewrites URLs in src/href attributes.
// <img src='/relative_path'> -> <img src='/https://proxiedsite.com/relative_path'>
type HTMLResourceURLRewriter struct {
	baseURL               *url.URL
	tokenizer             *html.Tokenizer
	currentToken          html.Token
	tokenBuffer           *bytes.Buffer
	scriptContentBuffer   *bytes.Buffer
	insideScript          bool
	currentTokenIndex     int
	currentTokenProcessed bool
	proxyURL              string // ladder URL, not proxied site URL
}

// NewHTMLResourceURLRewriter creates a new instance of HTMLResourceURLRewriter.
// It initializes the tokenizer with the provided source and sets the proxy URL.
func NewHTMLResourceURLRewriter(src io.ReadCloser, baseURL *url.URL, proxyURL string) *HTMLResourceURLRewriter {
	return &HTMLResourceURLRewriter{
		tokenizer:           html.NewTokenizer(src),
		currentToken:        html.Token{},
		currentTokenIndex:   0,
		tokenBuffer:         new(bytes.Buffer),
		scriptContentBuffer: new(bytes.Buffer),
		insideScript:        false,
		baseURL:             baseURL,
		proxyURL:            proxyURL,
	}
}

// Close resets the internal state of HTMLResourceURLRewriter, clearing buffers and token data.
func (r *HTMLResourceURLRewriter) Close() error {
	r.tokenBuffer.Reset()
	r.currentToken = html.Token{}
	r.currentTokenIndex = 0
	r.currentTokenProcessed = false
	return nil
}

// Read processes the HTML content, rewriting URLs and managing the state of tokens.
// It reads HTML content, token by token, rewriting URLs to route through the specified proxy.
func (r *HTMLResourceURLRewriter) Read(p []byte) (int, error) {

	if r.currentToken.Data == "" || r.currentTokenProcessed {
		tokenType := r.tokenizer.Next()

		// done reading html, close out reader
		if tokenType == html.ErrorToken {
			if r.tokenizer.Err() == io.EOF {
				return 0, io.EOF
			}
			return 0, r.tokenizer.Err()
		}

		// flush the current token into an internal buffer
		// to handle fragmented tokens
		r.currentToken = r.tokenizer.Token()

		// patch tokens with URLs
		isTokenWithAttribute := r.currentToken.Type == html.StartTagToken || r.currentToken.Type == html.SelfClosingTagToken
		if isTokenWithAttribute {
			patchResourceURL(&r.currentToken, r.baseURL, r.proxyURL)
		}

		r.tokenBuffer.Reset()

		// unescape script contents, not sure why tokenizer will escape things
		switch tokenType {
		case html.StartTagToken:
			if r.currentToken.Data == "script" {
				r.insideScript = true
				r.scriptContentBuffer.Reset() // Reset buffer for new script contents
			}
			r.tokenBuffer.WriteString(r.currentToken.String()) // Write the start tag
		case html.EndTagToken:
			if r.currentToken.Data == "script" {
				r.insideScript = false
				modScript := modifyInlineScript(r.scriptContentBuffer)
				r.tokenBuffer.WriteString(modScript)
			}
			r.tokenBuffer.WriteString(r.currentToken.String())
		default:
			if r.insideScript {
				r.scriptContentBuffer.WriteString(r.currentToken.String())
			} else {
				r.tokenBuffer.WriteString(r.currentToken.String())
			}
		}

		// inject <script> right after <head>
		isHeadToken := (r.currentToken.Type == html.StartTagToken || r.currentToken.Type == html.SelfClosingTagToken) && r.currentToken.Data == "head"
		if isHeadToken {
			params := map[string]string{
				"R_PROXYURL": r.proxyURL,
				"R_BASEURL":  fmt.Sprintf("%s://%s", r.baseURL.Scheme, r.baseURL.Host),
			}
			injectScriptWithParams(r.tokenBuffer, rewriteJSResourceUrlsScript, params)
		}

		r.currentTokenProcessed = false
		r.currentTokenIndex = 0
	}

	n, err := r.tokenBuffer.Read(p)
	if err == io.EOF || r.tokenBuffer.Len() == 0 {
		r.currentTokenProcessed = true
		err = nil // EOF in this context is expected and not an actual error
	}
	return n, err
}

// fetch("/relative_script.js") -> fetch("http://localhost:8080/relative_script.js")
//
//go:embed js_resource_url_rewriter.js
var rewriteJSResourceUrlsScript string

func injectScript(tokenBuffer *bytes.Buffer, script string) {
	tokenBuffer.WriteString(
		fmt.Sprintf("\n<script>\n%s\n</script>\n", script),
	)
}

func injectScriptWithParams(tokenBuffer *bytes.Buffer, script string, params map[string]string) {
	for old, new := range params {
		script = strings.ReplaceAll(script, old, new)
	}
	tokenBuffer.WriteString(
		fmt.Sprintf("\n<script>\n%s\n</script>\n", script),
	)
}

// possible ad-blocking / bypassing opportunity here
func modifyInlineScript(scriptContentBuffer *bytes.Buffer) string {
	return html.UnescapeString(scriptContentBuffer.String())
}

// Root-relative URLs: These are relative to the root path and start with a "/".
func handleRootRelativePath(attr *html.Attribute, baseURL *url.URL) {
	// doublecheck this is a valid relative URL
	log.Printf("PROCESSING: key: %s val: %s\n", attr.Key, attr.Val)
	_, err := url.Parse(fmt.Sprintf("http://localhost.com%s", attr.Val))
	if err != nil {
		log.Println(err)
		return
	}

	//log.Printf("BASEURL patch:  %s\n", baseURL)

	attr.Val = fmt.Sprintf(
		"/%s://%s/%s",
		baseURL.Scheme,
		baseURL.Host,
		strings.TrimPrefix(attr.Val, "/"),
	)
	attr.Val = url.QueryEscape(attr.Val)
	attr.Val = fmt.Sprintf("/%s", attr.Val)

	log.Printf("root rel url rewritten-> '%s'='%s'", attr.Key, attr.Val)
}

// Document-relative URLs: These are relative to the current document's path and don't start with a "/".
func handleDocumentRelativePath(attr *html.Attribute, baseURL *url.URL) {
	attr.Val = fmt.Sprintf(
		"%s://%s/%s%s",
		baseURL.Scheme,
		strings.Trim(baseURL.Host, "/"),
		strings.Trim(baseURL.RawPath, "/"),
		strings.Trim(attr.Val, "/"),
	)
	attr.Val = url.QueryEscape(attr.Val)
	attr.Val = fmt.Sprintf("/%s", attr.Val)
	log.Printf("doc rel url rewritten-> '%s'='%s'", attr.Key, attr.Val)
}

// Protocol-relative URLs: These start with "//" and will use the same protocol (http or https) as the current page.
func handleProtocolRelativePath(attr *html.Attribute, baseURL *url.URL) {
	attr.Val = strings.TrimPrefix(attr.Val, "/")
	handleRootRelativePath(attr, baseURL)
	log.Printf("proto rel url rewritten-> '%s'='%s'", attr.Key, attr.Val)
}

func handleAbsolutePath(attr *html.Attribute, baseURL *url.URL) {
	// check if valid URL
	u, err := url.Parse(attr.Val)
	if err != nil {
		return
	}
	if !(u.Scheme == "http" || u.Scheme == "https") {
		return
	}
	attr.Val = fmt.Sprintf(
		"/%s",
		url.QueryEscape(
			strings.TrimPrefix(attr.Val, "/"),
		),
	)
	log.Printf("abs url rewritten-> '%s'='%s'", attr.Key, attr.Val)
}

func handleSrcSet(attr *html.Attribute, baseURL *url.URL) {
	var srcSetBuilder strings.Builder
	srcSetItems := strings.Split(attr.Val, ",")

	for i, srcItem := range srcSetItems {
		srcParts := strings.Fields(srcItem) // Fields splits around whitespace, trimming them

		if len(srcParts) == 0 {
			continue // skip empty items
		}

		// Process URL part
		urlPart := processURLPart(srcParts[0], baseURL)

		// First srcset item without a descriptor
		if i == 0 && (len(srcParts) == 1 || !strings.HasSuffix(srcParts[1], "x")) {
			srcSetBuilder.WriteString(urlPart)
		} else {
			srcSetBuilder.WriteString(fmt.Sprintf("%s %s", urlPart, srcParts[1]))
		}

		if i < len(srcSetItems)-1 {
			srcSetBuilder.WriteString(",") // Add comma for all but last item
		}
	}

	attr.Val = srcSetBuilder.String()
	log.Printf("srcset url rewritten-> '%s'='%s'", attr.Key, attr.Val)
}

// only for srcset
func processURLPart(urlPart string, baseURL *url.URL) string {
	f := &html.Attribute{Val: urlPart, Key: "src"}

	switch {
	case strings.HasPrefix(urlPart, "//"):
		handleProtocolRelativePath(f, baseURL)
	case strings.HasPrefix(urlPart, "/"):
		handleRootRelativePath(f, baseURL)
	case strings.HasPrefix(urlPart, "https://"), strings.HasPrefix(urlPart, "http://"):
		handleAbsolutePath(f, baseURL)
	default:
		handleDocumentRelativePath(f, baseURL)
	}

	return f.Val
}

func isBlackedlistedScheme(url string) bool {
	spl := strings.Split(url, ":")
	if len(spl) == 0 {
		return false
	}
	scheme := spl[0]
	return schemeBlacklist[scheme]
}

func patchResourceURL(token *html.Token, baseURL *url.URL, proxyURL string) {
	for i := range token.Attr {
		attr := &token.Attr[i]

		switch {
		// don't touch attributes except for the ones we defined
		case !attributesToRewrite[attr.Key]:
			continue
		// don't rewrite special URIs that don't make network requests
		case isBlackedlistedScheme(attr.Val):
			continue
		// don't double-overwrite the url
		case strings.HasPrefix(attr.Val, proxyURL):
			continue
		case attr.Key == "srcset":
			handleSrcSet(attr, baseURL)
			continue
		case strings.HasPrefix(attr.Val, "//"):
			handleProtocolRelativePath(attr, baseURL)
			continue
		case strings.HasPrefix(attr.Val, "/"):
			handleRootRelativePath(attr, baseURL)
			continue
		case strings.HasPrefix(attr.Val, "https://") || strings.HasPrefix(attr.Val, "http://"):
			handleAbsolutePath(attr, baseURL)
			continue
		default:
			handleDocumentRelativePath(attr, baseURL)
			continue
		}

	}
}