refactor rewriters to modify html in single pass with multiple rewriters; improve html rewriter edge case handling

2023-11-22 23:51:52 -06:00
parent 7668713b1a
commit 5d55a2f3f0
11 changed files with 639 additions and 435 deletions
--- a/proxychain/responsemodifers/rewriters/html_rewriter.go
+++ b/proxychain/responsemodifers/rewriters/html_rewriter.go
@@ -0,0 +1,131 @@
+package rewriters
+
+import (
+	"bytes"
+	"io"
+
+	"golang.org/x/net/html"
+)
+
+// IHTMLTokenRewriter is the contract implemented by each per-token rewriting
+// step that HTMLRewriter runs while streaming an HTML document.
+type IHTMLTokenRewriter interface {
+	// ShouldModify reports whether this rewriter wants to act on token.
+	// HTMLRewriter only calls ModifyToken for tokens where this returns true.
+	ShouldModify(token *html.Token) bool
+
+	// ModifyToken edits token in place and returns extra content to emit
+	// immediately before (prepend) and immediately after (append) the
+	// serialized token.
+	//
+	// Implementations return two empty strings when there is nothing to add
+	// or when an error occurs; in the error case the token must be left
+	// unmodified.
+	ModifyToken(token *html.Token) (prepend, append string)
+}
+
+// HTMLRewriter is an io.ReadCloser that runs a list of IHTMLTokenRewriters
+// over an HTML stream (typically http.Response.Body) in a single tokenizer pass.
+//
+//   - The source is tokenized lazily as Read is called; only the serialized
+//     output of the current token (plus any prepended/appended content) is
+//     held in memory, never the whole document.
+//
+//   - Each token is offered to every registered rewriter, in slice order,
+//     before it is written to the output.
+//
+//   - Close releases the underlying source, so replacing a response body with
+//     an HTMLRewriter does not leak the original body.
+type HTMLRewriter struct {
+	// src is the wrapped source; it is retained only so Close can release it.
+	src io.Closer
+
+	// tokenizer reads HTML tokens from src.
+	tokenizer *html.Tokenizer
+
+	// currentToken is the token whose output is currently held in tokenBuffer.
+	currentToken *html.Token
+
+	// tokenBuffer holds "<prepends><token><appends>" for currentToken until
+	// the caller has read all of it.
+	tokenBuffer *bytes.Buffer
+
+	// currentTokenProcessed is true once tokenBuffer has been fully drained.
+	currentTokenProcessed bool
+
+	// rewriters are applied to every token, in order.
+	rewriters []IHTMLTokenRewriter
+}
+
+// NewHTMLRewriter creates an HTMLRewriter that reads HTML from src and applies
+// every rewriter in 'rewriters' to each token.
+//
+// Parameters:
+//   - src: the source of the HTML content, such as http.Response.Body. The
+//     returned HTMLRewriter takes ownership of src: closing the rewriter closes src.
+//   - rewriters: the token rewriters to apply, in order, to every HTML token.
+//
+// Returns:
+//   - A pointer to an HTMLRewriter, which implements io.ReadCloser and streams
+//     the modified HTML content.
+func NewHTMLRewriter(src io.ReadCloser, rewriters []IHTMLTokenRewriter) *HTMLRewriter {
+	return &HTMLRewriter{
+		src:                   src,
+		tokenizer:             html.NewTokenizer(src),
+		currentToken:          nil,
+		tokenBuffer:           new(bytes.Buffer),
+		currentTokenProcessed: false,
+		rewriters:             rewriters,
+	}
+}
+
+// Close resets the internal state of HTMLRewriter (buffer and token data) and
+// closes the underlying source so its resources are released.
+//
+// The source is closed at most once; further calls to Close only reset state
+// and return nil. The returned error is whatever the source's Close reports.
+func (r *HTMLRewriter) Close() error {
+	r.tokenBuffer.Reset()
+	r.currentToken = nil
+	r.currentTokenProcessed = false
+
+	if r.src == nil {
+		return nil
+	}
+	// Drop the reference first so a repeated Close cannot double-close src.
+	src := r.src
+	r.src = nil
+	return src.Close()
+}
+
+// Read processes the HTML content, rewriting URLs and managing the state of tokens.
+func (r *HTMLRewriter) Read(p []byte) (int, error) {
+
+	if r.currentToken == nil || r.currentToken.Data == "" || r.currentTokenProcessed {
+		tokenType := r.tokenizer.Next()
+
+		// done reading html, close out reader
+		if tokenType == html.ErrorToken {
+			if r.tokenizer.Err() == io.EOF {
+				return 0, io.EOF
+			}
+			return 0, r.tokenizer.Err()
+		}
+
+		// get the next token; reset buffer
+		t := r.tokenizer.Token()
+		r.currentToken = &t
+		r.tokenBuffer.Reset()
+
+		// buffer += "<prepends> <token> <appends>"
+		// process token through all registered rewriters
+		// rewriters will modify the token, and optionally
+		// return a <prepend> or <append> string token
+		appends := make([]string, 0, len(r.rewriters))
+		for _, rewriter := range r.rewriters {
+			if !rewriter.ShouldModify(r.currentToken) {
+				continue
+			}
+			prepend, a := rewriter.ModifyToken(r.currentToken)
+			appends = append(appends, a)
+			// add <prepends> to buffer
+			r.tokenBuffer.WriteString(prepend)
+		}
+
+		// add <token> to buffer
+		if tokenType == html.TextToken {
+			// don't unescape textTokens (such as inline scripts).
+			// Token.String() by default will escape the inputs, but
+			// we don't want to modify the original source
+			r.tokenBuffer.WriteString(r.currentToken.Data)
+		} else {
+			r.tokenBuffer.WriteString(r.currentToken.String())
+		}
+
+		// add <appends> to buffer
+		for _, a := range appends {
+			r.tokenBuffer.WriteString(a)
+		}
+
+		r.currentTokenProcessed = false
+	}
+
+	n, err := r.tokenBuffer.Read(p)
+	if err == io.EOF || r.tokenBuffer.Len() == 0 {
+		r.currentTokenProcessed = true
+		err = nil // EOF in this context is expected and not an actual error
+	}
+	return n, err
+}