refactor rewriters to modify html in single pass with multiple rewriters; improve html rewriter edge case handling
This commit is contained in:
131
proxychain/responsemodifers/rewriters/html_rewriter.go
Normal file
131
proxychain/responsemodifers/rewriters/html_rewriter.go
Normal file
@@ -0,0 +1,131 @@
|
||||
package rewriters
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"io"
|
||||
|
||||
"golang.org/x/net/html"
|
||||
)
|
||||
|
||||
// IHTMLTokenRewriter defines an interface for modifying HTML tokens.
|
||||
type IHTMLTokenRewriter interface {
|
||||
// ShouldModify determines whether a given HTML token requires modification.
|
||||
ShouldModify(*html.Token) bool
|
||||
|
||||
// ModifyToken applies modifications to a given HTML token.
|
||||
// It returns strings representing content to be prepended and
|
||||
// appended to the token. If no modifications are required or if an error occurs,
|
||||
// it returns empty strings for both 'prepend' and 'append'.
|
||||
// Note: The original token is not modified if an error occurs.
|
||||
ModifyToken(*html.Token) (prepend, append string)
|
||||
}
|
||||
|
||||
// HTMLRewriter is a struct that can take multiple TokenHandlers and process all
|
||||
// HTML tokens from http.Response.Body in a single pass, making changes and returning a new io.ReadCloser
|
||||
//
|
||||
// - HTMLRewriter reads the http.Response.Body stream,
|
||||
// parsing each HTML token one at a time and making modifications (defined by implementations of IHTMLTokenRewriter)
|
||||
// in a single pass of the tokenizer.
|
||||
//
|
||||
// - When ProxyChain.Execute() is called, the response body will be read from the server
|
||||
// and pulled through each ResponseModification which wraps the ProxyChain.Response.Body
|
||||
// without ever buffering the entire HTTP response in memory.
|
||||
type HTMLRewriter struct {
|
||||
tokenizer *html.Tokenizer
|
||||
currentToken *html.Token
|
||||
tokenBuffer *bytes.Buffer
|
||||
currentTokenProcessed bool
|
||||
rewriters []IHTMLTokenRewriter
|
||||
}
|
||||
|
||||
// NewHTMLRewriter creates a new HTMLRewriter instance.
|
||||
// It processes HTML tokens from an io.ReadCloser source (typically http.Response.Body)
|
||||
// using a series of HTMLTokenRewriters. Each HTMLTokenRewriter in the 'rewriters' slice
|
||||
// applies its specific modifications to the HTML tokens.
|
||||
// The HTMLRewriter reads from the provided 'src', applies the modifications,
|
||||
// and returns the processed content as a new io.ReadCloser.
|
||||
// This new io.ReadCloser can be used to stream the modified content back to the client.
|
||||
//
|
||||
// Parameters:
|
||||
// - src: An io.ReadCloser representing the source of the HTML content, such as http.Response.Body.
|
||||
// - rewriters: A slice of HTMLTokenRewriters that define the modifications to be applied to the HTML tokens.
|
||||
//
|
||||
// Returns:
|
||||
// - A pointer to an HTMLRewriter, which implements io.ReadCloser, containing the modified HTML content.
|
||||
func NewHTMLRewriter(src io.ReadCloser, rewriters []IHTMLTokenRewriter) *HTMLRewriter {
|
||||
return &HTMLRewriter{
|
||||
tokenizer: html.NewTokenizer(src),
|
||||
currentToken: nil,
|
||||
tokenBuffer: new(bytes.Buffer),
|
||||
currentTokenProcessed: false,
|
||||
rewriters: rewriters,
|
||||
}
|
||||
}
|
||||
|
||||
// Close resets the internal state of HTMLRewriter, clearing buffers and token data.
|
||||
func (r *HTMLRewriter) Close() error {
|
||||
r.tokenBuffer.Reset()
|
||||
r.currentToken = nil
|
||||
r.currentTokenProcessed = false
|
||||
return nil
|
||||
}
|
||||
|
||||
// Read processes the HTML content, rewriting URLs and managing the state of tokens.
|
||||
func (r *HTMLRewriter) Read(p []byte) (int, error) {
|
||||
|
||||
if r.currentToken == nil || r.currentToken.Data == "" || r.currentTokenProcessed {
|
||||
tokenType := r.tokenizer.Next()
|
||||
|
||||
// done reading html, close out reader
|
||||
if tokenType == html.ErrorToken {
|
||||
if r.tokenizer.Err() == io.EOF {
|
||||
return 0, io.EOF
|
||||
}
|
||||
return 0, r.tokenizer.Err()
|
||||
}
|
||||
|
||||
// get the next token; reset buffer
|
||||
t := r.tokenizer.Token()
|
||||
r.currentToken = &t
|
||||
r.tokenBuffer.Reset()
|
||||
|
||||
// buffer += "<prepends> <token> <appends>"
|
||||
// process token through all registered rewriters
|
||||
// rewriters will modify the token, and optionally
|
||||
// return a <prepend> or <append> string token
|
||||
appends := make([]string, 0, len(r.rewriters))
|
||||
for _, rewriter := range r.rewriters {
|
||||
if !rewriter.ShouldModify(r.currentToken) {
|
||||
continue
|
||||
}
|
||||
prepend, a := rewriter.ModifyToken(r.currentToken)
|
||||
appends = append(appends, a)
|
||||
// add <prepends> to buffer
|
||||
r.tokenBuffer.WriteString(prepend)
|
||||
}
|
||||
|
||||
// add <token> to buffer
|
||||
if tokenType == html.TextToken {
|
||||
// don't unescape textTokens (such as inline scripts).
|
||||
// Token.String() by default will escape the inputs, but
|
||||
// we don't want to modify the original source
|
||||
r.tokenBuffer.WriteString(r.currentToken.Data)
|
||||
} else {
|
||||
r.tokenBuffer.WriteString(r.currentToken.String())
|
||||
}
|
||||
|
||||
// add <appends> to buffer
|
||||
for _, a := range appends {
|
||||
r.tokenBuffer.WriteString(a)
|
||||
}
|
||||
|
||||
r.currentTokenProcessed = false
|
||||
}
|
||||
|
||||
n, err := r.tokenBuffer.Read(p)
|
||||
if err == io.EOF || r.tokenBuffer.Len() == 0 {
|
||||
r.currentTokenProcessed = true
|
||||
err = nil // EOF in this context is expected and not an actual error
|
||||
}
|
||||
return n, err
|
||||
}
|
||||
Reference in New Issue
Block a user