rewrite resource URLs based on html tokenizer instead of regex

This commit is contained in:
Kevin Pham
2023-11-20 11:38:53 -06:00
parent 5035f65d6b
commit 1d88f14de2
3 changed files with 121 additions and 61 deletions

View File

@@ -4,6 +4,7 @@ import (
"bytes"
"io"
"ladder/proxychain"
"log"
"net/url"
"strings"
@@ -11,67 +12,94 @@ import (
)
// HTMLResourceURLRewriter is a reader over an HTML stream that re-emits the
// document one tokenizer token at a time (see Read).
//
// NOTE(review): this listing interleaves removed and added lines of a diff, so
// proxyURL is declared twice and src/buffer may belong only to the older
// version — confirm against the checked-out file.
type HTMLResourceURLRewriter struct {
src io.Reader // source stream handed to the constructor
buffer *bytes.Buffer // buffer to temporarily hold rewritten output for the reader
proxyURL *url.URL // proxyURL is the URL of the proxy, not the upstream URL
proxyURL *url.URL // proxyURL is the URL of the proxy, not the upstream URL; TODO: implement
tokenizer *html.Tokenizer // tokenizer created over the source stream by the constructor
currentToken html.Token // most recent token taken from the tokenizer
tokenBuffer *bytes.Buffer // serialized bytes of currentToken not yet returned by Read
currentTokenIndex int // reset to 0 for every new token; not read anywhere in the code shown
currentTokenProcessed bool // set once tokenBuffer has been fully drained by Read
}
// NewHTMLResourceURLRewriter returns a rewriter whose tokenizer reads HTML from
// src and which remembers proxyURL. It logs "tokenize" on every call.
//
// NOTE(review): two signature lines (io.Reader vs io.ReadCloser) and duplicate
// literal keys appear below because removed and added diff lines are
// interleaved — confirm which set is live.
func NewHTMLResourceURLRewriter(src io.Reader, proxyURL *url.URL) *HTMLResourceURLRewriter {
func NewHTMLResourceURLRewriter(src io.ReadCloser, proxyURL *url.URL) *HTMLResourceURLRewriter {
log.Println("tokenize")
return &HTMLResourceURLRewriter{
src: src,
buffer: new(bytes.Buffer),
proxyURL: proxyURL,
tokenizer: html.NewTokenizer(src), // tokenizer consumes src directly
currentToken: html.Token{}, // zero token: Read treats empty Data as "nothing fetched yet"
currentTokenIndex: 0,
tokenBuffer: new(bytes.Buffer),
proxyURL: proxyURL,
}
}
// rewriteToken edits token's attributes in place: every href, src, action or
// srcset value that starts with "/" is prefixed with "/https://" followed by
// baseURL.Host.
//
// NOTE(review): this looks like the pre-change version from the diff — its
// closing brace is absent from this listing and a second definition of
// rewriteToken appears further down; confirm which one is live.
func rewriteToken(token *html.Token, baseURL *url.URL) {
// set of attribute names whose values are treated as URLs
attrsToRewrite := map[string]bool{"href": true, "src": true, "action": true, "srcset": true}
for i := range token.Attr {
// take a pointer into the slice so the assignment below modifies the token itself
attr := &token.Attr[i]
if attrsToRewrite[attr.Key] && strings.HasPrefix(attr.Val, "/") {
// Make URL absolute
attr.Val = "/https://" + baseURL.Host + attr.Val
}
}
// Close resets the rewriter's per-token state and closes the underlying source
// when that source implements io.Closer.
//
// The rewriter is installed in place of an http.Response.Body, which makes it
// the only handle callers have on the upstream body; forwarding Close is what
// lets that body (and its connection) be released.
//
// It returns the error from closing the source, or nil when the source is not
// an io.Closer.
func (r *HTMLResourceURLRewriter) Close() error {
	r.tokenBuffer.Reset()
	r.currentToken = html.Token{}
	r.currentTokenIndex = 0
	r.currentTokenProcessed = false

	// src is declared as io.Reader, so probe for a Closer instead of assuming one.
	if closer, ok := r.src.(io.Closer); ok {
		return closer.Close()
	}
	return nil
}
// Read implements io.Reader. It takes HTML tokens from the tokenizer one at a
// time, serializes each with Token.String into an internal buffer, and copies
// as much of that buffer as fits into p; any remainder is returned by later
// calls before the next token is fetched.
//
// It returns io.EOF when the tokenizer reports the end of the document, or the
// tokenizer's error otherwise.
//
// NOTE(review): this listing interleaves removed and added diff lines — the
// statements using r.buffer and a local tokenizer, and those using r.tokenizer
// and r.tokenBuffer, look like two versions of the same logic (and the braces
// do not balance as shown); confirm which set is live.
// NOTE(review): Token.String re-serializes the token rather than echoing the
// raw input bytes, so the output may differ from the source document (e.g.
// escaping inside script/style text) — confirm this is acceptable.
func (r *HTMLResourceURLRewriter) Read(p []byte) (int, error) {
// serve leftover bytes from a previous call first
if r.buffer.Len() != 0 {
return r.buffer.Read(p)
}
tokenizer := html.NewTokenizer(r.src)
for {
tokenType := tokenizer.Next()
// fetch a new token only when nothing is pending: either no token has been
// stored yet (empty Data) or the previous one was handed out completely
if r.currentToken.Data == "" || r.currentTokenProcessed {
tokenType := r.tokenizer.Next()
// done reading html, close out reader
if tokenType == html.ErrorToken {
err := tokenizer.Err()
if err == io.EOF {
return 0, io.EOF // End of document
if r.tokenizer.Err() == io.EOF {
return 0, io.EOF
}
return 0, err // Actual error
return 0, r.tokenizer.Err()
}
token := tokenizer.Token()
// only start and self-closing tags are passed to rewriteToken
if tokenType == html.StartTagToken || tokenType == html.SelfClosingTagToken {
rewriteToken(&token, r.proxyURL)
}
r.buffer.WriteString(token.String())
if r.buffer.Len() > 0 {
break
}
// flush the current token into an internal buffer
// to handle fragmented tokens
r.currentToken = r.tokenizer.Token()
r.tokenBuffer.Reset()
r.tokenBuffer.WriteString(r.currentToken.String())
r.currentTokenProcessed = false
r.currentTokenIndex = 0
}
return r.buffer.Read(p)
// copy out as much of the serialized token as p can hold
n, err := r.tokenBuffer.Read(p)
// an empty token buffer means the token was fully delivered; the next call fetches a new one
if err == io.EOF || r.tokenBuffer.Len() == 0 {
r.currentTokenProcessed = true
err = nil // Reset error to nil because EOF in this context is expected and not an actual error
}
return n, err
}
// RewriteHTMLResourceURLs updates src/href attributes in HTML content to route through the proxy.
//
// It returns a ResponseModification that inspects the response's content-type
// header; for HTML responses it replaces chain.Response.Body with an
// HTMLResourceURLRewriter wrapping the original body, and for anything else it
// leaves the response untouched. The modification itself always returns nil.
//
// NOTE(review): both an exact-match and a prefix-match content-type check
// appear below because removed and added diff lines are interleaved (the
// braces do not balance as shown) — confirm which check is live.
func RewriteHTMLResourceURLs() proxychain.ResponseModification {
return func(chain *proxychain.ProxyChain) error {
log.Println("rhru")
ct := chain.Response.Header.Get("content-type")
if ct != "text/html" {
log.Println(ct)
// prefix match, so header values that carry parameters after the media type still qualify
if !strings.HasPrefix(ct, "text/html") {
return nil
}
// TODO: implement a chaining method for body rewriters
// so we can compose multiple body rewriters together
log.Println("rhru2")
// chain.Response.Body is an unread http.Response.Body
chain.Response.Body = NewHTMLResourceURLRewriter(chain.Response.Body, chain.Request.URL)
return nil
}
}
// rewriteToken rewrites the URL-bearing attributes of token (href, src, action
// and srcset) in place so that the referenced resources are requested through
// the proxy as "/<absolute upstream URL>".
//
// baseURL is the URL of the document being rewritten; relative and
// protocol-relative references are resolved against it. When baseURL is nil,
// only values that are already absolute http(s) URLs are rewritten.
func rewriteToken(token *html.Token, baseURL *url.URL) {
	for i := range token.Attr {
		// Take a pointer into the slice so the edit lands on the token itself.
		attr := &token.Attr[i]
		switch attr.Key {
		case "href", "src", "action":
			attr.Val = proxiedResourceURL(attr.Val, baseURL)
		case "srcset":
			attr.Val = proxiedSrcset(attr.Val, baseURL)
		}
	}
}

// proxiedResourceURL returns raw rewritten to "/<absolute URL>" after resolving
// it against baseURL. Values that are empty, fragment-only, unparsable, or that
// do not resolve to an http(s) URL (data:, javascript:, mailto:, ...) are
// returned unchanged, because routing them through the proxy would break them.
func proxiedResourceURL(raw string, baseURL *url.URL) string {
	trimmed := strings.TrimSpace(raw)
	if trimmed == "" || strings.HasPrefix(trimmed, "#") {
		return raw
	}
	ref, err := url.Parse(trimmed)
	if err != nil {
		return raw
	}
	if baseURL != nil {
		// Simply prepending "/" would turn "/a.js" into "//a.js", which browsers
		// read as a protocol-relative URL on host "a.js"; resolve it instead.
		ref = baseURL.ResolveReference(ref)
	}
	if ref.Scheme != "http" && ref.Scheme != "https" {
		return raw
	}
	return "/" + ref.String()
}

// proxiedSrcset rewrites every image candidate URL in a srcset attribute value
// and keeps each candidate's descriptor ("2x", "480w", ...) intact. A srcset
// without any candidate is returned unchanged.
func proxiedSrcset(srcset string, baseURL *url.URL) string {
	const (
		whitespace = " \t\n\f\r"
		separators = whitespace + ","
	)

	var candidates []string
	rest := srcset
	for {
		rest = strings.TrimLeft(rest, separators)
		if rest == "" {
			break
		}

		// A candidate URL runs up to the first whitespace; commas inside it
		// (as in data: URLs) are part of the URL, not separators.
		end := strings.IndexAny(rest, whitespace)
		if end < 0 {
			end = len(rest)
		}
		rawURL := rest[:end]
		rest = rest[end:]

		descriptor := ""
		if bare := strings.TrimRight(rawURL, ","); bare != rawURL {
			// A trailing comma ends the candidate: it has no descriptor.
			rawURL = bare
		} else if comma := strings.IndexByte(rest, ','); comma >= 0 {
			descriptor = strings.TrimSpace(rest[:comma])
			rest = rest[comma+1:]
		} else {
			descriptor = strings.TrimSpace(rest)
			rest = ""
		}

		candidate := proxiedResourceURL(rawURL, baseURL)
		if descriptor != "" {
			candidate += " " + descriptor
		}
		candidates = append(candidates, candidate)
	}

	if len(candidates) == 0 {
		return srcset
	}
	return strings.Join(candidates, ", ")
}