refactor rewriters to modify html in single pass with multiple rewriters; improve html rewriter edge case handling

This commit is contained in:
Kevin Pham
2023-11-22 23:51:52 -06:00
parent 7668713b1a
commit 5d55a2f3f0
11 changed files with 639 additions and 435 deletions

View File

@@ -13,24 +13,9 @@ import (
// - `<img src='/relative_path'>` -> `<img src='/https://proxiedsite.com/relative_path'>`
// - This function is designed to allow the proxified page
// to still be browsible by routing all resource URLs through the proxy.
//
// ---
//
// - It works by replacing the io.ReadCloser of the http.Response.Body
// with another io.ReaderCloser (HTMLResourceRewriter) that wraps the first one.
//
// - This process can be done multiple times, so that the response will
// be streamed and modified through each pass without buffering the entire response in memory.
//
// - HTMLResourceRewriter reads the http.Response.Body stream,
// parsing each HTML token one at a time and replacing attribute tags.
//
// - When ProxyChain.Execute() is called, the response body will be read from the server
// and pulled through each ResponseModification which wraps the ProxyChain.Response.Body
// without ever buffering the entire HTTP response in memory.
func RewriteHTMLResourceURLs() proxychain.ResponseModification {
return func(chain *proxychain.ProxyChain) error {
// return early if it's not HTML
// don't add rewriter if it's not even html
ct := chain.Response.Header.Get("content-type")
if !strings.HasPrefix(ct, "text/html") {
return nil
@@ -40,12 +25,10 @@ func RewriteHTMLResourceURLs() proxychain.ResponseModification {
originalURI := chain.Context.Request().URI()
proxyURL := fmt.Sprintf("%s://%s", originalURI.Scheme(), originalURI.Host())
chain.Response.Body = rewriters.
NewHTMLResourceURLRewriter(
chain.Response.Body,
chain.Request.URL,
proxyURL,
)
// the rewriting actually happens in chain.Execute() as the client is streaming the response body back
rr := rewriters.NewHTMLTokenURLRewriter(chain.Request.URL, proxyURL)
// we just queue it up here
chain.AddHTMLTokenRewriter(rr)
return nil
}