package responsemodifers
import (
"bytes"
"fmt"
"io"
"ladder/proxychain"
"net/url"
"strings"
"golang.org/x/net/html"
)
// Define list of HTML attributes to try to rewrite
var AttributesToRewrite map[string]bool
func init() {
AttributesToRewrite = map[string]bool{
"src": true,
"href": true,
"action": true,
"srcset": true, // TODO: fix
"poster": true,
"data": true,
"cite": true,
"formaction": true,
"background": true,
"usemap": true,
"longdesc": true,
"manifest": true,
"archive": true,
"codebase": true,
"icon": true,
"pluginspage": true,
}
}
// HTMLResourceURLRewriter is a struct that rewrites URLs within HTML resources to use a specified proxy URL.
// It uses an HTML tokenizer to process HTML content and rewrites URLs in src/href attributes.
//
->
type HTMLResourceURLRewriter struct {
baseURL *url.URL
tokenizer *html.Tokenizer
currentToken html.Token
tokenBuffer *bytes.Buffer
currentTokenIndex int
currentTokenProcessed bool
}
// NewHTMLResourceURLRewriter creates a new instance of HTMLResourceURLRewriter.
// It initializes the tokenizer with the provided source and sets the proxy URL.
func NewHTMLResourceURLRewriter(src io.ReadCloser, baseURL *url.URL) *HTMLResourceURLRewriter {
return &HTMLResourceURLRewriter{
tokenizer: html.NewTokenizer(src),
currentToken: html.Token{},
currentTokenIndex: 0,
tokenBuffer: new(bytes.Buffer),
baseURL: baseURL,
}
}
// Close resets the internal state of HTMLResourceURLRewriter, clearing buffers and token data.
func (r *HTMLResourceURLRewriter) Close() error {
r.tokenBuffer.Reset()
r.currentToken = html.Token{}
r.currentTokenIndex = 0
r.currentTokenProcessed = false
return nil
}
// Read processes the HTML content, rewriting URLs and managing the state of tokens.
// It reads HTML content, token by token, rewriting URLs to route through the specified proxy.
func (r *HTMLResourceURLRewriter) Read(p []byte) (int, error) {
if r.currentToken.Data == "" || r.currentTokenProcessed {
tokenType := r.tokenizer.Next()
// done reading html, close out reader
if tokenType == html.ErrorToken {
if r.tokenizer.Err() == io.EOF {
return 0, io.EOF
}
return 0, r.tokenizer.Err()
}
// flush the current token into an internal buffer
// to handle fragmented tokens
r.currentToken = r.tokenizer.Token()
// patch tokens with URLs
isTokenWithAttribute := r.currentToken.Type == html.StartTagToken || r.currentToken.Type == html.SelfClosingTagToken
if isTokenWithAttribute {
patchResourceURL(&r.currentToken, r.baseURL)
}
r.tokenBuffer.Reset()
r.tokenBuffer.WriteString(r.currentToken.String())
r.currentTokenProcessed = false
r.currentTokenIndex = 0
}
n, err := r.tokenBuffer.Read(p)
if err == io.EOF || r.tokenBuffer.Len() == 0 {
r.currentTokenProcessed = true
err = nil // EOF in this context is expected and not an actual error
}
return n, err
}
// Root-relative URLs: These are relative to the root path and start with a "/".
func handleRootRelativePath(attr *html.Attribute, baseURL *url.URL) {
// doublecheck this is a valid relative URL
_, err := url.Parse(fmt.Sprintf("http://localhost.com%s", attr.Val))
if err != nil {
return
}
//log.Printf("BASEURL patch: %s\n", baseURL)
attr.Val = fmt.Sprintf(
"/%s://%s/%s",
baseURL.Scheme,
baseURL.Host,
strings.TrimPrefix(attr.Val, "/"),
)
attr.Val = url.QueryEscape(attr.Val)
attr.Val = fmt.Sprintf("/%s", attr.Val)
//log.Printf("root rel url rewritten-> '%s'='%s'", attr.Key, attr.Val)
}
// Document-relative URLs: These are relative to the current document's path and don't start with a "/".
func handleDocumentRelativePath(attr *html.Attribute, baseURL *url.URL) {
attr.Val = fmt.Sprintf(
"%s://%s/%s%s",
baseURL.Scheme,
strings.Trim(baseURL.Host, "/"),
strings.Trim(baseURL.RawPath, "/"),
strings.Trim(attr.Val, "/"),
)
attr.Val = url.QueryEscape(attr.Val)
attr.Val = fmt.Sprintf("/%s", attr.Val)
//log.Printf("doc rel url rewritten-> '%s'='%s'", attr.Key, attr.Val)
}
// Protocol-relative URLs: These start with "//" and will use the same protocol (http or https) as the current page.
func handleProtocolRelativePath(attr *html.Attribute, baseURL *url.URL) {
attr.Val = strings.TrimPrefix(attr.Val, "/")
handleRootRelativePath(attr, baseURL)
//log.Printf("proto rel url rewritten-> '%s'='%s'", attr.Key, attr.Val)
}
func handleAbsolutePath(attr *html.Attribute, baseURL *url.URL) {
// check if valid URL
u, err := url.Parse(attr.Val)
if err != nil {
return
}
if !(u.Scheme == "http" || u.Scheme == "https") {
return
}
attr.Val = fmt.Sprintf(
"/%s",
url.QueryEscape(
strings.TrimPrefix(attr.Val, "/"),
),
)
//log.Printf("abs url rewritten-> '%s'='%s'", attr.Key, attr.Val)
}
func handleSrcSet(attr *html.Attribute, baseURL *url.URL) {
for i, src := range strings.Split(attr.Val, ",") {
src = strings.Trim(src, " ")
for j, s := range strings.Split(src, " ") {
s = strings.Trim(s, " ")
if j == 0 {
f := &html.Attribute{Val: s, Key: attr.Key}
switch {
case strings.HasPrefix(s, "//"):
handleProtocolRelativePath(f, baseURL)
case strings.HasPrefix(s, "/"):
handleRootRelativePath(f, baseURL)
case strings.HasPrefix(s, "https://") || strings.HasPrefix(s, "http://"):
handleAbsolutePath(f, baseURL)
default:
handleDocumentRelativePath(f, baseURL)
}
s = f.Val
}
if i == 0 && j == 0 {
attr.Val = s
continue
}
attr.Val = fmt.Sprintf("%s %s", attr.Val, s)
}
attr.Val = fmt.Sprintf("%s,", attr.Val)
}
attr.Val = strings.TrimSuffix(attr.Val, ",")
}
// TODO: figure out how to handle these
// srcset
func patchResourceURL(token *html.Token, baseURL *url.URL) {
for i := range token.Attr {
attr := &token.Attr[i]
switch {
// dont touch attributes except for the ones we defined
case !AttributesToRewrite[attr.Key]:
continue
case attr.Key == "srcset":
handleSrcSet(attr, baseURL)
continue
case strings.HasPrefix(attr.Val, "//"):
handleProtocolRelativePath(attr, baseURL)
continue
case strings.HasPrefix(attr.Val, "/"):
handleRootRelativePath(attr, baseURL)
continue
case strings.HasPrefix(attr.Val, "https://") || strings.HasPrefix(attr.Val, "http://"):
handleAbsolutePath(attr, baseURL)
continue
default:
handleDocumentRelativePath(attr, baseURL)
continue
}
}
}
// RewriteHTMLResourceURLs modifies HTTP responses
// to rewrite URLs attributes in HTML content (such as src, href)
// - `
` -> `
`
// - This function is designed to allow the proxified page
// to still be browsible by routing all resource URLs through the proxy.
//
// ---
//
// - It works by replacing the io.ReadCloser of the http.Response.Body
// with another io.ReaderCloser (HTMLResourceRewriter) that wraps the first one.
//
// - This process can be done multiple times, so that the response will
// be streamed and modified through each pass without buffering the entire response in memory.
//
// - HTMLResourceRewriter reads the http.Response.Body stream,
// parsing each HTML token one at a time and replacing attribute tags.
//
// - When ProxyChain.Execute() is called, the response body will be read from the server
// and pulled through each ResponseModification which wraps the ProxyChain.Response.Body
// without ever buffering the entire HTTP response in memory.
func RewriteHTMLResourceURLs() proxychain.ResponseModification {
return func(chain *proxychain.ProxyChain) error {
// return early if it's not HTML
ct := chain.Response.Header.Get("content-type")
if !strings.HasPrefix(ct, "text/html") {
return nil
}
chain.Response.Body = NewHTMLResourceURLRewriter(chain.Response.Body, chain.Request.URL)
return nil
}
}