From 1d88f14de205527cde3a32483fad697268fd6e7f Mon Sep 17 00:00:00 2001 From: Kevin Pham Date: Mon, 20 Nov 2023 11:38:53 -0600 Subject: [PATCH] rewrite resource URLs based on html tokenizer instead of regex --- handlers/proxy.go | 19 ++-- proxychain/proxychain.go | 59 +++++++--- .../rewrite_http_resource_urls.go | 104 +++++++++++------- 3 files changed, 121 insertions(+), 61 deletions(-) diff --git a/handlers/proxy.go b/handlers/proxy.go index fc3beea..70e7577 100644 --- a/handlers/proxy.go +++ b/handlers/proxy.go @@ -49,17 +49,18 @@ func NewProxySiteHandler(opts *ProxyOptions) fiber.Handler { rs = r } */ - proxychain := proxychain. - NewProxyChain(). - SetDebugLogging(opts.Verbose). - SetRequestModifications( - rx.DeleteOutgoingCookies(), - ). - AddResponseModifications( - tx.DeleteIncomingCookies(), - ) return func(c *fiber.Ctx) error { + proxychain := proxychain. + NewProxyChain(). + SetDebugLogging(opts.Verbose). + SetRequestModifications( + rx.DeleteOutgoingCookies(), + ). + AddResponseModifications( + tx.DeleteIncomingCookies(), + tx.RewriteHTMLResourceURLs(), + ) return proxychain.SetFiberCtx(c).Execute() } diff --git a/proxychain/proxychain.go b/proxychain/proxychain.go index b4c3613..a6f5948 100644 --- a/proxychain/proxychain.go +++ b/proxychain/proxychain.go @@ -83,7 +83,6 @@ type ProxyChain struct { Client *http.Client Request *http.Request Response *http.Response - Body io.Reader requestModifications []RequestModification resultModifications []ResponseModification Ruleset *ruleset.RuleSet @@ -131,11 +130,17 @@ func (chain *ProxyChain) AddRuleset(rs *ruleset.RuleSet) *ProxyChain { } func (chain *ProxyChain) _initialize_request() (*http.Request, error) { + log.Println("ir 1") + if chain.Context == nil { + chain.abortErr = chain.abort(errors.New("no context set")) + return nil, chain.abortErr + } // initialize a request (without url) req, err := http.NewRequest(chain.Context.Method(), "", nil) if err != nil { return nil, err } + log.Println("ir 2") chain.Request = req switch chain.Context.Method() { case "GET": @@ -152,12 +157,16 @@ func (chain *ProxyChain) _initialize_request() (*http.Request, error) { return nil, fmt.Errorf("unsupported request method from client: '%s'", chain.Context.Method()) } - // copy client request headers to upstream request headers - forwardHeaders := func(key []byte, val []byte) { - req.Header.Set(string(key), string(val)) - } - clientHeaders := &chain.Context.Request().Header - clientHeaders.VisitAll(forwardHeaders) + log.Println("ir 3") + /* + // copy client request headers to upstream request headers + forwardHeaders := func(key []byte, val []byte) { + req.Header.Set(string(key), string(val)) + } + clientHeaders := &chain.Context.Request().Header + clientHeaders.VisitAll(forwardHeaders) + */ + log.Println("ir 4") return req, nil } @@ -166,20 +175,27 @@ func (chain *ProxyChain) _initialize_request() (*http.Request, error) { // the caller is responsible for returning a response back to the requestor // the caller is also responsible for calling chain._reset() when they are done with the body func (chain *ProxyChain) _execute() (io.Reader, error) { - if chain.validateCtxIsSet() != nil { + if chain.validateCtxIsSet() != nil || chain.abortErr != nil { return nil, chain.abortErr } + if chain.Request == nil { + return nil, errors.New("proxychain request not yet initialized") + } if chain.Request.URL.Scheme == "" { return nil, errors.New("request url not set or invalid. Check ProxyChain ReqMods for issues") } + log.Println("A") // Apply requestModifications to proxychain for _, applyRequestModificationsTo := range chain.requestModifications { + log.Println("AA") + log.Println(applyRequestModificationsTo) err := applyRequestModificationsTo(chain) if err != nil { return nil, chain.abort(err) } } + log.Println("B") // Send Request Upstream resp, err := chain.Client.Do(chain.Request) @@ -187,9 +203,9 @@ func (chain *ProxyChain) _execute() (io.Reader, error) { return nil, chain.abort(err) } chain.Response = resp - chain.Body = chain.Response.Body + log.Println("C") - defer resp.Body.Close() + //defer resp.Body.Close() /* todo: move to rsm for k, v := range resp.Header { @@ -204,8 +220,9 @@ func (chain *ProxyChain) _execute() (io.Reader, error) { return nil, chain.abort(err) } } + log.Println("D") - return chain.Body, nil + return chain.Response.Body, nil } // Execute sends the request for the ProxyChain and returns the request to the sender @@ -214,12 +231,23 @@ func (chain *ProxyChain) _execute() (io.Reader, error) { // be returned to the client func (chain *ProxyChain) Execute() error { defer chain._reset() + log.Println("1") body, err := chain._execute() + log.Println("2") if err != nil { + log.Println(err) return err } + log.Println("3") + log.Println(chain) + if chain.Context == nil { + return errors.New("no context set") + } // Return request back to client + chain.Context.Set("content-type", "text/html") return chain.Context.SendStream(body) + + //return chain.Context.SendStream(body) } // reconstructUrlFromReferer reconstructs the URL using the referer's scheme, host, and the relative path / queries @@ -232,9 +260,11 @@ func reconstructUrlFromReferer(referer *url.URL, relativeUrl *url.URL) (*url.URL } if realUrl.Scheme == "" || realUrl.Host == "" { - return nil, fmt.Errorf("invalid referer URL: %s", referer) + return nil, fmt.Errorf("invalid referer URL: '%s' on request '%s", referer, relativeUrl) } + log.Printf("'%s' -> '%s'\n", relativeUrl.String(), realUrl.String()) + return &url.URL{ Scheme: referer.Scheme, Host: referer.Host, @@ -251,11 +281,13 @@ func (chain *ProxyChain) extractUrl() (*url.URL, error) { if err != nil { reqUrl = chain.Context.Params("*") // fallback } + fmt.Println(reqUrl) urlQuery, err := url.Parse(reqUrl) if err != nil { return nil, fmt.Errorf("error parsing request URL '%s': %v", reqUrl, err) } + fmt.Println(urlQuery) // Handle standard paths // eg: https://localhost:8080/https://realsite.com/images/foobar.jpg -> https://realsite.com/images/foobar.jpg @@ -338,9 +370,8 @@ func (chain *ProxyChain) abort(err error) error { // internal function to reset state of ProxyChain for reuse func (chain *ProxyChain) _reset() { chain.abortErr = nil - chain.Body = nil chain.Request = nil - chain.Response = nil + //chain.Response = nil chain.Context = nil } diff --git a/proxychain/responsemodifers/rewrite_http_resource_urls.go b/proxychain/responsemodifers/rewrite_http_resource_urls.go index d572a5e..1e051a2 100644 --- a/proxychain/responsemodifers/rewrite_http_resource_urls.go +++ b/proxychain/responsemodifers/rewrite_http_resource_urls.go @@ -4,6 +4,7 @@ import ( "bytes" "io" "ladder/proxychain" + "log" "net/url" "strings" @@ -11,67 +12,94 @@ import ( ) type HTMLResourceURLRewriter struct { - src io.Reader - buffer *bytes.Buffer // buffer to temporarily hold rewritten output for the reader - proxyURL *url.URL // proxyURL is the URL of the proxy, not the upstream URL + proxyURL *url.URL // proxyURL is the URL of the proxy, not the upstream URL; TODO: implement + tokenizer *html.Tokenizer + currentToken html.Token + tokenBuffer *bytes.Buffer + currentTokenIndex int + currentTokenProcessed bool } -func NewHTMLResourceURLRewriter(src io.Reader, proxyURL *url.URL) *HTMLResourceURLRewriter { +func NewHTMLResourceURLRewriter(src io.ReadCloser, proxyURL *url.URL) *HTMLResourceURLRewriter { + log.Println("tokenize") return &HTMLResourceURLRewriter{ - src: src, - buffer: new(bytes.Buffer), - proxyURL: proxyURL, + tokenizer: html.NewTokenizer(src), + currentToken: html.Token{}, + currentTokenIndex: 0, + tokenBuffer: new(bytes.Buffer), + proxyURL: proxyURL, } } -func rewriteToken(token *html.Token, baseURL *url.URL) { - attrsToRewrite := map[string]bool{"href": true, "src": true, "action": true, "srcset": true} - for i := range token.Attr { - attr := &token.Attr[i] - if attrsToRewrite[attr.Key] && strings.HasPrefix(attr.Val, "/") { - // Make URL absolute - attr.Val = "/https://" + baseURL.Host + attr.Val - } - } +func (r *HTMLResourceURLRewriter) Close() error { + r.tokenBuffer.Reset() + r.currentToken = html.Token{} + r.currentTokenIndex = 0 + r.currentTokenProcessed = false + return nil } func (r *HTMLResourceURLRewriter) Read(p []byte) (int, error) { - if r.buffer.Len() != 0 { - return r.buffer.Read(p) - } - tokenizer := html.NewTokenizer(r.src) - for { - tokenType := tokenizer.Next() + if r.currentToken.Data == "" || r.currentTokenProcessed { + tokenType := r.tokenizer.Next() + + // done reading html, close out reader if tokenType == html.ErrorToken { - err := tokenizer.Err() - if err == io.EOF { - return 0, io.EOF // End of document + if r.tokenizer.Err() == io.EOF { + return 0, io.EOF } - return 0, err // Actual error + return 0, r.tokenizer.Err() } - token := tokenizer.Token() - if tokenType == html.StartTagToken || tokenType == html.SelfClosingTagToken { - rewriteToken(&token, r.proxyURL) - } - r.buffer.WriteString(token.String()) - if r.buffer.Len() > 0 { - break - } + // flush the current token into an internal buffer + // to handle fragmented tokens + r.currentToken = r.tokenizer.Token() + r.tokenBuffer.Reset() + r.tokenBuffer.WriteString(r.currentToken.String()) + r.currentTokenProcessed = false + r.currentTokenIndex = 0 } - return r.buffer.Read(p) + + n, err := r.tokenBuffer.Read(p) + + if err == io.EOF || r.tokenBuffer.Len() == 0 { + r.currentTokenProcessed = true + err = nil // Reset error to nil because EOF in this context is expected and not an actual error + } + return n, err + } // RewriteHTMLResourceURLs updates src/href attributes in HTML content to route through the proxy. func RewriteHTMLResourceURLs() proxychain.ResponseModification { return func(chain *proxychain.ProxyChain) error { + log.Println("rhru") ct := chain.Response.Header.Get("content-type") - if ct != "text/html" { + log.Println(ct) + if !strings.HasPrefix(ct, "text/html") { return nil } - // TODO: implement chaining rewriter chaining method - // so we can compose multiple body rewriters together + log.Println("rhru2") + // chain.Response.Body is an unread http.Response.Body + chain.Response.Body = NewHTMLResourceURLRewriter(chain.Response.Body, chain.Request.URL) return nil } } + +func rewriteToken(token *html.Token, baseURL *url.URL) { + log.Println(token.String()) + attrsToRewrite := map[string]bool{"href": true, "src": true, "action": true, "srcset": true} + for i := range token.Attr { + attr := &token.Attr[i] + if attrsToRewrite[attr.Key] { + attr.Val = "/" + attr.Val + } + /* + if attrsToRewrite[attr.Key] && strings.HasPrefix(attr.Val, "/") { + // Make URL absolute + attr.Val = "/https://" + baseURL.Host + attr.Val + } + */ + } +}