rewrite resource URLs based on html tokenizer instead of regex

This commit is contained in:
Kevin Pham
2023-11-20 11:38:53 -06:00
parent 5035f65d6b
commit 1d88f14de2
3 changed files with 121 additions and 61 deletions

View File

@@ -49,6 +49,8 @@ func NewProxySiteHandler(opts *ProxyOptions) fiber.Handler {
rs = r
}
*/
return func(c *fiber.Ctx) error {
proxychain := proxychain.
NewProxyChain().
SetDebugLogging(opts.Verbose).
@@ -57,9 +59,8 @@ func NewProxySiteHandler(opts *ProxyOptions) fiber.Handler {
).
AddResponseModifications(
tx.DeleteIncomingCookies(),
tx.RewriteHTMLResourceURLs(),
)
return func(c *fiber.Ctx) error {
return proxychain.SetFiberCtx(c).Execute()
}

View File

@@ -83,7 +83,6 @@ type ProxyChain struct {
Client *http.Client
Request *http.Request
Response *http.Response
Body io.Reader
requestModifications []RequestModification
resultModifications []ResponseModification
Ruleset *ruleset.RuleSet
@@ -131,11 +130,17 @@ func (chain *ProxyChain) AddRuleset(rs *ruleset.RuleSet) *ProxyChain {
}
func (chain *ProxyChain) _initialize_request() (*http.Request, error) {
log.Println("ir 1")
if chain.Context == nil {
chain.abortErr = chain.abort(errors.New("no context set"))
return nil, chain.abortErr
}
// initialize a request (without url)
req, err := http.NewRequest(chain.Context.Method(), "", nil)
if err != nil {
return nil, err
}
log.Println("ir 2")
chain.Request = req
switch chain.Context.Method() {
case "GET":
@@ -152,12 +157,16 @@ func (chain *ProxyChain) _initialize_request() (*http.Request, error) {
return nil, fmt.Errorf("unsupported request method from client: '%s'", chain.Context.Method())
}
log.Println("ir 3")
/*
// copy client request headers to upstream request headers
forwardHeaders := func(key []byte, val []byte) {
req.Header.Set(string(key), string(val))
}
clientHeaders := &chain.Context.Request().Header
clientHeaders.VisitAll(forwardHeaders)
*/
log.Println("ir 4")
return req, nil
}
@@ -166,20 +175,27 @@ func (chain *ProxyChain) _initialize_request() (*http.Request, error) {
// the caller is responsible for returning a response back to the requestor
// the caller is also responsible for calling chain._reset() when they are done with the body
func (chain *ProxyChain) _execute() (io.Reader, error) {
if chain.validateCtxIsSet() != nil {
if chain.validateCtxIsSet() != nil || chain.abortErr != nil {
return nil, chain.abortErr
}
if chain.Request == nil {
return nil, errors.New("proxychain request not yet initialized")
}
if chain.Request.URL.Scheme == "" {
return nil, errors.New("request url not set or invalid. Check ProxyChain ReqMods for issues")
}
log.Println("A")
// Apply requestModifications to proxychain
for _, applyRequestModificationsTo := range chain.requestModifications {
log.Println("AA")
log.Println(applyRequestModificationsTo)
err := applyRequestModificationsTo(chain)
if err != nil {
return nil, chain.abort(err)
}
}
log.Println("B")
// Send Request Upstream
resp, err := chain.Client.Do(chain.Request)
@@ -187,9 +203,9 @@ func (chain *ProxyChain) _execute() (io.Reader, error) {
return nil, chain.abort(err)
}
chain.Response = resp
chain.Body = chain.Response.Body
log.Println("C")
defer resp.Body.Close()
//defer resp.Body.Close()
/* todo: move to rsm
for k, v := range resp.Header {
@@ -204,8 +220,9 @@ func (chain *ProxyChain) _execute() (io.Reader, error) {
return nil, chain.abort(err)
}
}
log.Println("D")
return chain.Body, nil
return chain.Response.Body, nil
}
// Execute sends the request for the ProxyChain and returns the response to the sender
@@ -214,12 +231,23 @@ func (chain *ProxyChain) _execute() (io.Reader, error) {
// be returned to the client
func (chain *ProxyChain) Execute() error {
	// Whatever happens below, clear the per-request state once this call
	// returns so the chain can be reused.
	defer chain._reset()

	log.Println("1")
	upstream, execErr := chain._execute()
	log.Println("2")
	if execErr != nil {
		log.Println(execErr)
		return execErr
	}

	log.Println("3")
	log.Println(chain)

	// Without a fiber context there is nowhere to write the response to.
	if chain.Context == nil {
		return errors.New("no context set")
	}

	// Stream the upstream body back to the client, labelled as HTML.
	chain.Context.Set("content-type", "text/html")
	return chain.Context.SendStream(upstream)
}
// reconstructUrlFromReferer reconstructs the URL using the referer's scheme, host, and the relative path / queries
@@ -232,9 +260,11 @@ func reconstructUrlFromReferer(referer *url.URL, relativeUrl *url.URL) (*url.URL
}
if realUrl.Scheme == "" || realUrl.Host == "" {
return nil, fmt.Errorf("invalid referer URL: %s", referer)
return nil, fmt.Errorf("invalid referer URL: '%s' on request '%s", referer, relativeUrl)
}
log.Printf("'%s' -> '%s'\n", relativeUrl.String(), realUrl.String())
return &url.URL{
Scheme: referer.Scheme,
Host: referer.Host,
@@ -251,11 +281,13 @@ func (chain *ProxyChain) extractUrl() (*url.URL, error) {
if err != nil {
reqUrl = chain.Context.Params("*") // fallback
}
fmt.Println(reqUrl)
urlQuery, err := url.Parse(reqUrl)
if err != nil {
return nil, fmt.Errorf("error parsing request URL '%s': %v", reqUrl, err)
}
fmt.Println(urlQuery)
// Handle standard paths
// eg: https://localhost:8080/https://realsite.com/images/foobar.jpg -> https://realsite.com/images/foobar.jpg
@@ -338,9 +370,8 @@ func (chain *ProxyChain) abort(err error) error {
// internal function to reset state of ProxyChain for reuse
func (chain *ProxyChain) _reset() {
chain.abortErr = nil
chain.Body = nil
chain.Request = nil
chain.Response = nil
//chain.Response = nil
chain.Context = nil
}

View File

@@ -4,6 +4,7 @@ import (
"bytes"
"io"
"ladder/proxychain"
"log"
"net/url"
"strings"
@@ -11,67 +12,94 @@ import (
)
type HTMLResourceURLRewriter struct {
src io.Reader
buffer *bytes.Buffer // buffer to temporarily hold rewritten output for the reader
proxyURL *url.URL // proxyURL is the URL of the proxy, not the upstream URL
proxyURL *url.URL // proxyURL is the URL of the proxy, not the upstream URL; TODO: implement
tokenizer *html.Tokenizer
currentToken html.Token
tokenBuffer *bytes.Buffer
currentTokenIndex int
currentTokenProcessed bool
}
func NewHTMLResourceURLRewriter(src io.Reader, proxyURL *url.URL) *HTMLResourceURLRewriter {
func NewHTMLResourceURLRewriter(src io.ReadCloser, proxyURL *url.URL) *HTMLResourceURLRewriter {
log.Println("tokenize")
return &HTMLResourceURLRewriter{
src: src,
buffer: new(bytes.Buffer),
tokenizer: html.NewTokenizer(src),
currentToken: html.Token{},
currentTokenIndex: 0,
tokenBuffer: new(bytes.Buffer),
proxyURL: proxyURL,
}
}
func rewriteToken(token *html.Token, baseURL *url.URL) {
attrsToRewrite := map[string]bool{"href": true, "src": true, "action": true, "srcset": true}
for i := range token.Attr {
attr := &token.Attr[i]
if attrsToRewrite[attr.Key] && strings.HasPrefix(attr.Val, "/") {
// Make URL absolute
attr.Val = "/https://" + baseURL.Host + attr.Val
}
}
// Close resets the rewriter's token bookkeeping: the buffered token text, the
// current token, the read index and the processed flag. It always returns nil.
//
// NOTE(review): nothing here closes the reader the tokenizer was built from;
// if callers rely on Close to release the upstream body, that has to happen
// elsewhere — confirm.
func (r *HTMLResourceURLRewriter) Close() error {
	r.tokenBuffer.Reset()

	r.currentTokenProcessed = false
	r.currentTokenIndex = 0
	r.currentToken = html.Token{}

	return nil
}
func (r *HTMLResourceURLRewriter) Read(p []byte) (int, error) {
if r.buffer.Len() != 0 {
return r.buffer.Read(p)
}
tokenizer := html.NewTokenizer(r.src)
for {
tokenType := tokenizer.Next()
if r.currentToken.Data == "" || r.currentTokenProcessed {
tokenType := r.tokenizer.Next()
// done reading html, close out reader
if tokenType == html.ErrorToken {
err := tokenizer.Err()
if err == io.EOF {
return 0, io.EOF // End of document
if r.tokenizer.Err() == io.EOF {
return 0, io.EOF
}
return 0, err // Actual error
return 0, r.tokenizer.Err()
}
token := tokenizer.Token()
if tokenType == html.StartTagToken || tokenType == html.SelfClosingTagToken {
rewriteToken(&token, r.proxyURL)
// flush the current token into an internal buffer
// to handle fragmented tokens
r.currentToken = r.tokenizer.Token()
r.tokenBuffer.Reset()
r.tokenBuffer.WriteString(r.currentToken.String())
r.currentTokenProcessed = false
r.currentTokenIndex = 0
}
r.buffer.WriteString(token.String())
if r.buffer.Len() > 0 {
break
n, err := r.tokenBuffer.Read(p)
if err == io.EOF || r.tokenBuffer.Len() == 0 {
r.currentTokenProcessed = true
err = nil // Reset error to nil because EOF in this context is expected and not an actual error
}
}
return r.buffer.Read(p)
return n, err
}
// RewriteHTMLResourceURLs updates src/href attributes in HTML content to route through the proxy.
func RewriteHTMLResourceURLs() proxychain.ResponseModification {
return func(chain *proxychain.ProxyChain) error {
log.Println("rhru")
ct := chain.Response.Header.Get("content-type")
if ct != "text/html" {
log.Println(ct)
if !strings.HasPrefix(ct, "text/html") {
return nil
}
// TODO: implement chaining rewriter chaining method
// so we can compose multiple body rewriters together
log.Println("rhru2")
// chain.Response.Body is an unread http.Response.Body
chain.Response.Body = NewHTMLResourceURLRewriter(chain.Response.Body, chain.Request.URL)
return nil
}
}
// rewriteToken rewrites the URL-bearing attributes of token (href, src, action
// and srcset) in place so that they are fetched through the proxy, i.e. as "/"
// followed by the absolute upstream URL (for example
// "/https://realsite.com/images/foobar.jpg").
//
// Relative references ("img/a.png", "/img/a.png", "//cdn.example/a.png") are
// resolved against baseURL before being prefixed. Blindly prepending "/" would
// turn "/img/a.png" into the protocol-relative "//img/a.png" and send the
// browser to a host named "img".
//
// NOTE(review): baseURL is assumed to be the URL of the upstream page the token
// came from (the previous implementation built "/https://" + baseURL.Host +
// path) — confirm against the caller.
//
// Values left untouched:
//   - empty values and same-document fragments ("#top");
//   - non-http(s) schemes such as data:, javascript:, mailto:, tel:;
//   - values that cannot be parsed, or that are still relative after
//     resolution (for example when baseURL is nil).
//
// srcset is handled per image candidate, preserving each descriptor.
func rewriteToken(token *html.Token, baseURL *url.URL) {
	log.Println(token.String())
	for i := range token.Attr {
		attr := &token.Attr[i]
		switch attr.Key {
		case "href", "src", "action":
			attr.Val = proxiedResourceURL(attr.Val, baseURL)
		case "srcset":
			attr.Val = proxiedSrcset(attr.Val, baseURL)
		}
	}
}

// proxiedResourceURL returns raw in proxy form ("/" + absolute URL), or raw
// unchanged when it should not or cannot be routed through the proxy.
func proxiedResourceURL(raw string, baseURL *url.URL) string {
	trimmed := strings.TrimSpace(raw)
	if trimmed == "" || strings.HasPrefix(trimmed, "#") {
		return raw // nothing to fetch, or a same-document fragment
	}

	ref, err := url.Parse(trimmed)
	if err != nil {
		return raw
	}
	// url.Parse lower-cases the scheme, so a plain comparison is enough.
	if ref.Scheme != "" && ref.Scheme != "http" && ref.Scheme != "https" {
		return raw // data:, javascript:, mailto:, tel:, ...
	}

	if baseURL != nil {
		ref = baseURL.ResolveReference(ref)
	}
	if ref.Scheme == "" || ref.Host == "" {
		return raw // still relative: there is no absolute upstream URL to point at
	}
	return "/" + ref.String()
}

// proxiedSrcset rewrites the URL of every image candidate in a srcset value
// and keeps each candidate's width/density descriptor as is.
func proxiedSrcset(val string, baseURL *url.URL) string {
	const space = " \t\n\r\f"

	var candidates []string
	rest := val
	for {
		rest = strings.TrimLeft(rest, ","+space)
		if rest == "" {
			break
		}

		// The URL runs up to the first whitespace character.
		rawURL := rest
		if end := strings.IndexAny(rest, space); end >= 0 {
			rawURL, rest = rest[:end], rest[end:]
		} else {
			rest = ""
		}

		descriptor := ""
		if stripped := strings.TrimRight(rawURL, ","); stripped != rawURL {
			// A trailing comma ends the candidate: it has no descriptor.
			rawURL = stripped
		} else if comma := strings.IndexByte(rest, ','); comma >= 0 {
			descriptor, rest = strings.TrimSpace(rest[:comma]), rest[comma+1:]
		} else {
			descriptor, rest = strings.TrimSpace(rest), ""
		}

		candidate := proxiedResourceURL(rawURL, baseURL)
		if descriptor != "" {
			candidate += " " + descriptor
		}
		candidates = append(candidates, candidate)
	}

	if len(candidates) == 0 {
		return val
	}
	return strings.Join(candidates, ", ")
}