rewrite resource URLs based on html tokenizer instead of regex
This commit is contained in:
@@ -83,7 +83,6 @@ type ProxyChain struct {
|
||||
Client *http.Client
|
||||
Request *http.Request
|
||||
Response *http.Response
|
||||
Body io.Reader
|
||||
requestModifications []RequestModification
|
||||
resultModifications []ResponseModification
|
||||
Ruleset *ruleset.RuleSet
|
||||
@@ -131,11 +130,17 @@ func (chain *ProxyChain) AddRuleset(rs *ruleset.RuleSet) *ProxyChain {
|
||||
}
|
||||
|
||||
func (chain *ProxyChain) _initialize_request() (*http.Request, error) {
|
||||
log.Println("ir 1")
|
||||
if chain.Context == nil {
|
||||
chain.abortErr = chain.abort(errors.New("no context set"))
|
||||
return nil, chain.abortErr
|
||||
}
|
||||
// initialize a request (without url)
|
||||
req, err := http.NewRequest(chain.Context.Method(), "", nil)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
log.Println("ir 2")
|
||||
chain.Request = req
|
||||
switch chain.Context.Method() {
|
||||
case "GET":
|
||||
@@ -152,12 +157,16 @@ func (chain *ProxyChain) _initialize_request() (*http.Request, error) {
|
||||
return nil, fmt.Errorf("unsupported request method from client: '%s'", chain.Context.Method())
|
||||
}
|
||||
|
||||
// copy client request headers to upstream request headers
|
||||
forwardHeaders := func(key []byte, val []byte) {
|
||||
req.Header.Set(string(key), string(val))
|
||||
}
|
||||
clientHeaders := &chain.Context.Request().Header
|
||||
clientHeaders.VisitAll(forwardHeaders)
|
||||
log.Println("ir 3")
|
||||
/*
|
||||
// copy client request headers to upstream request headers
|
||||
forwardHeaders := func(key []byte, val []byte) {
|
||||
req.Header.Set(string(key), string(val))
|
||||
}
|
||||
clientHeaders := &chain.Context.Request().Header
|
||||
clientHeaders.VisitAll(forwardHeaders)
|
||||
*/
|
||||
log.Println("ir 4")
|
||||
|
||||
return req, nil
|
||||
}
|
||||
@@ -166,20 +175,27 @@ func (chain *ProxyChain) _initialize_request() (*http.Request, error) {
|
||||
// the caller is responsible for returning a response back to the requestor
|
||||
// the caller is also responsible for calling chain._reset() when they are done with the body
|
||||
func (chain *ProxyChain) _execute() (io.Reader, error) {
|
||||
if chain.validateCtxIsSet() != nil {
|
||||
if chain.validateCtxIsSet() != nil || chain.abortErr != nil {
|
||||
return nil, chain.abortErr
|
||||
}
|
||||
if chain.Request == nil {
|
||||
return nil, errors.New("proxychain request not yet initialized")
|
||||
}
|
||||
if chain.Request.URL.Scheme == "" {
|
||||
return nil, errors.New("request url not set or invalid. Check ProxyChain ReqMods for issues")
|
||||
}
|
||||
log.Println("A")
|
||||
|
||||
// Apply requestModifications to proxychain
|
||||
for _, applyRequestModificationsTo := range chain.requestModifications {
|
||||
log.Println("AA")
|
||||
log.Println(applyRequestModificationsTo)
|
||||
err := applyRequestModificationsTo(chain)
|
||||
if err != nil {
|
||||
return nil, chain.abort(err)
|
||||
}
|
||||
}
|
||||
log.Println("B")
|
||||
|
||||
// Send Request Upstream
|
||||
resp, err := chain.Client.Do(chain.Request)
|
||||
@@ -187,9 +203,9 @@ func (chain *ProxyChain) _execute() (io.Reader, error) {
|
||||
return nil, chain.abort(err)
|
||||
}
|
||||
chain.Response = resp
|
||||
chain.Body = chain.Response.Body
|
||||
log.Println("C")
|
||||
|
||||
defer resp.Body.Close()
|
||||
//defer resp.Body.Close()
|
||||
|
||||
/* todo: move to rsm
|
||||
for k, v := range resp.Header {
|
||||
@@ -204,8 +220,9 @@ func (chain *ProxyChain) _execute() (io.Reader, error) {
|
||||
return nil, chain.abort(err)
|
||||
}
|
||||
}
|
||||
log.Println("D")
|
||||
|
||||
return chain.Body, nil
|
||||
return chain.Response.Body, nil
|
||||
}
|
||||
|
||||
// Execute sends the request for the ProxyChain and returns the request to the sender
|
||||
@@ -214,12 +231,23 @@ func (chain *ProxyChain) _execute() (io.Reader, error) {
|
||||
// be returned to the client
|
||||
func (chain *ProxyChain) Execute() error {
|
||||
defer chain._reset()
|
||||
log.Println("1")
|
||||
body, err := chain._execute()
|
||||
log.Println("2")
|
||||
if err != nil {
|
||||
log.Println(err)
|
||||
return err
|
||||
}
|
||||
log.Println("3")
|
||||
log.Println(chain)
|
||||
if chain.Context == nil {
|
||||
return errors.New("no context set")
|
||||
}
|
||||
// Return request back to client
|
||||
chain.Context.Set("content-type", "text/html")
|
||||
return chain.Context.SendStream(body)
|
||||
|
||||
//return chain.Context.SendStream(body)
|
||||
}
|
||||
|
||||
// reconstructUrlFromReferer reconstructs the URL using the referer's scheme, host, and the relative path / queries
|
||||
@@ -232,9 +260,11 @@ func reconstructUrlFromReferer(referer *url.URL, relativeUrl *url.URL) (*url.URL
|
||||
}
|
||||
|
||||
if realUrl.Scheme == "" || realUrl.Host == "" {
|
||||
return nil, fmt.Errorf("invalid referer URL: %s", referer)
|
||||
return nil, fmt.Errorf("invalid referer URL: '%s' on request '%s", referer, relativeUrl)
|
||||
}
|
||||
|
||||
log.Printf("'%s' -> '%s'\n", relativeUrl.String(), realUrl.String())
|
||||
|
||||
return &url.URL{
|
||||
Scheme: referer.Scheme,
|
||||
Host: referer.Host,
|
||||
@@ -251,11 +281,13 @@ func (chain *ProxyChain) extractUrl() (*url.URL, error) {
|
||||
if err != nil {
|
||||
reqUrl = chain.Context.Params("*") // fallback
|
||||
}
|
||||
fmt.Println(reqUrl)
|
||||
|
||||
urlQuery, err := url.Parse(reqUrl)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("error parsing request URL '%s': %v", reqUrl, err)
|
||||
}
|
||||
fmt.Println(urlQuery)
|
||||
|
||||
// Handle standard paths
|
||||
// eg: https://localhost:8080/https://realsite.com/images/foobar.jpg -> https://realsite.com/images/foobar.jpg
|
||||
@@ -338,9 +370,8 @@ func (chain *ProxyChain) abort(err error) error {
|
||||
// internal function to reset state of ProxyChain for reuse
|
||||
func (chain *ProxyChain) _reset() {
|
||||
chain.abortErr = nil
|
||||
chain.Body = nil
|
||||
chain.Request = nil
|
||||
chain.Response = nil
|
||||
//chain.Response = nil
|
||||
chain.Context = nil
|
||||
}
|
||||
|
||||
|
||||
@@ -4,6 +4,7 @@ import (
|
||||
"bytes"
|
||||
"io"
|
||||
"ladder/proxychain"
|
||||
"log"
|
||||
"net/url"
|
||||
"strings"
|
||||
|
||||
@@ -11,67 +12,94 @@ import (
|
||||
)
|
||||
|
||||
type HTMLResourceURLRewriter struct {
|
||||
src io.Reader
|
||||
buffer *bytes.Buffer // buffer to temporarily hold rewritten output for the reader
|
||||
proxyURL *url.URL // proxyURL is the URL of the proxy, not the upstream URL
|
||||
proxyURL *url.URL // proxyURL is the URL of the proxy, not the upstream URL; TODO: implement
|
||||
tokenizer *html.Tokenizer
|
||||
currentToken html.Token
|
||||
tokenBuffer *bytes.Buffer
|
||||
currentTokenIndex int
|
||||
currentTokenProcessed bool
|
||||
}
|
||||
|
||||
func NewHTMLResourceURLRewriter(src io.Reader, proxyURL *url.URL) *HTMLResourceURLRewriter {
|
||||
func NewHTMLResourceURLRewriter(src io.ReadCloser, proxyURL *url.URL) *HTMLResourceURLRewriter {
|
||||
log.Println("tokenize")
|
||||
return &HTMLResourceURLRewriter{
|
||||
src: src,
|
||||
buffer: new(bytes.Buffer),
|
||||
proxyURL: proxyURL,
|
||||
tokenizer: html.NewTokenizer(src),
|
||||
currentToken: html.Token{},
|
||||
currentTokenIndex: 0,
|
||||
tokenBuffer: new(bytes.Buffer),
|
||||
proxyURL: proxyURL,
|
||||
}
|
||||
}
|
||||
|
||||
func rewriteToken(token *html.Token, baseURL *url.URL) {
|
||||
attrsToRewrite := map[string]bool{"href": true, "src": true, "action": true, "srcset": true}
|
||||
for i := range token.Attr {
|
||||
attr := &token.Attr[i]
|
||||
if attrsToRewrite[attr.Key] && strings.HasPrefix(attr.Val, "/") {
|
||||
// Make URL absolute
|
||||
attr.Val = "/https://" + baseURL.Host + attr.Val
|
||||
}
|
||||
}
|
||||
func (r *HTMLResourceURLRewriter) Close() error {
|
||||
r.tokenBuffer.Reset()
|
||||
r.currentToken = html.Token{}
|
||||
r.currentTokenIndex = 0
|
||||
r.currentTokenProcessed = false
|
||||
return nil
|
||||
}
|
||||
|
||||
func (r *HTMLResourceURLRewriter) Read(p []byte) (int, error) {
|
||||
if r.buffer.Len() != 0 {
|
||||
return r.buffer.Read(p)
|
||||
}
|
||||
|
||||
tokenizer := html.NewTokenizer(r.src)
|
||||
for {
|
||||
tokenType := tokenizer.Next()
|
||||
if r.currentToken.Data == "" || r.currentTokenProcessed {
|
||||
tokenType := r.tokenizer.Next()
|
||||
|
||||
// done reading html, close out reader
|
||||
if tokenType == html.ErrorToken {
|
||||
err := tokenizer.Err()
|
||||
if err == io.EOF {
|
||||
return 0, io.EOF // End of document
|
||||
if r.tokenizer.Err() == io.EOF {
|
||||
return 0, io.EOF
|
||||
}
|
||||
return 0, err // Actual error
|
||||
return 0, r.tokenizer.Err()
|
||||
}
|
||||
|
||||
token := tokenizer.Token()
|
||||
if tokenType == html.StartTagToken || tokenType == html.SelfClosingTagToken {
|
||||
rewriteToken(&token, r.proxyURL)
|
||||
}
|
||||
r.buffer.WriteString(token.String())
|
||||
if r.buffer.Len() > 0 {
|
||||
break
|
||||
}
|
||||
// flush the current token into an internal buffer
|
||||
// to handle fragmented tokens
|
||||
r.currentToken = r.tokenizer.Token()
|
||||
r.tokenBuffer.Reset()
|
||||
r.tokenBuffer.WriteString(r.currentToken.String())
|
||||
r.currentTokenProcessed = false
|
||||
r.currentTokenIndex = 0
|
||||
}
|
||||
return r.buffer.Read(p)
|
||||
|
||||
n, err := r.tokenBuffer.Read(p)
|
||||
|
||||
if err == io.EOF || r.tokenBuffer.Len() == 0 {
|
||||
r.currentTokenProcessed = true
|
||||
err = nil // Reset error to nil because EOF in this context is expected and not an actual error
|
||||
}
|
||||
return n, err
|
||||
|
||||
}
|
||||
|
||||
// RewriteHTMLResourceURLs updates src/href attributes in HTML content to route through the proxy.
|
||||
func RewriteHTMLResourceURLs() proxychain.ResponseModification {
|
||||
return func(chain *proxychain.ProxyChain) error {
|
||||
log.Println("rhru")
|
||||
ct := chain.Response.Header.Get("content-type")
|
||||
if ct != "text/html" {
|
||||
log.Println(ct)
|
||||
if !strings.HasPrefix(ct, "text/html") {
|
||||
return nil
|
||||
}
|
||||
// TODO: implement chaining rewriter chaining method
|
||||
// so we can compose multiple body rewriters together
|
||||
log.Println("rhru2")
|
||||
// chain.Response.Body is an unread http.Response.Body
|
||||
chain.Response.Body = NewHTMLResourceURLRewriter(chain.Response.Body, chain.Request.URL)
|
||||
return nil
|
||||
}
|
||||
}
|
||||
|
||||
func rewriteToken(token *html.Token, baseURL *url.URL) {
|
||||
log.Println(token.String())
|
||||
attrsToRewrite := map[string]bool{"href": true, "src": true, "action": true, "srcset": true}
|
||||
for i := range token.Attr {
|
||||
attr := &token.Attr[i]
|
||||
if attrsToRewrite[attr.Key] {
|
||||
attr.Val = "/" + attr.Val
|
||||
}
|
||||
/*
|
||||
if attrsToRewrite[attr.Key] && strings.HasPrefix(attr.Val, "/") {
|
||||
// Make URL absolute
|
||||
attr.Val = "/https://" + baseURL.Host + attr.Val
|
||||
}
|
||||
*/
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user