organize rewriters

This commit is contained in:
Kevin Pham
2023-11-21 18:44:33 -06:00
parent 0fc0942095
commit 0e620e46ab
4 changed files with 493 additions and 332 deletions

View File

@@ -0,0 +1,3 @@
package rewriters
// todo: implement

View File

@@ -0,0 +1,313 @@
package rewriters
import (
"bytes"
_ "embed"
"fmt"
"io"
"log"
"net/url"
"strings"
"golang.org/x/net/html"
)
var (
	// attributesToRewrite is the set of HTML attribute names whose values
	// are treated as URLs and considered for rewriting.
	attributesToRewrite = map[string]bool{
		"src":         true,
		"href":        true,
		"action":      true,
		"srcset":      true,
		"poster":      true,
		"data":        true,
		"cite":        true,
		"formaction":  true,
		"background":  true,
		"usemap":      true,
		"longdesc":    true,
		"manifest":    true,
		"archive":     true,
		"codebase":    true,
		"icon":        true,
		"pluginspage": true,
	}

	// schemeBlacklist is the set of URI schemes that are left untouched,
	// for example <img src="data:image/png;base64,iVBORw...">.
	schemeBlacklist = map[string]bool{
		"data":       true,
		"tel":        true,
		"mailto":     true,
		"file":       true,
		"blob":       true,
		"javascript": true,
		"about":      true,
		"magnet":     true,
		"ws":         true,
		"wss":        true,
		"ftp":        true,
	}
)
// HTMLResourceURLRewriter is a struct that rewrites URLs within HTML resources to use a specified proxy URL.
// It uses an HTML tokenizer to process HTML content and rewrites URL-bearing attributes
// (src, href, srcset, ...) on start and self-closing tags.
// Illustrative example (the emitted value is additionally query-escaped):
// <img src='/relative_path'> -> <img src='/https://proxiedsite.com/relative_path'>
type HTMLResourceURLRewriter struct {
// baseURL is passed to patchResourceURL as the URL of the proxied document.
baseURL *url.URL
// tokenizer produces the HTML tokens from the source reader.
tokenizer *html.Tokenizer
// currentToken is the most recent token taken from the tokenizer.
currentToken html.Token
// tokenBuffer holds the serialized output for the current token until Read drains it.
tokenBuffer *bytes.Buffer
// scriptContentBuffer accumulates tokens seen between <script> and </script>.
scriptContentBuffer *bytes.Buffer
// insideScript is true between a <script> start tag and its end tag.
insideScript bool
// currentTokenIndex is only ever reset to 0 by the code visible here.
currentTokenIndex int
// currentTokenProcessed is set once tokenBuffer has been fully drained by Read.
currentTokenProcessed bool
proxyURL string // ladder URL, not proxied site URL
}
// NewHTMLResourceURLRewriter returns a rewriter that tokenizes the HTML read
// from src. baseURL is the URL of the proxied document and proxyURL is the
// URL of the proxy itself; both are stored for use while patching attributes.
func NewHTMLResourceURLRewriter(src io.ReadCloser, baseURL *url.URL, proxyURL string) *HTMLResourceURLRewriter {
	rewriter := new(HTMLResourceURLRewriter)

	rewriter.baseURL = baseURL
	rewriter.proxyURL = proxyURL
	rewriter.tokenizer = html.NewTokenizer(src)
	rewriter.tokenBuffer = new(bytes.Buffer)
	rewriter.scriptContentBuffer = new(bytes.Buffer)

	// currentToken, currentTokenIndex, currentTokenProcessed and insideScript
	// keep their zero values, which is the initial state Read expects.
	return rewriter
}
// Close resets the internal state of HTMLResourceURLRewriter, clearing both
// buffers, the script-tracking flag and the token data. It always returns nil.
//
// NOTE(review): the source reader handed to NewHTMLResourceURLRewriter is not
// retained by the struct, so it cannot be closed here; presumably the caller
// keeps ownership of it — confirm.
func (r *HTMLResourceURLRewriter) Close() error {
	r.tokenBuffer.Reset()
	// Drop any partially collected inline script as well, so no stale script
	// state survives the reset.
	r.scriptContentBuffer.Reset()
	r.insideScript = false
	r.currentToken = html.Token{}
	r.currentTokenIndex = 0
	r.currentTokenProcessed = false
	return nil
}
// Read processes the HTML content, rewriting URLs and managing the state of tokens.
// It reads HTML content token by token, rewriting URLs to route through the
// specified proxy, and copies the serialized result into p.
//
// A token whose output is larger than p is handed out over several calls.
// Tokens that produce no output of their own (content buffered inside a
// <script> element) are skipped within the same call, so Read does not return
// (0, nil) for a non-empty p. When the tokenizer is exhausted Read returns
// io.EOF; any other tokenizer error is returned as is.
func (r *HTMLResourceURLRewriter) Read(p []byte) (int, error) {
	for {
		if r.currentToken.Data == "" || r.currentTokenProcessed {
			if err := r.bufferNextToken(); err != nil {
				return 0, err
			}
		}

		n, err := r.tokenBuffer.Read(p)
		if err == io.EOF || r.tokenBuffer.Len() == 0 {
			r.currentTokenProcessed = true
			err = nil // EOF in this context is expected and not an actual error
		}
		if n > 0 || err != nil || len(p) == 0 {
			return n, err
		}
		// Nothing was buffered for this token; move on to the next one.
	}
}

// bufferNextToken advances the tokenizer by one token, patches its URL
// attributes and serializes the output for that token into tokenBuffer.
// It returns the tokenizer's error (io.EOF at end of input) when no further
// token is available.
func (r *HTMLResourceURLRewriter) bufferNextToken() error {
	tokenType := r.tokenizer.Next()
	if tokenType == html.ErrorToken {
		return r.tokenizer.Err()
	}

	// flush the current token into an internal buffer
	// to handle fragmented tokens
	r.currentToken = r.tokenizer.Token()

	// patch tokens with URLs
	hasAttributes := r.currentToken.Type == html.StartTagToken || r.currentToken.Type == html.SelfClosingTagToken
	if hasAttributes {
		patchResourceURL(&r.currentToken, r.baseURL, r.proxyURL)
	}

	r.tokenBuffer.Reset()

	// Token.String escapes text, so script contents are collected separately
	// and unescaped again when the closing </script> tag is reached.
	switch tokenType {
	case html.StartTagToken:
		if r.currentToken.Data == "script" {
			r.insideScript = true
			r.scriptContentBuffer.Reset() // start collecting a new script body
		}
		r.tokenBuffer.WriteString(r.currentToken.String())
	case html.EndTagToken:
		if r.currentToken.Data == "script" {
			r.insideScript = false
			r.tokenBuffer.WriteString(modifyInlineScript(r.scriptContentBuffer))
		}
		r.tokenBuffer.WriteString(r.currentToken.String())
	default:
		if r.insideScript {
			r.scriptContentBuffer.WriteString(r.currentToken.String())
		} else {
			r.tokenBuffer.WriteString(r.currentToken.String())
		}
	}

	// inject <script> right after <head>
	if hasAttributes && r.currentToken.Data == "head" {
		injectScript(r.tokenBuffer, rewriteJSResourceUrlsScript)
	}

	r.currentTokenProcessed = false
	r.currentTokenIndex = 0
	return nil
}
// rewriteJSResourceUrlsScript holds the contents of js_resource_url_rewriter.js,
// embedded at build time. Read passes it to injectScript right after the <head>
// start tag. Example of the rewriting the script is meant to perform:
// fetch("/relative_script.js") -> fetch("http://localhost:8080/relative_script.js")
//
//go:embed js_resource_url_rewriter.js
var rewriteJSResourceUrlsScript string
// injectScript appends script to tokenBuffer wrapped in a <script> element,
// with a newline before and after each tag.
func injectScript(tokenBuffer *bytes.Buffer, script string) {
	const wrapper = "\n<script>\n%s\n</script>\n"
	fmt.Fprintf(tokenBuffer, wrapper, script)
}
// modifyInlineScript returns the collected inline script with HTML entities
// unescaped. The buffer itself is left untouched.
// possible ad-blocking / bypassing opportunity here
func modifyInlineScript(scriptContentBuffer *bytes.Buffer) string {
	escaped := scriptContentBuffer.String()
	return html.UnescapeString(escaped)
}
// handleRootRelativePath rewrites a root-relative URL (one that starts with
// "/") so that it is routed through the proxy: the value is made absolute
// against baseURL's scheme and host, query-escaped, and prefixed with "/".
//
//	/img/a.png -> /https%3A%2F%2Fproxiedsite.com%2Fimg%2Fa.png
//
// The attribute is left untouched when its value does not parse as a URL path.
func handleRootRelativePath(attr *html.Attribute, baseURL *url.URL) {
	// doublecheck this is a valid relative URL
	if _, err := url.Parse("http://localhost.com" + attr.Val); err != nil {
		return
	}

	// No leading "/" here: it would be escaped to "%2F" below and produce
	// "/%2Fhttps...", which does not match the "/<escaped absolute URL>"
	// shape emitted by the other handlers.
	absolute := fmt.Sprintf(
		"%s://%s/%s",
		baseURL.Scheme,
		baseURL.Host,
		strings.TrimPrefix(attr.Val, "/"),
	)
	attr.Val = "/" + url.QueryEscape(absolute)

	log.Printf("root rel url rewritten-> '%s'='%s'", attr.Key, attr.Val)
}
// handleDocumentRelativePath rewrites a document-relative URL (one that is
// relative to the current document's path and does not start with a "/").
// The reference is resolved against baseURL following the usual URL
// resolution rules, query-escaped, and prefixed with "/".
//
// Values that do not trigger a request for another resource are left as they
// are: empty values, fragment-only references such as usemap="#map", values
// that fail to parse, and values that carry their own scheme.
func handleDocumentRelativePath(attr *html.Attribute, baseURL *url.URL) {
	if attr.Val == "" || strings.HasPrefix(attr.Val, "#") {
		return
	}

	ref, err := url.Parse(attr.Val)
	if err != nil || ref.Scheme != "" {
		return
	}

	// ResolveReference handles "../", "./", query-only references and a base
	// path with or without a trailing file name.
	resolved := baseURL.ResolveReference(ref)
	attr.Val = "/" + url.QueryEscape(resolved.String())

	log.Printf("doc rel url rewritten-> '%s'='%s'", attr.Key, attr.Val)
}
// handleProtocolRelativePath rewrites a protocol-relative URL. These start
// with "//", name their own host, and use the same protocol (http or https)
// as the current page.
//
//	//cdn.example.com/a.js -> /https%3A%2F%2Fcdn.example.com%2Fa.js
//
// The attribute is left untouched when the resulting URL does not parse
// (for instance when baseURL has no scheme).
func handleProtocolRelativePath(attr *html.Attribute, baseURL *url.URL) {
	// Only the scheme comes from the page; the host is the one named in the
	// attribute, not baseURL.Host.
	absolute := baseURL.Scheme + ":" + attr.Val
	if _, err := url.Parse(absolute); err != nil {
		return
	}

	attr.Val = "/" + url.QueryEscape(absolute)
	log.Printf("proto rel url rewritten-> '%s'='%s'", attr.Key, attr.Val)
}
// handleAbsolutePath rewrites an absolute http or https URL by query-escaping
// it and prefixing the result with "/". Values that fail to parse, or that
// use any other scheme, are left unchanged. baseURL is not used.
func handleAbsolutePath(attr *html.Attribute, baseURL *url.URL) {
	parsed, err := url.Parse(attr.Val)
	if err != nil {
		return
	}

	switch parsed.Scheme {
	case "http", "https":
		// supported, fall through to the rewrite below
	default:
		return
	}

	escaped := url.QueryEscape(strings.TrimPrefix(attr.Val, "/"))
	attr.Val = "/" + escaped
	log.Printf("abs url rewritten-> '%s'='%s'", attr.Key, attr.Val)
}
// handleSrcSet rewrites every URL in a srcset attribute value. The value is a
// comma-separated list of candidates, each being a URL optionally followed by
// descriptors such as "2x" or "480w". Only the URL of each candidate is
// rewritten; descriptors are kept. The result is re-joined with ", " between
// candidates and single spaces inside a candidate.
//
// Candidates are split on any whitespace, so multi-line srcset values are
// handled, and empty candidates (for example after a trailing comma) are
// dropped instead of being turned into a bogus URL. Candidate URLs with a
// blacklisted scheme are kept as they are.
//
// NOTE(review): URLs that themselves contain a comma are still split apart
// by this simple parser.
func handleSrcSet(attr *html.Attribute, baseURL *url.URL) {
	rawCandidates := strings.Split(attr.Val, ",")
	rewritten := make([]string, 0, len(rawCandidates))

	for _, raw := range rawCandidates {
		fields := strings.Fields(raw)
		if len(fields) == 0 {
			continue
		}

		candidate := &html.Attribute{Key: attr.Key, Val: fields[0]}
		switch {
		case isBlackedlistedScheme(candidate.Val):
			// special URIs that don't make network requests stay untouched
		case strings.HasPrefix(candidate.Val, "//"):
			handleProtocolRelativePath(candidate, baseURL)
		case strings.HasPrefix(candidate.Val, "/"):
			handleRootRelativePath(candidate, baseURL)
		case strings.HasPrefix(candidate.Val, "https://") || strings.HasPrefix(candidate.Val, "http://"):
			handleAbsolutePath(candidate, baseURL)
		default:
			handleDocumentRelativePath(candidate, baseURL)
		}

		fields[0] = candidate.Val
		rewritten = append(rewritten, strings.Join(fields, " "))
	}

	attr.Val = strings.Join(rewritten, ", ")
	log.Printf("srcset url rewritten-> '%s'='%s'", attr.Key, attr.Val)
}
// isBlackedlistedScheme reports whether rawURL starts with a scheme listed in
// schemeBlacklist (for example "data:", "mailto:" or "javascript:").
//
// A value without a colon has no scheme and is never blacklisted, so a
// relative path such as "about" or "data" is still rewritten. The scheme is
// compared case-insensitively and surrounding whitespace is ignored.
func isBlackedlistedScheme(rawURL string) bool {
	colon := strings.IndexByte(rawURL, ':')
	if colon < 0 {
		return false
	}
	scheme := strings.ToLower(strings.TrimSpace(rawURL[:colon]))
	return schemeBlacklist[scheme]
}
// patchResourceURL rewrites, in place, every URL-bearing attribute of token
// so that it is routed through the proxy. baseURL is the URL of the proxied
// document and proxyURL is the URL of the proxy itself.
//
// Attributes are skipped when they are not listed in attributesToRewrite,
// when their value uses a blacklisted scheme, or when the value already
// starts with proxyURL. The remaining values are dispatched on their shape:
// srcset lists, protocol-relative ("//"), root-relative ("/"), absolute
// http(s), and document-relative for everything else.
func patchResourceURL(token *html.Token, baseURL *url.URL, proxyURL string) {
	for i := range token.Attr {
		attr := &token.Attr[i]

		switch {
		// don't touch attributes except for the ones we defined
		case !attributesToRewrite[attr.Key]:
		// don't rewrite special URIs that don't make network requests
		case isBlackedlistedScheme(attr.Val):
		// don't double-overwrite the url; an empty proxyURL is a prefix of
		// every string and must not disable rewriting altogether
		case proxyURL != "" && strings.HasPrefix(attr.Val, proxyURL):
		case attr.Key == "srcset":
			handleSrcSet(attr, baseURL)
		case strings.HasPrefix(attr.Val, "//"):
			handleProtocolRelativePath(attr, baseURL)
		case strings.HasPrefix(attr.Val, "/"):
			handleRootRelativePath(attr, baseURL)
		case strings.HasPrefix(attr.Val, "https://") || strings.HasPrefix(attr.Val, "http://"):
			handleAbsolutePath(attr, baseURL)
		default:
			handleDocumentRelativePath(attr, baseURL)
		}
	}
}

View File

@@ -0,0 +1,167 @@
// Overrides the global fetch and XMLHttpRequest open methods to modify the request URLs.
// Also overrides the attribute setter prototype to modify the request URLs
// fetch("/relative_script.js") -> fetch("http://localhost:8080/relative_script.js")
(() => {
// URI schemes that never go through the proxy (kept in sync with the
// server-side scheme blacklist).
const blacklistedSchemes = [
  "data:",
  "ftp:",
  "mailto:",
  "tel:",
  "file:",
  "blob:",
  "javascript:",
  "about:",
  "magnet:",
  "ws:",
  "wss:",
];

/**
 * Rewrites a URL so that the request is routed through the proxy.
 *
 * The proxied site's origin is taken from the current location, whose path is
 * expected to hold the (possibly percent-encoded) proxied URL.
 *
 * @param {*} url value handed to fetch/XHR/an attribute setter
 * @returns {*} the rewritten URL string, or `url` unchanged when it is empty,
 *   not string-like, uses a blacklisted scheme, is not a valid URL reference,
 *   is already rewritten, or when the proxied origin cannot be determined.
 */
function rewriteURL(url) {
  const oldUrl = url;
  if (!url) return url;
  const isStr = typeof url.startsWith === "function";
  if (!isStr) return url;

  // don't rewrite special URIs (scheme comparison is case-insensitive)
  const lowered = String(url).trimStart().toLowerCase();
  if (blacklistedSchemes.some((scheme) => lowered.startsWith(scheme))) return url;

  const proxyOrigin = globalThis.window.location.origin;

  // Must be resolved before any use below; when the current path does not
  // hold a proxied URL there is nothing to rewrite against.
  let origin;
  try {
    origin = new URL(
      decodeURIComponent(globalThis.window.location.pathname.substring(1)),
    ).origin;
  } catch {
    return url;
  }

  // don't rewrite invalid URIs; the base lets relative references validate
  try {
    new URL(url, origin);
  } catch {
    return url;
  }

  // don't double rewrite
  if (url.startsWith(proxyOrigin)) return url;
  if (url.startsWith(`/${proxyOrigin}`)) return url;
  if (url.startsWith(`/${origin}`)) return url;

  //console.log(`proxychain: origin: ${origin} // proxyOrigin: ${proxyOrigin} // original: ${oldUrl}`)
  // NOTE(review): the "//" branch treats a protocol-relative URL as a path on
  // the proxied origin rather than as a different host — confirm intent.
  if (url.startsWith("//")) {
    url = `/${origin}/${encodeURIComponent(url.substring(2))}`;
  } else if (url.startsWith("/")) {
    url = `/${origin}/${encodeURIComponent(url.substring(1))}`;
  } else if (url.startsWith(origin)) {
    url = `/${encodeURIComponent(url)}`;
  } else if (url.startsWith("http://") || url.startsWith("https://")) {
    url = `/${proxyOrigin}/${encodeURIComponent(url)}`;
  }
  console.log(`proxychain: rewrite JS URL: ${oldUrl} -> ${url}`);
  return url;
}
// monkey patch fetch: the first argument goes through rewriteURL before the
// original fetch is invoked; init is passed through untouched
const oldFetch = globalThis.fetch;
globalThis.fetch = async (url, init) => oldFetch(rewriteURL(url), init);
// monkey patch xmlhttprequest: the request URL is supplied to open(), so that
// is where it is rewritten
const oldOpen = XMLHttpRequest.prototype.open;
XMLHttpRequest.prototype.open = function(method, url, async = true, user = null, password = null) {
  return oldOpen.call(this, method, rewriteURL(url), async, user, password);
};
// send() takes only the request body and carries no URL, so the body is
// forwarded as is
const oldSend = XMLHttpRequest.prototype.send;
XMLHttpRequest.prototype.send = function(body) {
  return oldSend.call(this, body);
};
// monkey patch service worker registration
// ServiceWorkerContainer is only exposed in secure contexts; referencing it
// unguarded elsewhere would throw and abort the patches that follow.
if (typeof ServiceWorkerContainer !== "undefined") {
  const oldRegister = ServiceWorkerContainer.prototype.register;
  ServiceWorkerContainer.prototype.register = function(scriptURL, options) {
    return oldRegister.call(this, rewriteURL(scriptURL), options);
  };
}
// monkey patch URL.toString() method so the serialized URL is rewritten
const oldToString = URL.prototype.toString;
URL.prototype.toString = function() {
  let originalURL = oldToString.call(this);
  return rewriteURL(originalURL);
};
// monkey patch URL.toJSON() method (used by JSON.stringify) the same way
const oldToJson = URL.prototype.toJSON;
URL.prototype.toJSON = function() {
  let originalURL = oldToJson.call(this);
  return rewriteURL(originalURL);
};
// Monkey patch URL.href getter and setter: values read from and written to
// href both pass through rewriteURL, delegating to the native accessor pair
const originalHrefDescriptor = Object.getOwnPropertyDescriptor(URL.prototype, 'href');
Object.defineProperty(URL.prototype, 'href', {
  get() {
    const nativeHref = originalHrefDescriptor.get.call(this);
    return rewriteURL(nativeHref);
  },
  set(newValue) {
    const rewritten = rewriteURL(newValue);
    originalHrefDescriptor.set.call(this, rewritten);
  },
});
// Monkey patch the URL-bearing property accessors of DOM elements so that
// values assigned from scripts (el.src = ...) are rewritten as well
const elements = [
  { tag: 'a', attribute: 'href' },
  { tag: 'img', attribute: 'src' },
  // { tag: 'img', attribute: 'srcset' }, // TODO: handle srcset
  { tag: 'script', attribute: 'src' },
  { tag: 'link', attribute: 'href' },
  { tag: 'link', attribute: 'icon' },
  { tag: 'iframe', attribute: 'src' },
  { tag: 'audio', attribute: 'src' },
  { tag: 'video', attribute: 'src' },
  { tag: 'source', attribute: 'src' },
  // { tag: 'source', attribute: 'srcset' }, // TODO: handle srcset
  { tag: 'embed', attribute: 'src' },
  { tag: 'embed', attribute: 'pluginspage' },
  { tag: 'html', attribute: 'manifest' },
  { tag: 'object', attribute: 'src' },
  { tag: 'input', attribute: 'src' },
  { tag: 'track', attribute: 'src' },
  { tag: 'form', attribute: 'action' },
  { tag: 'area', attribute: 'href' },
  { tag: 'base', attribute: 'href' },
  { tag: 'blockquote', attribute: 'cite' },
  { tag: 'del', attribute: 'cite' },
  { tag: 'ins', attribute: 'cite' },
  { tag: 'q', attribute: 'cite' },
  { tag: 'button', attribute: 'formaction' },
  { tag: 'input', attribute: 'formaction' },
  { tag: 'meta', attribute: 'content' },
  { tag: 'object', attribute: 'data' },
];
elements.forEach(({ tag, attribute }) => {
  // The accessor may live on an ancestor prototype (e.g. `src` of <audio> and
  // <video> is defined on HTMLMediaElement), so walk up the chain until a
  // descriptor is found. Entries with no such property are skipped.
  let proto = document.createElement(tag).constructor.prototype;
  let descriptor = Object.getOwnPropertyDescriptor(proto, attribute);
  while (!descriptor && proto) {
    proto = Object.getPrototypeOf(proto);
    descriptor = proto ? Object.getOwnPropertyDescriptor(proto, attribute) : undefined;
  }
  if (!descriptor || !descriptor.set) return;

  // calling rewriteURL can end up calling a patched accessor again, leading
  // to a recursive loop and a "Maximum call stack size exceeded" error, so
  // each accessor is guarded with a per-element flag
  const isRewritingSetKey = Symbol.for('isRewritingSet');
  const isRewritingGetKey = Symbol.for('isRewritingGet');

  Object.defineProperty(proto, attribute, {
    ...descriptor,
    set(value) {
      if (this[isRewritingSetKey]) {
        // Directly set the value without rewriting
        descriptor.set.call(this, value);
        return;
      }
      this[isRewritingSetKey] = true;
      try {
        descriptor.set.call(this, rewriteURL(value));
      } finally {
        // always clear the flag, otherwise one failure would disable
        // rewriting on this element for good
        this[isRewritingSetKey] = false;
      }
    },
    get() {
      if (this[isRewritingGetKey]) {
        return descriptor.get.call(this);
      }
      this[isRewritingGetKey] = true;
      try {
        let oldURL = descriptor.get.call(this);
        return rewriteURL(oldURL);
      } finally {
        this[isRewritingGetKey] = false;
      }
    }
  });
});
})();