Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
86 changes: 56 additions & 30 deletions crates/js/lib/src/integrations/gpt/script_guard.ts
Original file line number Diff line number Diff line change
Expand Up @@ -199,43 +199,69 @@ function rewriteLinkHref(
// Layer 1: document.write / document.writeln interception
// ---------------------------------------------------------------------------

/**
 * Regex that matches `src="..."` or `src='...'` attributes inside a
 * `<script>` tag where the URL text mentions the GPT domain token. We capture:
 * 1. Everything before the URL (from `<script` through `src=` and the opening quote)
 * 2. The URL itself
 * 3. Everything after the URL (the closing quote)
 *
 * This handles the HTML that GPT's `Xd` function produces, e.g.:
 * `<script src="https://securepubads.g.doubleclick.net/pagead/…/pubads_impl.js" …></script>`
 *
 * Hostname verification still happens in {@link maybeRewrite}, so URLs that merely
 * contain the token in query text are left unchanged.
 *
 * Notes:
 * - Only quoted attribute values match; an unquoted `src=…` is not captured.
 * - The opening and closing quote characters are matched independently
 *   (`["']` on both sides), so they are not required to be the same character.
 * - The pattern carries the `g` flag, which makes `lastIndex` stateful across
 *   `test`/`exec` calls; reset `lastIndex` to 0 before reusing it on a new string.
 */
const SCRIPT_SRC_RE =
/(<script\b[^>]*?\bsrc\s*=\s*["'])([^"']*securepubads\.g\.doubleclick\.net[^"']*)(["'])/gi;

/**
 * Rewrite GPT domain URLs inside raw HTML strings passed to
 * `document.write` / `document.writeln`.
 *
 * Uses `DOMParser` for robust HTML parsing instead of regex so that
 * edge-cases (unquoted attributes, unusual spacing, mixed quote styles,
 * HTML-entity-encoded query parameters) are handled by the browser's
 * native parser. GPT script `src` attributes are mutated in the parsed
 * DOM and the result is serialized back to HTML.
 *
 * If the GPT domain is present in the HTML but `DOMParser` is
 * unavailable or throws, the function **fails closed** (returns an
 * empty string) rather than passing the unproxied URL through.
 *
 * Non-GPT HTML is always passed through unchanged regardless of
 * `DOMParser` availability.
 *
 * @param html - Raw markup handed to `document.write` / `document.writeln`.
 * @returns The original `html` when it does not mention the GPT domain or
 *   when no `<script src>` ended up being rewritten; the re-serialized
 *   `<head>` + `<body>` markup when at least one `src` was rewritten; an
 *   empty string when the GPT domain is mentioned but the markup cannot be
 *   parsed.
 */
function rewriteHtmlString(html: string): string {
  // Fast-path: if the HTML does not reference the GPT domain at all,
  // pass it through unchanged. This avoids unnecessary DOMParser
  // overhead and, critically, prevents non-GPT document.write calls
  // from being silently dropped when DOMParser is unavailable.
  // Hostnames are case-insensitive, so compare in lower case; otherwise a
  // differently-cased host would skip the guard entirely.
  if (!html.toLowerCase().includes(GPT_DOMAIN.toLowerCase())) return html;

  if (typeof DOMParser === 'undefined') {
    log.warn(
      `${LOG_PREFIX}: DOMParser unavailable, blocking document.write HTML that references GPT domain`
    );
    return '';
  }

  try {
    const doc = new DOMParser().parseFromString(html, 'text/html');
    const scripts = doc.querySelectorAll('script[src]');
    let didRewriteAny = false;

    for (const script of scripts) {
      const rawSrc = script.getAttribute('src') ?? '';
      const { url: rewrittenUrl, didRewrite } = maybeRewrite(rawSrc);
      if (!didRewrite) continue;

      log.info(`${LOG_PREFIX}: rewriting document.write script src`, {
        original: rawSrc,
        rewritten: rewrittenUrl,
      });
      // Mutate the parsed DOM so that HTML-entity-encoded attribute
      // values (e.g. `&amp;`) are handled correctly. Serializing the
      // DOM back to HTML avoids the mismatch between decoded
      // `getAttribute()` values and the raw HTML string.
      script.setAttribute('src', rewrittenUrl);
      didRewriteAny = true;
    }

    // Nothing was rewritten (e.g. the domain only appeared in text or in a
    // query string): hand back the caller's exact markup instead of a
    // re-serialized copy.
    if (!didRewriteAny) return html;

    // DOMParser wraps input in <html><head>…</head><body>…</body></html>.
    // Bare <script> tags land in <head>, so we serialize from both.
    return (doc.head?.innerHTML ?? '') + (doc.body?.innerHTML ?? '');
  } catch (err) {
    log.warn(
      `${LOG_PREFIX}: failed to parse document.write HTML containing GPT domain, blocking`,
      err
    );
    return '';
  }
}

function installDocumentWritePatch(): void {
Expand Down
192 changes: 192 additions & 0 deletions crates/js/lib/test/integrations/gpt/script_guard.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -153,4 +153,196 @@ describe('GPT script guard', () => {
'/integrations/gpt/pagead/managed/js/gpt/m202603020101/pubads_impl.js?foo=bar'
);
});

// -----------------------------------------------------------------------
// document.write edge-cases (DOMParser-based rewriting)
// -----------------------------------------------------------------------

it('rewrites document.write script src with single-quoted attribute', () => {
  // Replace the native writer with a spy so the HTML that the guard
  // forwards can be inspected.
  const nativeWriteSpy = vi.fn<(...args: string[]) => void>();
  document.write = nativeWriteSpy as unknown as typeof document.write;

  installGptGuard();

  document.write(
    "<script src='https://securepubads.g.doubleclick.net/pagead/managed/js/gpt/current/pubads_impl.js'></script>"
  );

  // The guard must forward exactly one write, pointing at the first-party
  // proxy path rather than the GPT host.
  expect(nativeWriteSpy).toHaveBeenCalledTimes(1);
  const [writtenHtml] = nativeWriteSpy.mock.calls[0] ?? [];
  expect(writtenHtml).toContain(window.location.host);
  expect(writtenHtml).toContain('/integrations/gpt/pagead/managed/js/gpt/current/pubads_impl.js');
  expect(writtenHtml).not.toContain('securepubads.g.doubleclick.net');
});

it('rewrites document.write script src with extra whitespace around =', () => {
  // Replace the native writer with a spy so the HTML that the guard
  // forwards can be inspected.
  const nativeWriteSpy = vi.fn<(...args: string[]) => void>();
  document.write = nativeWriteSpy as unknown as typeof document.write;

  installGptGuard();

  // Spaces on both sides of `=` and before `>` are legal HTML and must not
  // let the GPT URL slip past the guard.
  document.write(
    '<script src = "https://securepubads.g.doubleclick.net/pagead/managed/js/gpt/current/pubads_impl.js" ></script>'
  );

  expect(nativeWriteSpy).toHaveBeenCalledTimes(1);
  const [writtenHtml] = nativeWriteSpy.mock.calls[0] ?? [];
  expect(writtenHtml).toContain(window.location.host);
  expect(writtenHtml).toContain('/integrations/gpt/pagead/managed/js/gpt/current/pubads_impl.js');
  expect(writtenHtml).not.toContain('securepubads.g.doubleclick.net');
});

it('rewrites multiple script tags in a single document.write call', () => {
  // Replace the native writer with a spy so the HTML that the guard
  // forwards can be inspected.
  const nativeWriteSpy = vi.fn<(...args: string[]) => void>();
  document.write = nativeWriteSpy as unknown as typeof document.write;

  installGptGuard();

  document.write(
    '<script src="https://securepubads.g.doubleclick.net/pagead/a.js"></script>' +
      '<script src="https://securepubads.g.doubleclick.net/pagead/b.js"></script>'
  );

  // Both tags must be rewritten within the single forwarded write; no
  // reference to the GPT host may remain.
  expect(nativeWriteSpy).toHaveBeenCalledTimes(1);
  const [writtenHtml] = nativeWriteSpy.mock.calls[0] ?? [];
  expect(writtenHtml).toContain('/integrations/gpt/pagead/a.js');
  expect(writtenHtml).toContain('/integrations/gpt/pagead/b.js');
  expect(writtenHtml).not.toContain('securepubads.g.doubleclick.net');
});

it('rewrites document.writeln the same as document.write', () => {
  // Spy on `writeln` specifically: this test covers the second patched
  // entry point.
  const nativeWritelnSpy = vi.fn<(...args: string[]) => void>();
  document.writeln = nativeWritelnSpy as unknown as typeof document.writeln;

  installGptGuard();

  document.writeln(
    '<script src="https://securepubads.g.doubleclick.net/pagead/managed/js/gpt/current/pubads_impl.js"></script>'
  );

  expect(nativeWritelnSpy).toHaveBeenCalledTimes(1);
  const [writtenHtml] = nativeWritelnSpy.mock.calls[0] ?? [];
  expect(writtenHtml).toContain(window.location.host);
  expect(writtenHtml).toContain('/integrations/gpt/pagead/managed/js/gpt/current/pubads_impl.js');
  expect(writtenHtml).not.toContain('securepubads.g.doubleclick.net');
});

it('passes through HTML with no GPT domain reference unchanged', () => {
  // Stand-in for the native writer; records whatever the guard forwards.
  const writeSpy = vi.fn<(...args: string[]) => void>();
  document.write = writeSpy as unknown as typeof document.write;
  installGptGuard();

  // Third-party markup that never mentions the GPT host.
  const unrelatedHtml = '<script src="https://example.com/tracker.js"></script>';
  document.write(unrelatedHtml);

  // The exact input string must reach the native writer.
  expect(writeSpy).toHaveBeenCalledWith(unrelatedHtml);
});

it('rewrites protocol-relative GPT URLs in document.write', () => {
  // Replace the native writer with a spy so the HTML that the guard
  // forwards can be inspected.
  const nativeWriteSpy = vi.fn<(...args: string[]) => void>();
  document.write = nativeWriteSpy as unknown as typeof document.write;

  installGptGuard();

  // `//host/path` inherits the page's scheme; it must be rewritten like an
  // absolute URL.
  document.write(
    '<script src="//securepubads.g.doubleclick.net/pagead/managed/js/gpt/current/pubads_impl.js"></script>'
  );

  expect(nativeWriteSpy).toHaveBeenCalledTimes(1);
  const [writtenHtml] = nativeWriteSpy.mock.calls[0] ?? [];
  expect(writtenHtml).toContain(window.location.host);
  expect(writtenHtml).toContain('/integrations/gpt/pagead/managed/js/gpt/current/pubads_impl.js');
  expect(writtenHtml).not.toContain('securepubads.g.doubleclick.net');
});

// -----------------------------------------------------------------------
// Fail-closed behaviour
// -----------------------------------------------------------------------

it('fails closed when DOMParser is unavailable', () => {
  // Stand-in for the native writer; records whatever the guard forwards.
  const writeSpy = vi.fn<(...args: string[]) => void>();
  document.write = writeSpy as unknown as typeof document.write;

  // Keep a handle on the real constructor so it can be put back afterwards.
  const savedDOMParser = globalThis.DOMParser;
  // @ts-expect-error — simulating an environment without DOMParser
  delete globalThis.DOMParser;

  try {
    installGptGuard();

    document.write('<script src="https://securepubads.g.doubleclick.net/pagead/a.js"></script>');

    // The write still happens once, but with an empty payload.
    expect(writeSpy).toHaveBeenCalledTimes(1);
    expect(writeSpy).toHaveBeenCalledWith('');
  } finally {
    // Restore the global even when an assertion above throws.
    globalThis.DOMParser = savedDOMParser;
  }
});

it('fails closed when DOMParser throws', () => {
  // Stand-in for the native writer; records whatever the guard forwards.
  const writeSpy = vi.fn<(...args: string[]) => void>();
  document.write = writeSpy as unknown as typeof document.write;

  // Swap in a parser whose only method always throws.
  const savedDOMParser = globalThis.DOMParser;
  // @ts-expect-error — injecting a broken DOMParser
  globalThis.DOMParser = class {
    parseFromString() {
      throw new Error('boom');
    }
  };

  try {
    installGptGuard();

    document.write('<script src="https://securepubads.g.doubleclick.net/pagead/a.js"></script>');

    // The write still happens once, but with an empty payload.
    expect(writeSpy).toHaveBeenCalledTimes(1);
    expect(writeSpy).toHaveBeenCalledWith('');
  } finally {
    // Restore the global even when an assertion above throws.
    globalThis.DOMParser = savedDOMParser;
  }
});

it('passes non-GPT HTML through unchanged when DOMParser is unavailable', () => {
  // Stand-in for the native writer; records whatever the guard forwards.
  const writeSpy = vi.fn<(...args: string[]) => void>();
  document.write = writeSpy as unknown as typeof document.write;

  // Keep a handle on the real constructor so it can be put back afterwards.
  const savedDOMParser = globalThis.DOMParser;
  // @ts-expect-error — simulating an environment without DOMParser
  delete globalThis.DOMParser;

  try {
    installGptGuard();

    // Markup with no GPT reference must not be affected by the missing parser.
    const plainHtml = '<p>Hello, world!</p>';
    document.write(plainHtml);

    expect(writeSpy).toHaveBeenCalledTimes(1);
    expect(writeSpy).toHaveBeenCalledWith(plainHtml);
  } finally {
    // Restore the global even when an assertion above throws.
    globalThis.DOMParser = savedDOMParser;
  }
});

// -----------------------------------------------------------------------
// HTML-entity-encoded URLs
// -----------------------------------------------------------------------

it('rewrites GPT URLs that contain HTML-escaped entities like &amp;', () => {
  // Replace the native writer with a spy so the HTML that the guard
  // forwards can be inspected.
  const nativeWriteSpy = vi.fn<(...args: string[]) => void>();
  document.write = nativeWriteSpy as unknown as typeof document.write;

  installGptGuard();

  document.write(
    '<script src="https://securepubads.g.doubleclick.net/pagead/managed/js/gpt/current/pubads_impl.js?x=1&amp;y=2"></script>'
  );

  expect(nativeWriteSpy).toHaveBeenCalledTimes(1);
  const [writtenHtml] = nativeWriteSpy.mock.calls[0] ?? [];
  expect(writtenHtml).toContain(window.location.host);
  expect(writtenHtml).toContain('/integrations/gpt/pagead/managed/js/gpt/current/pubads_impl.js');
  expect(writtenHtml).not.toContain('securepubads.g.doubleclick.net');
  // The query string must survive the parse → rewrite → serialize round trip
  // with the ampersand escaped exactly once.
  expect(writtenHtml).toContain('x=1&amp;y=2');
  expect(writtenHtml).not.toContain('&amp;amp;');
});
});
Loading