Skip to content

Commit 303975f

Browse files
Replace regex with DOMParser in GPT document.write rewriting
1 parent 2759729 commit 303975f

2 files changed

Lines changed: 193 additions & 30 deletions

File tree

crates/js/lib/src/integrations/gpt/script_guard.ts

Lines changed: 43 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -190,43 +190,56 @@ function rewriteLinkHref(element: HTMLLinkElement): void {
190190
// Layer 1: document.write / document.writeln interception
191191
// ---------------------------------------------------------------------------
192192

193-
/**
194-
* Regex that matches `src="..."` or `src='...'` attributes inside a
195-
* `<script>` tag where the URL text mentions the GPT domain token. We capture:
196-
* 1. Everything before the URL (the `src=` prefix with quote)
197-
* 2. The URL itself
198-
* 3. Everything after the URL (the closing quote)
199-
*
200-
* This handles the HTML that GPT's `Xd` function produces, e.g.:
201-
* `<script src="https://securepubads.g.doubleclick.net/pagead/…/pubads_impl.js" …></script>`
202-
*
203-
* Hostname verification still happens in [`maybeRewrite`], so URLs that merely
204-
* contain the token in query text are left unchanged.
205-
*/
206-
const SCRIPT_SRC_RE =
207-
/(<script\b[^>]*?\bsrc\s*=\s*["'])([^"']*securepubads\.g\.doubleclick\.net[^"']*)(["'])/gi;
208-
209193
/**
 * Rewrite GPT domain URLs inside raw HTML strings passed to
 * `document.write` / `document.writeln`.
 *
 * The HTML is parsed with `DOMParser` so that attribute edge-cases
 * (unquoted values, unusual spacing, mixed quote styles) are resolved by
 * the browser's own parser. Every `<script src>` value that `maybeRewrite`
 * accepts is then swapped inside the original string, so the surrounding
 * markup is preserved verbatim.
 *
 * `getAttribute` returns the *decoded* attribute value, which does not
 * necessarily occur literally in the markup (`&amp;` in the source is
 * reported as `&`). Both the literal and the `&amp;`-escaped spelling are
 * therefore replaced, and the result is re-parsed to confirm that none of
 * the original script sources survived.
 *
 * The function **fails closed** (returns an empty string) whenever the GPT
 * domain is present but `DOMParser` is unavailable, parsing throws, or a
 * rewritable script source could not be located in the markup.
 *
 * @param html - Raw markup handed to `document.write` / `document.writeln`.
 * @returns The markup with GPT script sources rewritten; the input itself
 *   when it does not mention the GPT domain or no script source is
 *   rewritten; `''` when the markup is blocked.
 */
function rewriteHtmlString(html: string): string {
  // Fast path: nothing to do when the GPT domain isn't mentioned at all.
  if (!html.includes(GPT_DOMAIN)) return html;

  if (typeof DOMParser === 'undefined') {
    log.warn(
      `${LOG_PREFIX}: DOMParser unavailable, blocking document.write HTML that references GPT domain`
    );
    return '';
  }

  try {
    const parser = new DOMParser();

    // Collect each distinct rewritable source once: decoded src -> proxied URL.
    const rewrites = new Map<string, string>();
    for (const script of parser.parseFromString(html, 'text/html').querySelectorAll('script[src]')) {
      const rawSrc = script.getAttribute('src') ?? '';
      if (rawSrc === '' || rewrites.has(rawSrc)) continue;

      const { url: rewrittenUrl, didRewrite } = maybeRewrite(rawSrc);
      if (didRewrite) rewrites.set(rawSrc, rewrittenUrl);
    }
    if (rewrites.size === 0) return html;

    // Literal substitution: split/join never interprets `$&`, `$$`, ... in
    // the replacement, unlike `replace`/`replaceAll` with a string argument.
    const swapLiteral = (haystack: string, needle: string, replacement: string): string =>
      haystack.split(needle).join(replacement);
    const escapeAmp = (value: string): string => swapLiteral(value, '&', '&amp;');

    // Longest source first, so a URL that is a prefix of another one cannot
    // clobber the longer URL before it has been handled itself.
    const ordered = [...rewrites].sort(([a], [b]) => b.length - a.length);

    let result = html;
    for (const [rawSrc, rewrittenUrl] of ordered) {
      log.info(`${LOG_PREFIX}: rewriting document.write script src`, {
        original: rawSrc,
        rewritten: rewrittenUrl,
      });
      result = swapLiteral(result, rawSrc, rewrittenUrl);
      if (rawSrc.includes('&')) {
        // The markup may spell `&` as `&amp;`; keep that spelling in the output.
        result = swapLiteral(result, escapeAmp(rawSrc), escapeAmp(rewrittenUrl));
      }
    }

    // Fail closed: if any original source is still what the parser sees
    // (e.g. it was written with a character reference we did not match),
    // the string swap missed it and the unproxied URL must not get through.
    const survivors = Array.from(
      parser.parseFromString(result, 'text/html').querySelectorAll('script[src]')
    ).some((script) => rewrites.has(script.getAttribute('src') ?? ''));
    if (survivors) {
      log.warn(
        `${LOG_PREFIX}: could not rewrite every GPT script src in document.write HTML, blocking`
      );
      return '';
    }

    return result;
  } catch (err) {
    log.warn(
      `${LOG_PREFIX}: failed to parse document.write HTML containing GPT domain, blocking`,
      err
    );
    return '';
  }
}
231244

232245
function installDocumentWritePatch(): void {

crates/js/lib/test/integrations/gpt/script_guard.test.ts

Lines changed: 150 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -155,4 +155,154 @@ describe('GPT script guard', () => {
155155
'/integrations/gpt/pagead/managed/js/gpt/current/pubads_impl.js?foo=bar'
156156
);
157157
});
158+
159+
// -----------------------------------------------------------------------
160+
// document.write edge-cases (DOMParser-based rewriting)
161+
// -----------------------------------------------------------------------
162+
163+
// A script tag whose `src` uses single quotes is forwarded to the native
// `document.write` with the first-party proxy URL instead of the GPT host.
it('rewrites document.write script src with single-quoted attribute', () => {
  const writeSpy = vi.fn<(...args: string[]) => void>();
  document.write = writeSpy as unknown as typeof document.write;
  installGptGuard();

  const markup =
    "<script src='https://securepubads.g.doubleclick.net/pagead/managed/js/gpt/current/pubads_impl.js'></script>";
  document.write(markup);

  expect(writeSpy).toHaveBeenCalledTimes(1);
  const forwarded = writeSpy.mock.calls[0]?.[0];
  expect(forwarded).toContain(window.location.host);
  expect(forwarded).toContain('/integrations/gpt/pagead/managed/js/gpt/current/pubads_impl.js');
  expect(forwarded).not.toContain('securepubads.g.doubleclick.net');
});
179+
180+
// Whitespace on both sides of `=` (and before `>`) must not prevent the
// script source from being rewritten.
it('rewrites document.write script src with extra whitespace around =', () => {
  const writeSpy = vi.fn<(...args: string[]) => void>();
  document.write = writeSpy as unknown as typeof document.write;
  installGptGuard();

  const markup =
    '<script src = "https://securepubads.g.doubleclick.net/pagead/managed/js/gpt/current/pubads_impl.js" ></script>';
  document.write(markup);

  expect(writeSpy).toHaveBeenCalledTimes(1);
  const forwarded = writeSpy.mock.calls[0]?.[0];
  expect(forwarded).toContain(window.location.host);
  expect(forwarded).toContain('/integrations/gpt/pagead/managed/js/gpt/current/pubads_impl.js');
  expect(forwarded).not.toContain('securepubads.g.doubleclick.net');
});
196+
197+
// Two GPT script tags written in one call are both rewritten, and the call
// is still forwarded exactly once.
it('rewrites multiple script tags in a single document.write call', () => {
  const writeSpy = vi.fn<(...args: string[]) => void>();
  document.write = writeSpy as unknown as typeof document.write;
  installGptGuard();

  const firstTag = '<script src="https://securepubads.g.doubleclick.net/pagead/a.js"></script>';
  const secondTag = '<script src="https://securepubads.g.doubleclick.net/pagead/b.js"></script>';
  document.write(firstTag + secondTag);

  expect(writeSpy).toHaveBeenCalledTimes(1);
  const forwarded = writeSpy.mock.calls[0]?.[0];
  expect(forwarded).toContain('/integrations/gpt/pagead/a.js');
  expect(forwarded).toContain('/integrations/gpt/pagead/b.js');
  expect(forwarded).not.toContain('securepubads.g.doubleclick.net');
});
214+
215+
// `document.writeln` goes through the same rewriting as `document.write`.
it('rewrites document.writeln the same as document.write', () => {
  const writelnSpy = vi.fn<(...args: string[]) => void>();
  document.writeln = writelnSpy as unknown as typeof document.writeln;
  installGptGuard();

  const markup =
    '<script src="https://securepubads.g.doubleclick.net/pagead/managed/js/gpt/current/pubads_impl.js"></script>';
  document.writeln(markup);

  expect(writelnSpy).toHaveBeenCalledTimes(1);
  const forwarded = writelnSpy.mock.calls[0]?.[0];
  expect(forwarded).toContain(window.location.host);
  expect(forwarded).toContain('/integrations/gpt/pagead/managed/js/gpt/current/pubads_impl.js');
  expect(forwarded).not.toContain('securepubads.g.doubleclick.net');
});
231+
232+
// Markup that never mentions the GPT domain reaches the native
// `document.write` exactly as it was supplied.
it('passes through HTML with no GPT domain reference unchanged', () => {
  const writeSpy = vi.fn<(...args: string[]) => void>();
  document.write = writeSpy as unknown as typeof document.write;
  installGptGuard();

  const unrelatedMarkup = '<script src="https://example.com/tracker.js"></script>';
  document.write(unrelatedMarkup);

  expect(writeSpy).toHaveBeenCalledWith(unrelatedMarkup);
});
243+
244+
// A protocol-relative (`//host/...`) GPT script source is rewritten too.
it('rewrites protocol-relative GPT URLs in document.write', () => {
  const writeSpy = vi.fn<(...args: string[]) => void>();
  document.write = writeSpy as unknown as typeof document.write;
  installGptGuard();

  const markup =
    '<script src="//securepubads.g.doubleclick.net/pagead/managed/js/gpt/current/pubads_impl.js"></script>';
  document.write(markup);

  expect(writeSpy).toHaveBeenCalledTimes(1);
  const forwarded = writeSpy.mock.calls[0]?.[0];
  expect(forwarded).toContain(window.location.host);
  expect(forwarded).toContain('/integrations/gpt/pagead/managed/js/gpt/current/pubads_impl.js');
  expect(forwarded).not.toContain('securepubads.g.doubleclick.net');
});
260+
261+
// -----------------------------------------------------------------------
262+
// Fail-closed behaviour
263+
// -----------------------------------------------------------------------
264+
265+
// With the global `DOMParser` removed, GPT markup is replaced by an empty
// string before it reaches the native `document.write`. The global is put
// back afterwards regardless of the outcome.
it('fails closed when DOMParser is unavailable', () => {
  const writeSpy = vi.fn<(...args: string[]) => void>();
  document.write = writeSpy as unknown as typeof document.write;

  const savedParser = globalThis.DOMParser;
  // @ts-expect-error — simulating an environment without DOMParser
  delete globalThis.DOMParser;

  try {
    installGptGuard();

    const markup = '<script src="https://securepubads.g.doubleclick.net/pagead/a.js"></script>';
    document.write(markup);

    expect(writeSpy).toHaveBeenCalledTimes(1);
    expect(writeSpy).toHaveBeenCalledWith('');
  } finally {
    globalThis.DOMParser = savedParser;
  }
});
284+
285+
// When `parseFromString` throws, GPT markup is replaced by an empty string
// before it reaches the native `document.write`. The real `DOMParser` is put
// back afterwards regardless of the outcome.
it('fails closed when DOMParser throws', () => {
  const writeSpy = vi.fn<(...args: string[]) => void>();
  document.write = writeSpy as unknown as typeof document.write;

  const savedParser = globalThis.DOMParser;
  // @ts-expect-error — injecting a broken DOMParser
  globalThis.DOMParser = class {
    parseFromString() {
      throw new Error('boom');
    }
  };

  try {
    installGptGuard();

    const markup = '<script src="https://securepubads.g.doubleclick.net/pagead/a.js"></script>';
    document.write(markup);

    expect(writeSpy).toHaveBeenCalledTimes(1);
    expect(writeSpy).toHaveBeenCalledWith('');
  } finally {
    globalThis.DOMParser = savedParser;
  }
});
158308
});

0 commit comments

Comments
 (0)