From 058260897afa47329a51005b0f8940dd47ba9259 Mon Sep 17 00:00:00 2001
From: Ilya Kreymer
Date: Sun, 13 Apr 2025 11:35:39 +0200
Subject: [PATCH 1/3] direct fetch optimization: add 'skipDirectFetchByExt' to
 skip direct fetch and just load via the browser

---
 src/crawler.ts | 20 +++++++++++++++++++-
 1 file changed, 19 insertions(+), 1 deletion(-)

diff --git a/src/crawler.ts b/src/crawler.ts
index 2b46b6b9..213717ad 100644
--- a/src/crawler.ts
+++ b/src/crawler.ts
@@ -1012,6 +1012,24 @@ self.__bx_behaviors.selectMainBehavior();
     return "";
   }
 
+  skipDirectFetchByExt(url: string) {
+    const urlFull = new URL(url);
+    const extParts = urlFull.pathname.split(".");
+    if (extParts.length <= 1) {
+      return true;
+    }
+    const ext = extParts[1];
+    if (["html", "htm", "asp", "php"].includes(ext)) {
+      return true;
+    }
+
+    if (["pdf", "xml", "jpg", "webm", "docx", "mp4", "zip"].includes(ext)) {
+      return false;
+    }
+
+    return false;
+  }
+
   async crawlPage(opts: WorkerState): Promise {
     await this.writeStats();
 
@@ -1033,7 +1051,7 @@ self.__bx_behaviors.selectMainBehavior();
     data.logDetails = logDetails;
     data.workerid = workerid;
 
-    if (recorder) {
+    if (recorder && !this.skipDirectFetchByExt(url)) {
       try {
         const headers = auth
           ? { Authorization: auth, ...this.headers }

From cc1b52bde9d221075913b086ab3f393dee88cfe0 Mon Sep 17 00:00:00 2001
From: Ilya Kreymer
Date: Wed, 23 Apr 2025 13:38:43 -0700
Subject: [PATCH 2/3] optimization: only do direct fetch if filename ends in
 known extension, otherwise just load in browser

---
 src/crawler.ts        | 28 +++++++++++++++++++---------
 src/util/constants.ts | 12 ++++++++++++
 2 files changed, 31 insertions(+), 9 deletions(-)

diff --git a/src/crawler.ts b/src/crawler.ts
index 213717ad..ad06dc74 100644
--- a/src/crawler.ts
+++ b/src/crawler.ts
@@ -47,6 +47,7 @@ import {
   ExitCodes,
   InterruptReason,
   BxFunctionBindings,
+  DIRECT_FETCH_EXT,
 } from "./util/constants.js";
 
 import { AdBlockRules, BlockRuleDecl, BlockRules } from "./util/blockrules.js";
@@ -1012,19 +1013,28 @@ self.__bx_behaviors.selectMainBehavior();
     return "";
   }
 
-  skipDirectFetchByExt(url: string) {
+  /**
+   * Decide from the URL alone whether a direct fetch should be attempted
+   * before loading the page in the browser.
+   *
+   * @param url absolute URL of the page about to be crawled
+   * @returns true only if the path ends in an extension listed in
+   *   DIRECT_FETCH_EXT; false if there is no extension or it is not listed
+   * @throws TypeError if url cannot be parsed by the URL constructor
+   */
+  shouldDirectFetchByExt(url: string) {
     const urlFull = new URL(url);
     const extParts = urlFull.pathname.split(".");
     if (extParts.length <= 1) {
-      return true;
+      return false;
     }
-    const ext = extParts[1];
-    if (["html", "htm", "asp", "php"].includes(ext)) {
-      return true;
-    }
+    // the extension is the last dot-separated part, not the second one
+    // (e.g. "/v1.2/file.pdf" or "/archive.tar.zip"); match case-insensitively
+    const ext = extParts[extParts.length - 1].toLowerCase();
 
-    if (["pdf", "xml", "jpg", "webm", "docx", "mp4", "zip"].includes(ext)) {
-      return false;
+    // known files that should be direct fetched
+    if (DIRECT_FETCH_EXT.includes(ext)) {
+      return true;
     }
 
     return false;
@@ -1051,7 +1061,7 @@ self.__bx_behaviors.selectMainBehavior();
     data.logDetails = logDetails;
     data.workerid = workerid;
 
-    if (recorder && !this.skipDirectFetchByExt(url)) {
+    if (recorder && this.shouldDirectFetchByExt(url)) {
       try {
         const headers = auth
           ? { Authorization: auth, ...this.headers }
diff --git a/src/util/constants.ts b/src/util/constants.ts
index d6185d4e..b90e7a64 100644
--- a/src/util/constants.ts
+++ b/src/util/constants.ts
@@ -22,6 +22,18 @@ export const DETECT_SITEMAP = "";
 
 export const EXTRACT_TEXT_TYPES = ["to-pages", "to-warc", "final-to-warc"];
 
+// URL path extensions (lowercase, no leading dot) for which a direct fetch
+// is attempted; URLs with any other extension are just loaded in the browser
+export const DIRECT_FETCH_EXT = [
+  "pdf",
+  "xml",
+  "jpg",
+  "webm",
+  "docx",
+  "mp4",
+  "zip",
+];
+
 export enum BxFunctionBindings {
   BehaviorLogFunc = "__bx_log",
   AddLinkFunc = "__bx_addLink",

From 1fb6c90627b894b1876a0ef4f9e2d5ea28fd9659 Mon Sep 17 00:00:00 2001
From: Ilya Kreymer
Date: Mon, 5 May 2025 17:43:29 -0700
Subject: [PATCH 3/3] direct fetch dedup: treat 206 and 0 (status unknown) as
 200 to avoid duplicate fetches

---
 src/util/recorder.ts | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/src/util/recorder.ts b/src/util/recorder.ts
index bbaffa43..7bf9b99e 100644
--- a/src/util/recorder.ts
+++ b/src/util/recorder.ts
@@ -1361,7 +1361,11 @@ export class Recorder extends EventEmitter {
       url &&
       method === "GET" &&
       !isRedirectStatus(status) &&
-      !(await this.crawlState.addIfNoDupe(WRITE_DUPE_KEY, url, status))
+      !(await this.crawlState.addIfNoDupe(
+        WRITE_DUPE_KEY,
+        url,
+        status === 206 || !status ? 200 : status,
+      ))
     ) {
       logNetwork("Skipping dupe", { url, status, ...this.logDetails });
       return;
@@ -1515,7 +1519,11 @@ class AsyncFetcher {
       if (
         reqresp.method === "GET" &&
         url &&
-        !(await crawlState.addIfNoDupe(ASYNC_FETCH_DUPE_KEY, url, status))
+        !(await crawlState.addIfNoDupe(
+          ASYNC_FETCH_DUPE_KEY,
+          url,
+          status === 206 || !status ? 200 : status,
+        ))
       ) {
         if (!this.ignoreDupe) {
           this.reqresp.asyncLoading = false;