47 changes: 41 additions & 6 deletions src/crawler.ts
@@ -47,6 +47,7 @@ import {
   ExitCodes,
   InterruptReason,
   BxFunctionBindings,
+  SEED_REDIRECT_ADD_DELAY,
 } from "./util/constants.js";
 
 import { AdBlockRules, BlockRuleDecl, BlockRules } from "./util/blockrules.js";
@@ -592,7 +593,14 @@ export class Crawler {
   extraChromeArgs() {
     const args = [];
     if (this.params.lang) {
-      args.push(`--accept-lang=${this.params.lang}`);
+      if (this.params.profile) {
+        logger.warn(
+          "Ignoring --lang option with profile, using language configured in the profile",
+          { lang: this.params.lang },
+        );
+      } else {
+        args.push(`--accept-lang=${this.params.lang}`);
+      }
     }
     return args;
   }
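In effect, a supplied profile's configured language always wins over `--lang`. A standalone sketch of that precedence (hypothetical `LangParams` shape; the real method reads `this.params` and uses the crawler's `logger`):

```ts
// Minimal stand-in for Crawler.extraChromeArgs().
type LangParams = { lang?: string; profile?: string };

function extraChromeArgs(params: LangParams): string[] {
  const args: string[] = [];
  if (params.lang) {
    if (params.profile) {
      // profile language takes precedence; --lang is ignored with a warning
      console.warn("Ignoring --lang option with profile", { lang: params.lang });
    } else {
      args.push(`--accept-lang=${params.lang}`);
    }
  }
  return args;
}

console.log(extraChromeArgs({ lang: "de" }));
// ["--accept-lang=de"]
console.log(extraChromeArgs({ lang: "de", profile: "/crawls/profile.tar.gz" }));
// [] -- warning logged instead
```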
@@ -2123,6 +2131,8 @@ self.__bx_behaviors.selectMainBehavior();
 
     const respUrl = resp.url().split("#")[0];
     const isChromeError = page.url().startsWith("chrome-error://");
+    let thisPageDelay = 0;
+    let originalSeedId = null;
 
     if (
       depth === 0 &&
@@ -2131,6 +2141,7 @@
       respUrl + "/" !== url &&
       !downloadResponse
     ) {
+      originalSeedId = data.seedId;
       data.seedId = await this.crawlState.addExtraSeed(
         this.seeds,
         this.numOriginalSeeds,
@@ -2142,6 +2153,7 @@
         newUrl: respUrl,
         seedId: data.seedId,
       });
+      thisPageDelay = SEED_REDIRECT_ADD_DELAY;
     }
 
     const status = resp.status();
@@ -2228,7 +2240,7 @@ self.__bx_behaviors.selectMainBehavior();
 
     await this.netIdle(page, logDetails);
 
-    await this.awaitPageLoad(page.mainFrame(), logDetails);
+    await this.awaitPageLoad(page.mainFrame(), thisPageDelay, logDetails);
 
     // skip extraction if at max depth
     if (seed.isAtMaxDepth(depth, extraHops)) {
@@ -2242,6 +2254,27 @@
       "links",
     );
 
+    const pageUrl = page.url().split("#")[0];
+
+    if (depth === 0 && respUrl !== urlNoHash) {
+      if (pageUrl === urlNoHash && originalSeedId !== null) {
+        logger.info("Seed page redirected back to original seed", { pageUrl });
+        data.seedId = originalSeedId;
+      } else {
+        data.seedId = await this.crawlState.addExtraSeed(
+          this.seeds,
+          this.numOriginalSeeds,
+          data.seedId,
+          pageUrl,
+        );
+        logger.info("Seed page redirected, adding redirected seed", {
+          origUrl: respUrl,
+          newUrl: pageUrl,
+          seedId: data.seedId,
+        });
+      }
+    }
+
     await this.extractLinks(page, data, this.params.selectLinks, logDetails);
   }
 
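Together with the earlier hunk, this gives a two-phase redirect check for seed pages: the response URL is compared at load time (catching HTTP redirects), and the final `page.url()` is compared again after the post-load delay (catching client-side redirects, including bounces back to the original seed). A condensed, hypothetical sketch of the bookkeeping — `addExtraSeed` stands in for `crawlState.addExtraSeed` and is assumed to dedupe; the extra guards in the real code (`depth === 0`, trailing-slash normalization, download responses) are omitted:

```ts
// Phase 1 runs right after the response arrives; phase 2 runs after the
// page has settled, catching JS/meta-refresh redirects.
const noHash = (u: string) => u.split("#")[0];

function resolveSeedId(
  urlNoHash: string,                  // seed URL as queued
  respUrl: string,                    // response URL at load time
  finalUrl: string,                   // page.url() after load + delay
  seedId: number,
  addExtraSeed: (url: string) => number,
): number {
  let originalSeedId: number | null = null;
  if (respUrl !== urlNoHash) {
    originalSeedId = seedId;
    seedId = addExtraSeed(respUrl);   // phase 1: HTTP redirect
  }
  const pageUrl = noHash(finalUrl);
  if (respUrl !== urlNoHash) {
    if (pageUrl === urlNoHash && originalSeedId !== null) {
      seedId = originalSeedId;        // redirected back to original seed
    } else {
      seedId = addExtraSeed(pageUrl); // phase 2: client-side redirect
    }
  }
  return seedId;
}
```

The `SEED_REDIRECT_ADD_DELAY` padding set in phase 1 exists to give slow client-side redirects time to fire before the phase-2 check runs.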
@@ -2263,7 +2296,7 @@
     }
   }
 
-  async awaitPageLoad(frame: Frame, logDetails: LogDetails) {
+  async awaitPageLoad(frame: Frame, tempDelay: number, logDetails: LogDetails) {
    if (this.params.behaviorOpts) {
      try {
        await timedRun(
@@ -2279,11 +2312,13 @@
       }
     }
 
-    if (this.params.postLoadDelay) {
+    const delay = tempDelay + this.params.postLoadDelay;
+
+    if (delay) {
       logger.info("Awaiting post load delay", {
-        seconds: this.params.postLoadDelay,
+        seconds: delay,
       });
-      await sleep(this.params.postLoadDelay);
+      await sleep(delay);
     }
   }
 
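A minimal runnable sketch of the combined delay, assuming (as the `seconds:` log field suggests) that the util `sleep` takes seconds and that `postLoadDelay` defaults to 0:

```ts
// Hypothetical stand-in for the util sleep (assumed to take seconds here).
const sleep = (secs: number) =>
  new Promise<void>((resolve) => setTimeout(resolve, secs * 1000));

// Per-page delay (e.g. SEED_REDIRECT_ADD_DELAY) is added on top of the
// user-configured --postLoadDelay; 0 + 0 skips the wait entirely.
async function awaitPostLoadDelay(tempDelay: number, postLoadDelay: number) {
  const delay = tempDelay + postLoadDelay;
  if (delay) {
    console.log("Awaiting post load delay", { seconds: delay });
    await sleep(delay);
  }
}

void (async () => {
  await awaitPostLoadDelay(20, 5); // seed redirect: 20 + 5 -> waits 25s
  await awaitPostLoadDelay(0, 0);  // common case: no extra wait
})();
```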
2 changes: 1 addition & 1 deletion src/replaycrawler.ts
@@ -450,7 +450,7 @@ export class ReplayCrawler extends Crawler {
     // optionally reload (todo: reevaluate if this is needed)
     // await page.reload();
 
-    await this.awaitPageLoad(replayFrame, logDetails);
+    await this.awaitPageLoad(replayFrame, 0, logDetails);
 
     data.isHTMLPage = true;
 
1 change: 1 addition & 0 deletions src/util/constants.ts
@@ -38,6 +38,7 @@ export const DEFAULT_MAX_RETRIES = 2;
 export const FETCH_HEADERS_TIMEOUT_SECS = 30;
 export const PAGE_OP_TIMEOUT_SECS = 5;
 export const SITEMAP_INITIAL_FETCH_TIMEOUT_SECS = 30;
+export const SEED_REDIRECT_ADD_DELAY = 20;
 
 export type ExtractSelector = {
   selector: string;
15 changes: 12 additions & 3 deletions src/util/recorder.ts
@@ -507,7 +507,7 @@ export class Recorder extends EventEmitter {
       return;
     }
 
-    this.serializeToWARC(reqresp).catch((e) =>
+    this.serializeToWARC(reqresp, true).catch((e) =>
       logger.warn("Error Serializing to WARC", e, "recorder"),
     );
   }
@@ -1327,7 +1327,7 @@
     return reqresp;
   }
 
-  async serializeToWARC(reqresp: RequestResponseInfo) {
+  async serializeToWARC(reqresp: RequestResponseInfo, fromFinished = false) {
     // always include in pageinfo record if going to serialize to WARC
     // even if serialization does not happen
     this.addPageRecord(reqresp);
@@ -1371,6 +1371,15 @@
     const requestRecord = createRequest(reqresp, responseRecord, this.pageid);
 
     this.writer.writeRecordPair(responseRecord, requestRecord);
+
+    // edge case: from finished response load, and page response and no mime type or status != 200, possibly a captcha/sso page
+    // allow it to be captured again
+    if (
+      fromFinished && url === this.pageUrl &&
+      (!reqresp.getMimeType() || status !== 200)
+    ) {
+      await this.crawlState.removeDupe(WRITE_DUPE_KEY, url, status);
+    }
   }
 
   async directFetchCapture({
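Context for this hunk: the recorder normally refuses to serialize a URL twice by marking a dupe key in crawl state. The new edge case un-marks the page's own response when it looks like a captcha/SSO interstitial (no mime type, or a non-200 status), so a retry of the page can be captured again. A hypothetical in-memory sketch of that bookkeeping (the real `crawlState` is Redis-backed; the key shape here is illustrative):

```ts
// Minimal stand-in for the crawlState dupe tracking used by removeDupe.
const WRITE_DUPE_KEY = "dupe";
const dupes = new Set<string>();

function isDupe(key: string, url: string, status: number): boolean {
  return dupes.has(`${key}:${status}:${url}`);
}
function addDupe(key: string, url: string, status: number): void {
  dupes.add(`${key}:${status}:${url}`);
}
function removeDupe(key: string, url: string, status: number): void {
  dupes.delete(`${key}:${status}:${url}`);
}

// A 403 captcha page is written once, then un-marked so a retry of the
// same page can be serialized again:
addDupe(WRITE_DUPE_KEY, "https://example.com/", 403);
removeDupe(WRITE_DUPE_KEY, "https://example.com/", 403);
console.log(isDupe(WRITE_DUPE_KEY, "https://example.com/", 403)); // false
```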
@@ -1404,7 +1413,7 @@
       mime = ct.split(";")[0];
     }
 
-    const result = !isHTMLMime(mime);
+    const result = !!mime && !isHTMLMime(mime);
 
     if (result) {
       logger.info(
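The added `!!mime` guard changes how a missing Content-Type is treated: an empty mime string used to pass `!isHTMLMime` and be direct-fetched as a non-HTML resource; it now falls through to a full browser page load. A sketch under the assumption that `isHTMLMime` simply matches HTML-ish mime types:

```ts
// Assumed behavior of isHTMLMime; the real helper lives in the crawler utils.
const isHTMLMime = (mime: string): boolean =>
  ["text/html", "application/xhtml+xml"].includes(mime);

// Before: !isHTMLMime("") === true, so no Content-Type meant direct fetch.
// After: mime must be non-empty AND non-HTML to direct-fetch.
const shouldDirectFetch = (mime: string): boolean => !!mime && !isHTMLMime(mime);

console.log(shouldDirectFetch("application/pdf")); // true
console.log(shouldDirectFetch("text/html"));       // false
console.log(shouldDirectFetch(""));                // false (was true before)
```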