From c1ff507a5950706c7ca036b9602aaf4a8dee2b88 Mon Sep 17 00:00:00 2001
From: Adam Lazarus
Date: Thu, 16 Nov 2023 23:16:46 -0500
Subject: [PATCH 1/4] chore(.gitignore): add output.json to .gitignore to
 prevent accidental commit of generated output files
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .gitignore | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.gitignore b/.gitignore
index 02a7e102..48c8c098 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,3 +6,4 @@ node_modules
 apify_storage
 crawlee_storage
 storage
+output.json
\ No newline at end of file

From 4d0808436ccf45e4974b3f682ef2688e1d14e1dd Mon Sep 17 00:00:00 2001
From: Adam Lazarus
Date: Thu, 16 Nov 2023 23:32:42 -0500
Subject: [PATCH 2/4] style(main.ts): fix trailing whitespace in cookie url
 assignment for better code readability

---
 src/main.ts | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/main.ts b/src/main.ts
index c217d7cf..fb63ca07 100644
--- a/src/main.ts
+++ b/src/main.ts
@@ -24,7 +24,7 @@ if (process.env.NO_CRAWL !== "true") {
       const cookie = {
         name: config.cookie.name,
         value: config.cookie.value,
-        url: request.loadedUrl, 
+        url: request.loadedUrl,
       };
       await page.context().addCookies([cookie]);
     }

From b6ebcce6ff69ac6778c5f4fa93089b24073184ac Mon Sep 17 00:00:00 2001
From: Adam Lazarus
Date: Thu, 16 Nov 2023 23:33:05 -0500
Subject: [PATCH 3/4] feat(config.ts): add optional 'exclude' field to Config
 type to exclude certain URLs from crawling

---
 config.ts | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/config.ts b/config.ts
index 84c15ba5..5b20aafd 100644
--- a/config.ts
+++ b/config.ts
@@ -5,6 +5,8 @@ type Config = {
   url: string;
   /** Pattern to match against for links on a page to subsequently crawl */
   match: string;
+  /** Optional regex; URLs matching it will NOT be crawled */
+  exclude?: string;
   /** Selector to grab the inner text from */
   selector: string;
   /** Don't crawl more than this many pages */
@@ -23,6 +25,7 @@ type Config = {
 export const config: Config = {
   url: "https://www.builder.io/c/docs/developers",
   match: "https://www.builder.io/c/docs/**",
+  exclude: "integrate",
   selector: `.docs-builder-container`,
   maxPagesToCrawl: 50,
   outputFileName: "output.json",

From 96ca401b015aaf87d5ed52473b5c06b5929a015f Mon Sep 17 00:00:00 2001
From: Adam Lazarus
Date: Thu, 16 Nov 2023 23:33:11 -0500
Subject: [PATCH 4/4] feat(main.ts): implement URL exclusion logic in crawling
 process to improve crawling efficiency and avoid unnecessary URLs

---
 src/main.ts | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/src/main.ts b/src/main.ts
index fb63ca07..9d3a8529 100644
--- a/src/main.ts
+++ b/src/main.ts
@@ -4,6 +4,7 @@ import { readFile, writeFile } from "fs/promises";
 import { glob } from "glob";
 import { config } from "../config.js";
 import { Page } from "playwright";
+import { URL } from "url";
 
 export function getPageHtml(page: Page) {
   return page.evaluate((selector) => {
@@ -47,10 +48,20 @@ if (process.env.NO_CRAWL !== "true") {
       // Extract links from the current page
       // and add them to the crawling queue.
+      const links = await page.$$eval('a', (as) => as.map(a => a.href));
+      const filteredLinks = links.filter(link => {
+        // Check if the link matches the exclude pattern
+        const excludePattern = new RegExp(config.exclude || "");
+        return !excludePattern.test(link);
+      });
+
+      // Enqueue filtered links
       await enqueueLinks({
+        urls: filteredLinks,
+        globs: [config.match],
+      });
     },
+    // Comment this option to scrape the full website.
     maxRequestsPerCrawl: config.maxPagesToCrawl,
     // Uncomment this option to see the browser window.
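
Note on the exclusion filter in the hunk above: when config.exclude is not set, new RegExp("") yields a pattern that matches every string, so excludePattern.test(link) is always true and every link is filtered out before enqueueing. A minimal defensive variant of the same filter is sketched below purely for illustration; the guard is not part of the patch.

      // Illustrative sketch, not part of the patch:
      // build the exclusion regex only when an exclude pattern is configured,
      // otherwise keep the full link list unchanged.
      const excludePattern = config.exclude ? new RegExp(config.exclude) : undefined;
      const filteredLinks = excludePattern
        ? links.filter((link) => !excludePattern.test(link))
        : links;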