From c1ff507a5950706c7ca036b9602aaf4a8dee2b88 Mon Sep 17 00:00:00 2001
From: Adam Lazarus
Date: Thu, 16 Nov 2023 23:16:46 -0500
Subject: [PATCH 1/4] chore(.gitignore): add output.json to .gitignore to
 prevent accidental commit of generated output files
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .gitignore | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.gitignore b/.gitignore
index 02a7e102..48c8c098 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,3 +6,4 @@ node_modules
 apify_storage
 crawlee_storage
 storage
+output.json
\ No newline at end of file

From 4d0808436ccf45e4974b3f682ef2688e1d14e1dd Mon Sep 17 00:00:00 2001
From: Adam Lazarus
Date: Thu, 16 Nov 2023 23:32:42 -0500
Subject: [PATCH 2/4] style(main.ts): fix trailing whitespace in cookie url
 assignment for better code readability

---
 src/main.ts | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/main.ts b/src/main.ts
index c217d7cf..fb63ca07 100644
--- a/src/main.ts
+++ b/src/main.ts
@@ -24,7 +24,7 @@ if (process.env.NO_CRAWL !== "true") {
       const cookie = {
         name: config.cookie.name,
         value: config.cookie.value,
-        url: request.loadedUrl, 
+        url: request.loadedUrl,
       };
       await page.context().addCookies([cookie]);
     }

From b6ebcce6ff69ac6778c5f4fa93089b24073184ac Mon Sep 17 00:00:00 2001
From: Adam Lazarus
Date: Thu, 16 Nov 2023 23:33:05 -0500
Subject: [PATCH 3/4] feat(config.ts): add optional 'exclude' field to Config
 type to exclude certain URLs from crawling

---
 config.ts | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/config.ts b/config.ts
index 84c15ba5..5b20aafd 100644
--- a/config.ts
+++ b/config.ts
@@ -5,6 +5,8 @@ type Config = {
   url: string;
   /** Pattern to match against for links on a page to subsequently crawl */
   match: string;
+  /** Optional regex; URLs matching it will NOT be crawled */
+  exclude?: string;
   /** Selector to grab the inner text from */
   selector: string;
   /** Don't crawl more than this many pages */
@@ -23,6 +25,7 @@ type Config = {
 export const config: Config = {
   url: "https://www.builder.io/c/docs/developers",
   match: "https://www.builder.io/c/docs/**",
+  exclude: "integrate",
   selector: `.docs-builder-container`,
   maxPagesToCrawl: 50,
   outputFileName: "output.json",

From 96ca401b015aaf87d5ed52473b5c06b5929a015f Mon Sep 17 00:00:00 2001
From: Adam Lazarus
Date: Thu, 16 Nov 2023 23:33:11 -0500
Subject: [PATCH 4/4] feat(main.ts): implement URL exclusion logic in crawling
 process to improve crawling efficiency and avoid unnecessary URLs

---
 src/main.ts | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/src/main.ts b/src/main.ts
index fb63ca07..9d3a8529 100644
--- a/src/main.ts
+++ b/src/main.ts
@@ -4,6 +4,7 @@ import { readFile, writeFile } from "fs/promises";
 import { glob } from "glob";
 import { config } from "../config.js";
 import { Page } from "playwright";
+import { URL } from "url";
 
 export function getPageHtml(page: Page) {
   return page.evaluate((selector) => {
@@ -47,10 +48,20 @@ if (process.env.NO_CRAWL !== "true") {
       // Extract links from the current page
       // and add them to the crawling queue.
+      const links = await page.$$eval('a', (as) => as.map(a => a.href));
+      const filteredLinks = links.filter(link => {
+        // Check if the link matches the exclude pattern
+        const excludePattern = new RegExp(config.exclude || "");
+        return !excludePattern.test(link);
+      });
+
+      // Enqueue filtered links
       await enqueueLinks({
+        urls: filteredLinks,
+        globs: [config.match],
+      });
     },
+    // Comment this option to scrape the full website.
     maxRequestsPerCrawl: config.maxPagesToCrawl,
     // Uncomment this option to see the browser window.
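
Note on the exclusion filter in the hunk above: when config.exclude is not set, new RegExp("") yields a pattern that matches every string, so excludePattern.test(link) is always true and every link is filtered out before enqueueing. A minimal defensive variant of the same filter is sketched below purely for illustration; the guard is not part of the patch.

      // Illustrative sketch, not part of the patch:
      // build the exclusion regex only when an exclude pattern is configured,
      // otherwise keep the full link list unchanged.
      const excludePattern = config.exclude ? new RegExp(config.exclude) : undefined;
      const filteredLinks = excludePattern
        ? links.filter((link) => !excludePattern.test(link))
        : links;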