link extraction optimization: for scopeType page, set depth == extraH…

…ops to avoid getting links (#364) if we know no additional links will be used
webrecorder · Aug 31, 2023 · 3c2f5f8 · 3c2f5f8
1 parent cf404ef
commit 3c2f5f8
Show file tree

Hide file tree

Showing 3 changed files with 8 additions and 1 deletion.
diff --git a/crawler.js b/crawler.js
@@ -1142,6 +1142,7 @@ self.__bx_behaviors.selectMainBehavior();
 
     // skip extraction if at max depth
     if (seed.isAtMaxDepth(depth) || !selectorOptsList) {
+      logger.debug("Skipping Link Extraction, At Max Depth");
       return;
     }
 

diff --git a/package.json b/package.json
@@ -1,6 +1,6 @@
 {
   "name": "browsertrix-crawler",
-  "version": "0.10.4",
+  "version": "0.10.5",
   "main": "browsertrix-crawler",
   "type": "module",
   "repository": "https://github.com/webrecorder/browsertrix-crawler",

diff --git a/util/seeds.js b/util/seeds.js
@@ -23,6 +23,12 @@ export class ScopedSeed
       this.include = [...include, ...this.include];
     }
 
+    // for page scope, the depth is set to extraHops, as no other
+    // crawling is done
+    if (this.scopeType === "page") {
+      depth = extraHops;
+    }
+
     this.sitemap = this.resolveSiteMap(sitemap);
     this.allowHash = allowHash;
     this.maxExtraHops = extraHops;