From 3c2f5f89344db20d81520574a1d6e794f7f6e122 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Thu, 31 Aug 2023 13:42:14 -0700 Subject: [PATCH] link extraction optimization: for scopeType page, set depth == extraHops to avoid getting links (#364) if we know no additional links will be used --- crawler.js | 1 + package.json | 2 +- util/seeds.js | 6 ++++++ 3 files changed, 8 insertions(+), 1 deletion(-) diff --git a/crawler.js b/crawler.js index 40d001d68..0b9d8af84 100644 --- a/crawler.js +++ b/crawler.js @@ -1142,6 +1142,7 @@ self.__bx_behaviors.selectMainBehavior(); // skip extraction if at max depth if (seed.isAtMaxDepth(depth) || !selectorOptsList) { + logger.debug("Skipping Link Extraction, At Max Depth"); return; } diff --git a/package.json b/package.json index 5c8316ea3..a67d945d5 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "browsertrix-crawler", - "version": "0.10.4", + "version": "0.10.5", "main": "browsertrix-crawler", "type": "module", "repository": "https://github.com/webrecorder/browsertrix-crawler", diff --git a/util/seeds.js b/util/seeds.js index 081b9c3e8..d8ddc814e 100644 --- a/util/seeds.js +++ b/util/seeds.js @@ -23,6 +23,12 @@ export class ScopedSeed this.include = [...include, ...this.include]; } + // for page scope, the depth is set to extraHops, as no other + // crawling is done + if (this.scopeType === "page") { + depth = extraHops; + } + this.sitemap = this.resolveSiteMap(sitemap); this.allowHash = allowHash; this.maxExtraHops = extraHops;