diff --git a/crawler.js b/crawler.js index 40d001d68..0b9d8af84 100644 --- a/crawler.js +++ b/crawler.js @@ -1142,6 +1142,7 @@ self.__bx_behaviors.selectMainBehavior(); // skip extraction if at max depth if (seed.isAtMaxDepth(depth) || !selectorOptsList) { + logger.debug("Skipping Link Extraction, At Max Depth"); return; } diff --git a/package.json b/package.json index 5c8316ea3..a67d945d5 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "browsertrix-crawler", - "version": "0.10.4", + "version": "0.10.5", "main": "browsertrix-crawler", "type": "module", "repository": "https://github.com/webrecorder/browsertrix-crawler", diff --git a/util/seeds.js b/util/seeds.js index 081b9c3e8..d8ddc814e 100644 --- a/util/seeds.js +++ b/util/seeds.js @@ -23,6 +23,12 @@ export class ScopedSeed this.include = [...include, ...this.include]; } + // for page scope, any existing depth value is replaced by extraHops, + // as no other crawling is done + if (this.scopeType === "page") { + depth = extraHops; + } + this.sitemap = this.resolveSiteMap(sitemap); this.allowHash = allowHash; this.maxExtraHops = extraHops;