Skip to content

Commit

Permalink
link extraction optimization: for scopeType page, set depth == extraH…
Browse files Browse the repository at this point in the history
…ops to avoid getting links (#364)

if we know no additional links will be used
  • Loading branch information
ikreymer authored Aug 31, 2023
1 parent cf404ef commit 3c2f5f8
Show file tree
Hide file tree
Showing 3 changed files with 8 additions and 1 deletion.
1 change: 1 addition & 0 deletions crawler.js
Original file line number Diff line number Diff line change
Expand Up @@ -1142,6 +1142,7 @@ self.__bx_behaviors.selectMainBehavior();

// skip extraction if at max depth
if (seed.isAtMaxDepth(depth) || !selectorOptsList) {
logger.debug("Skipping Link Extraction, At Max Depth");
return;
}

Expand Down
2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "browsertrix-crawler",
"version": "0.10.4",
"version": "0.10.5",
"main": "browsertrix-crawler",
"type": "module",
"repository": "https://github.com/webrecorder/browsertrix-crawler",
Expand Down
6 changes: 6 additions & 0 deletions util/seeds.js
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,12 @@ export class ScopedSeed
this.include = [...include, ...this.include];
}

// for page scope, the depth is set to extraHops, as no other
// crawling is done
if (this.scopeType === "page") {
depth = extraHops;
}

this.sitemap = this.resolveSiteMap(sitemap);
this.allowHash = allowHash;
this.maxExtraHops = extraHops;
Expand Down

0 comments on commit 3c2f5f8

Please sign in to comment.