Skip to content

Commit

Permalink
Set new logic for invalid seeds (#395)
Browse files Browse the repository at this point in the history
Allow for some seeds to be invalid unless failOnFailedSeed is set

Fail crawl if no valid seeds are provided

Co-authored-by: Ilya Kreymer <[email protected]>
  • Loading branch information
tw4l and ikreymer authored Sep 29, 2023
1 parent 18dce95 commit 7e03dc0
Show file tree
Hide file tree
Showing 3 changed files with 51 additions and 2 deletions.
38 changes: 38 additions & 0 deletions tests/seeds.test.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
import util from "util";
import {exec as execCallback } from "child_process";

const exec = util.promisify(execCallback);


// Runs the crawler in Docker with two seeds: https://www.iana.org/ and
// example.com/invalid-seed (no scheme, the seed this test treats as invalid).
// --failOnFailedSeed is NOT passed, so the command is expected to succeed.
test("ensure one invalid seed doesn't end crawl if failOnFailedSeed is not set", async () => {
  const crawlCommand = "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://www.iana.org/ --url example.com/invalid-seed --generateWACZ --limit 1 --collection invalidseed";

  // The promisified exec rejects when the command fails; record that here
  // instead of letting the rejection escape the test body.
  let crawlFailed = false;

  try {
    await exec(crawlCommand);
  } catch (crawlError) {
    // Log the failure so the cause is visible in the test output.
    console.log(crawlError);
    crawlFailed = true;
  }

  expect(!crawlFailed).toBe(true);
});

// Runs the crawler in Docker with the same two seeds (https://www.iana.org/
// and the scheme-less example.com/invalid-seed), this time with
// --failOnFailedSeed. The command is expected to fail.
test("ensure one invalid seed fails crawl if failOnFailedSeed is set", async () => {
  const crawlCommand = "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://www.iana.org/ --url example.com/invalid-seed --generateWACZ --limit 1 --failOnFailedSeed --collection failseed";

  // The promisified exec rejects when the command fails; a rejection is the
  // outcome this test is looking for.
  let crawlFailed = false;

  try {
    await exec(crawlCommand);
  } catch (crawlError) {
    crawlFailed = true;
  }

  expect(!crawlFailed).toBe(false);
});

// Runs the crawler in Docker with two scheme-less seeds (iana.org/ and
// example.com/invalid-seed), so no seed is expected to be valid.
// The command is expected to fail even without --failOnFailedSeed.
test("ensure crawl fails if no valid seeds are passed", async () => {
  const crawlCommand = "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url iana.org/ --url example.com/invalid-seed --generateWACZ --limit 1 --collection allinvalidseeds";

  // The promisified exec rejects when the command fails; a rejection is the
  // outcome this test is looking for.
  let crawlFailed = false;

  try {
    await exec(crawlCommand);
  } catch (crawlError) {
    crawlFailed = true;
  }

  expect(!crawlFailed).toBe(false);
});
13 changes: 12 additions & 1 deletion util/argParser.js
Original file line number Diff line number Diff line change
Expand Up @@ -537,7 +537,18 @@ class ArgParser {
if (typeof(seed) === "string") {
seed = {url: seed};
}
argv.scopedSeeds.push(new ScopedSeed({...scopeOpts, ...seed}));

try {
argv.scopedSeeds.push(new ScopedSeed({...scopeOpts, ...seed}));
} catch (e) {
if (argv.failOnFailedSeed) {
logger.fatal(`Invalid Seed "${seed.url}" specified, aborting crawl.`);
}
}
}

if (!argv.scopedSeeds.length) {
logger.fatal("No valid seeds specified, aborting crawl.");
}

// Resolve statsFilename
Expand Down
2 changes: 1 addition & 1 deletion util/seeds.js
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ export class ScopedSeed
constructor({url, scopeType, include, exclude = [], allowHash = false, depth = -1, sitemap = false, extraHops = 0} = {}) {
const parsedUrl = this.parseUrl(url);
if (!parsedUrl) {
logger.fatal(`Invalid Seed "${url}" specified, aborting crawl.`);
throw new Error("Invalid URL");
}
this.url = parsedUrl.href;
this.include = this.parseRx(include);
Expand Down

0 comments on commit 7e03dc0

Please sign in to comment.