From 7e03dc076f576f4d950c4dcc2b72cc6727373962 Mon Sep 17 00:00:00 2001
From: Tessa Walsh
Date: Fri, 29 Sep 2023 13:02:52 -0400
Subject: [PATCH] Set new logic for invalid seeds (#395)

Allow for some seeds to be invalid unless failOnFailedSeed is set

Fail crawl if no valid seeds are provided

Co-authored-by: Ilya Kreymer
---
 tests/seeds.test.js | 38 ++++++++++++++++++++++++++++++++++++++
 util/argParser.js   | 13 ++++++++++++-
 util/seeds.js       |  2 +-
 3 files changed, 51 insertions(+), 2 deletions(-)
 create mode 100644 tests/seeds.test.js

diff --git a/tests/seeds.test.js b/tests/seeds.test.js
new file mode 100644
index 000000000..77f39aa93
--- /dev/null
+++ b/tests/seeds.test.js
@@ -0,0 +1,38 @@
+import util from "util";
+import {exec as execCallback } from "child_process";
+
+const exec = util.promisify(execCallback);
+
+
+test("ensure one invalid seed doesn't end crawl if failOnFailedSeed is not set", async () => {
+  let passed = true;
+  try {
+    await exec("docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://www.iana.org/ --url example.com/invalid-seed --generateWACZ --limit 1 --collection invalidseed");
+  } catch (error) {
+    console.log(error);
+    passed = false;
+  }
+  expect(passed).toBe(true);
+});
+
+test("ensure one invalid seed fails crawl if failOnFailedSeed is set", async () => {
+  let passed = true;
+  try {
+    await exec("docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://www.iana.org/ --url example.com/invalid-seed --generateWACZ --limit 1 --failOnFailedSeed --collection failseed");
+  }
+  catch (error) {
+    passed = false;
+  }
+  expect(passed).toBe(false);
+});
+
+test("ensure crawl fails if no valid seeds are passed", async () => {
+  let passed = true;
+  try {
+    await exec("docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url iana.org/ --url example.com/invalid-seed --generateWACZ --limit 1 --collection allinvalidseeds");
+  }
+  catch (error) {
+    passed = false;
+  }
+  expect(passed).toBe(false);
+});
diff --git a/util/argParser.js b/util/argParser.js
index 482b6a25b..8d6ef6700 100644
--- a/util/argParser.js
+++ b/util/argParser.js
@@ -537,7 +537,18 @@ class ArgParser {
       if (typeof(seed) === "string") {
         seed = {url: seed};
       }
-      argv.scopedSeeds.push(new ScopedSeed({...scopeOpts, ...seed}));
+
+      try {
+        argv.scopedSeeds.push(new ScopedSeed({...scopeOpts, ...seed}));
+      } catch (e) {
+        if (argv.failOnFailedSeed) {
+          logger.fatal(`Invalid Seed "${seed.url}" specified, aborting crawl.`);
+        }
+      }
+    }
+
+    if (!argv.scopedSeeds.length) {
+      logger.fatal("No valid seeds specified, aborting crawl.");
     }
 
     // Resolve statsFilename
diff --git a/util/seeds.js b/util/seeds.js
index d8ddc814e..4ad1540a0 100644
--- a/util/seeds.js
+++ b/util/seeds.js
@@ -7,7 +7,7 @@ export class ScopedSeed
   constructor({url, scopeType, include, exclude = [], allowHash = false, depth = -1, sitemap = false, extraHops = 0} = {}) {
     const parsedUrl = this.parseUrl(url);
     if (!parsedUrl) {
-      logger.fatal(`Invalid Seed "${url}" specified, aborting crawl.`);
+      throw new Error("Invalid URL");
     }
     this.url = parsedUrl.href;
     this.include = this.parseRx(include);