diff --git a/README.md b/README.md
index efee1e6b9..afdf90b82 100644
--- a/README.md
+++ b/README.md
@@ -43,6 +43,8 @@ Here's how you can use some of the command-line options to configure the crawl:
 
 - To limit the crawl time, set `--timeLimit` (in seconds)
 
+- To limit the crawl to a maximum number of failed pages, set `--failedLimit`
+
 - To run more than one browser worker and crawl in parallel, and `--workers N` where N is number of browsers to run in parallel. More browsers will require more CPU and network bandwidth, and does not guarantee faster crawling.
 
 - To crawl into a new directory, specify a different name for the `--collection` param, or, if omitted, a new collection directory based on current time will be created. Adding the `--overwrite` flag will delete the collection directory at the start of the crawl, if it exists.
@@ -260,6 +262,12 @@ Options:
                                         code 1 if any seed fails
                                                       [boolean] [default: false]
   --config                              Path to YAML config file
+  --pageLoadAttempts                    How many times the crawler retries to
+                                        load a page if the error is
+                                        recoverable
+  --defaultRetryPause                   How long (in seconds) the crawler
+                                        pauses when an HTTP 429 error is
+                                        received without a `Retry-After` header
 ```
diff --git a/crawler.js b/crawler.js
index 695c2a262..4c5ea06a2 100644
--- a/crawler.js
+++ b/crawler.js
@@ -1172,11 +1172,43 @@ self.__bx_behaviors.selectMainBehavior();
     logger.info("Awaiting page load", logDetails);
 
     try {
-      const resp = await page.goto(url, gotoOpts);
+      let nbAttempts = 0;
+      let resp = undefined;
+      while (true) {
+        resp = await page.goto(url, gotoOpts);
+        nbAttempts += 1;
 
-      // Handle 4xx or 5xx response as a page load error
-      const statusCode = resp.status();
-      if (statusCode.toString().startsWith("4") || statusCode.toString().startsWith("5")) {
+        const statusCode = resp.status();
+
+        // If the status code is below 400, the page loaded successfully (3xx responses are not expected here)
+        if (statusCode < 400) {
+          break;
+        }
+
+        // On HTTP 429, pause before retrying or giving up (even if max attempts have been reached, to avoid overloading the website)
+        if (statusCode === 429) {
+          const retryAfterStr = resp.headers()["retry-after"];
+          if (retryAfterStr != null) {
+            // Retry-After may be an integer number of seconds or an HTTP-date (RFC 9110);
+            // the header value is a string, so test it with a regex rather than Number.isInteger()
+            let retryAfterInt = /^\d+$/.test(retryAfterStr) ? parseInt(retryAfterStr, 10) : Math.ceil((Date.parse(retryAfterStr) - Date.now()) / 1000);
+            if (!Number.isFinite(retryAfterInt) || retryAfterInt < 0) {
+              // Fall back to the default pause if the header is unparseable or already in the past
+              retryAfterInt = this.params.defaultRetryPause;
+            }
+            logger.warn("HTTP 429 with Retry-After, waiting", {retryAfterInt, ...logDetails});
+            await sleep(retryAfterInt);
+          } else {
+            logger.warn("HTTP 429 without Retry-After, waiting", {...logDetails});
+            await sleep(this.params.defaultRetryPause);
+          }
+          if (nbAttempts < this.params.pageLoadAttempts) {
+            // Retry if we have attempts left
+            continue;
+          }
+        }
+
+        // Handle any other 4xx or 5xx response, or a 429 with no attempts left, as a page load error
         if (failCrawlOnError) {
           logger.fatal("Seed Page Load Error, failing crawl", {statusCode, ...logDetails});
         } else {
diff --git a/util/argParser.js b/util/argParser.js
index 3e88ada60..a820bdadd 100644
--- a/util/argParser.js
+++ b/util/argParser.js
@@ -408,6 +408,18 @@ class ArgParser {
         describe: "injects a custom behavior file or set of behavior files in a directory",
         type: ["string"]
       },
+
+      "pageLoadAttempts": {
+        describe: "How many times the crawler retries to load a page if the error is recoverable",
+        type: "number",
+        default: 2,
+      },
+
+      "defaultRetryPause": {
+        describe: "How long (in seconds) the crawler pauses when an HTTP 429 error is received without a `Retry-After` header",
+        type: "number",
+        default: 60,
+      },
     };
   }
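
For reference, the `Retry-After` handling in the crawler.js hunk above can be sketched in isolation. This is a minimal sketch, not part of the patch: the function name `retryDelaySeconds` and its standalone structure are illustrative, but it mirrors the two header forms allowed by RFC 9110 (an integer number of seconds, or an HTTP-date) and the fallback to the default pause.

```js
// Sketch of the Retry-After computation: returns how many seconds to pause
// before retrying, falling back to a default when the header is missing,
// unparseable, or points to a time in the past.
function retryDelaySeconds(retryAfter, defaultPause = 60) {
  if (retryAfter == null) {
    return defaultPause;
  }
  // Integer form, e.g. "Retry-After: 120" -> wait 120 seconds
  if (/^\d+$/.test(retryAfter)) {
    return parseInt(retryAfter, 10);
  }
  // HTTP-date form, e.g. "Retry-After: Wed, 21 Oct 2015 07:28:00 GMT"
  const secs = Math.ceil((Date.parse(retryAfter) - Date.now()) / 1000);
  return Number.isFinite(secs) && secs > 0 ? secs : defaultPause;
}

// Examples:
console.log(retryDelaySeconds("120"));                                       // 120
console.log(retryDelaySeconds(new Date(Date.now() + 90000).toUTCString()));  // ~90
console.log(retryDelaySeconds("not-a-date"));                                // 60 (fallback)
```

Note that a plain `Number.isInteger()` check would not work here: header values arrive as strings, and `Number.isInteger("120")` is `false`, which is why the patch tests the string with a regex before falling back to `Date.parse`.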