From f697027200e75ce21a70620c6c06d5423515cdbe Mon Sep 17 00:00:00 2001 From: benoit74 Date: Mon, 25 Sep 2023 14:41:10 +0200 Subject: [PATCH 1/4] Handle HTTP 429 errors + add failure limit --- README.md | 11 +++++++++++ crawler.js | 42 ++++++++++++++++++++++++++++++++++++++---- util/argParser.js | 18 ++++++++++++++++++ 3 files changed, 67 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 3eefcfb0f..73bc5bc88 100644 --- a/README.md +++ b/README.md @@ -43,6 +43,8 @@ Here's how you can use some of the command-line options to configure the crawl: - To limit the crawl time, set `--timeLimit` (in seconds) +- To limit the crawl to a maximum number of failures, set `--failedLimit` (in number of pages) + - To run more than one browser worker and crawl in parallel, and `--workers N` where N is number of browsers to run in parallel. More browsers will require more CPU and network bandwidth, and does not guarantee faster crawling. - To crawl into a new directory, specify a different name for the `--collection` param, or, if omitted, a new collection directory based on current time will be created. Adding the `--overwrite` flag will delete the collection directory at the start of the crawl, if it exists. @@ -224,6 +226,9 @@ Options: --timeLimit If set, save state and exit after ti me limit, in seconds [number] [default: 0] + --failedLimit If set, save state and exit if numbe + r of failed pages exceeds this value + [number] [default: 0] --healthCheckPort port to run healthcheck on [number] [default: 0] --overwrite overwrite current crawl data: if set @@ -260,6 +265,12 @@ Options: code 1 if any seed fails [boolean] [default: false] --config Path to YAML config file + --pageLoadAttempts How many times the crawler retries t + o load a page if the error is recove + rable + --defaultRetryPause How long the crawler pauses when an + HTTP 429 error is received without ` + Retry-After` header ``` diff --git a/crawler.js b/crawler.js index b915ee80e..06487e03a 100644 --- a/crawler.js +++ b/crawler.js @@ -698,6 +698,14 @@ self.__bx_behaviors.selectMainBehavior(); } } + if (this.params.failedLimit) { + const numFailed = this.crawlState.numFailed(); + if (numFailed >= this.params.failedLimit) { + logger.info(`Failed threshold reached ${numFailed} >= ${this.params.failedLimit}, stopping`); + interrupt = true; + } + } + if (interrupt) { this.uploadAndDeleteLocal = true; this.gracefulFinishOnInterrupt(); @@ -1136,11 +1144,37 @@ self.__bx_behaviors.selectMainBehavior(); logger.info("Awaiting page load", logDetails); try { - const resp = await page.goto(url, gotoOpts); + let nbAttempts = 0; + let resp = undefined; + while (true) { + resp = await page.goto(url, gotoOpts); + nbAttempts += 1; + + const statusCode = resp.status(); + + // If code is below 400, the page loaded successfully (3xx not supposed to happen here) + if (statusCode < 400) { + break; + } + + // HTTP 429, let's make a pause (even if max attempts has been reached, to not overload the website) + if (statusCode === 429) { + const retryAfterStr = "Retry-After" in resp.headers() ? resp.headers()["Retry-After"] : undefined; + if (retryAfterStr) { + const retryAfterInt = Number.isInteger(retryAfterStr) ? parseInt(retryAfterStr): Math.ceil((Date.parse(retryAfterStr) - Date.now()) / 1000); + logger.warn("HTTP 429 with Retry-After, waiting", {retryAfterInt, ...logDetails}); + sleep(retryAfterInt); + } else { + logger.warn("HTTP 429 without Retry-After, waiting", {...logDetails}); + sleep(this.params.defaultRetryPause); + } + if (nbAttempts < this.params.pageLoadAttempts) { + // Retry if we have attempts left + continue; + } + } - // Handle 4xx or 5xx response as a page load error - const statusCode = resp.status(); - if (statusCode.toString().startsWith("4") || statusCode.toString().startsWith("5")) { + // Handle 4xx or 5xx response as a page load error if (failCrawlOnError) { logger.fatal("Seed Page Load Error, failing crawl", {statusCode, ...logDetails}); } else { diff --git a/util/argParser.js b/util/argParser.js index 482b6a25b..1a10feeff 100644 --- a/util/argParser.js +++ b/util/argParser.js @@ -334,6 +334,12 @@ class ArgParser { default: 0, }, + "failedLimit": { + describe: "If set, save state and exit if number of failed pages exceeds this value", + type: "number", + default: 0, + }, + "healthCheckPort": { describe: "port to run healthcheck on", type: "number", @@ -402,6 +408,18 @@ class ArgParser { describe: "injects a custom behavior file or set of behavior files in a directory", type: ["string"] }, + + "pageLoadAttempts": { + describe: "How many time the crawler retries to load a page if the error is recoverable", + type: "number", + default: 2, + }, + + "defaultRetryPause": { + describe: "How long the crawler pauses when an HTTP 429 error is received without `Retry-After` header", + type: "number", + default: 60, + }, }; } From 2950042c14aae560dd8bddbab41e9e9f69143c54 Mon Sep 17 00:00:00 2001 From: benoit74 Date: Mon, 25 Sep 2023 17:42:05 +0200 Subject: [PATCH 2/4] Await promises --- crawler.js | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/crawler.js b/crawler.js index 06487e03a..fcce6fde5 100644 --- a/crawler.js +++ b/crawler.js @@ -1159,14 +1159,14 @@ self.__bx_behaviors.selectMainBehavior(); // HTTP 429, let's make a pause (even if max attempts has been reached, to not overload the website) if (statusCode === 429) { - const retryAfterStr = "Retry-After" in resp.headers() ? resp.headers()["Retry-After"] : undefined; + const retryAfterStr = "retry-after" in resp.headers() ? resp.headers()["retry-after"] : undefined; if (retryAfterStr) { const retryAfterInt = Number.isInteger(retryAfterStr) ? parseInt(retryAfterStr): Math.ceil((Date.parse(retryAfterStr) - Date.now()) / 1000); logger.warn("HTTP 429 with Retry-After, waiting", {retryAfterInt, ...logDetails}); - sleep(retryAfterInt); + await sleep(retryAfterInt); } else { logger.warn("HTTP 429 without Retry-After, waiting", {...logDetails}); - sleep(this.params.defaultRetryPause); + await sleep(this.params.defaultRetryPause); } if (nbAttempts < this.params.pageLoadAttempts) { // Retry if we have attempts left From 5d68cd66f54f4464da1128a5c408936352df4658 Mon Sep 17 00:00:00 2001 From: benoit74 Date: Wed, 27 Sep 2023 08:32:18 +0200 Subject: [PATCH 3/4] Simplify retry-after header retrieval --- crawler.js | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/crawler.js b/crawler.js index fcce6fde5..047d568db 100644 --- a/crawler.js +++ b/crawler.js @@ -1159,8 +1159,8 @@ self.__bx_behaviors.selectMainBehavior(); // HTTP 429, let's make a pause (even if max attempts has been reached, to not overload the website) if (statusCode === 429) { - const retryAfterStr = "retry-after" in resp.headers() ? resp.headers()["retry-after"] : undefined; - if (retryAfterStr) { + const retryAfterStr = resp.headers()["retry-after"]; + if (retryAfterStr != null) { const retryAfterInt = Number.isInteger(retryAfterStr) ? parseInt(retryAfterStr): Math.ceil((Date.parse(retryAfterStr) - Date.now()) / 1000); logger.warn("HTTP 429 with Retry-After, waiting", {retryAfterInt, ...logDetails}); await sleep(retryAfterInt); From c97ee42cb3c975ca58370a93102f1bf98d3b1d1f Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Wed, 18 Oct 2023 18:14:57 -0700 Subject: [PATCH 4/4] removing fail on options, already added --- README.md | 3 --- util/argParser.js | 6 ------ 2 files changed, 9 deletions(-) diff --git a/README.md b/README.md index 72bd2a81f..afdf90b82 100644 --- a/README.md +++ b/README.md @@ -226,9 +226,6 @@ Options: --timeLimit If set, save state and exit after ti me limit, in seconds [number] [default: 0] - --failedLimit If set, save state and exit if numbe - r of failed pages exceeds this value - [number] [default: 0] --healthCheckPort port to run healthcheck on [number] [default: 0] --overwrite overwrite current crawl data: if set diff --git a/util/argParser.js b/util/argParser.js index b476a0a4f..a820bdadd 100644 --- a/util/argParser.js +++ b/util/argParser.js @@ -334,12 +334,6 @@ class ArgParser { default: 0, }, - "failedLimit": { - describe: "If set, save state and exit if number of failed pages exceeds this value", - type: "number", - default: 0, - }, - "healthCheckPort": { describe: "port to run healthcheck on", type: "number",