diff --git a/crawler.js b/crawler.js
index d19546c14..40d001d68 100644
--- a/crawler.js
+++ b/crawler.js
@@ -679,6 +679,18 @@ self.__bx_behaviors.selectMainBehavior();
     process.exit(0);
   }
 
+  async isCrawlRunning() {
+    if (this.interrupted) {
+      return false;
+    }
+
+    if (await this.crawlState.isCrawlStopped()) {
+      return false;
+    }
+
+    return true;
+  }
+
   async crawl() {
     if (this.params.healthCheckPort) {
       this.healthChecker = new HealthChecker(this.params.healthCheckPort, this.params.workers);
diff --git a/util/timing.js b/util/timing.js
index ddda4ea73..926d0402a 100644
--- a/util/timing.js
+++ b/util/timing.js
@@ -1,4 +1,4 @@
-import { logger, errJSON } from "./logger.js";
+import { logger } from "./logger.js";
 export function sleep(seconds) {
   return new Promise(resolve => setTimeout(resolve, seconds * 1000));
 }
@@ -19,7 +19,8 @@ export function timedRun(promise, seconds, message="Promise timed out", logDetai
       if (err == "timeout reached") {
         logger.error(message, {"seconds": seconds, ...logDetails}, context);
       } else {
-        logger.error("Unknown exception", {...errJSON(err), ...logDetails}, context);
+        //logger.error("Unknown exception", {...errJSON(err), ...logDetails}, context);
+        throw err;
       }
     });
 }
diff --git a/util/worker.js b/util/worker.js
index ae3c4484c..2f61b754f 100644
--- a/util/worker.js
+++ b/util/worker.js
@@ -94,10 +94,12 @@ export class PageWorker
     this.reuseCount = 1;
     const workerid = this.id;
 
-    while (true) {
+    let retry = 0;
+
+    while (await this.crawler.isCrawlRunning()) {
       try {
         logger.debug("Getting page in new window", {workerid}, "worker");
-        const { page, cdp } = await timedRun(
+        const result = await timedRun(
           this.crawler.browser.newWindowPageWithCDP(),
           NEW_WINDOW_TIMEOUT,
           "New Window Timed Out",
@@ -105,6 +107,12 @@
           "worker"
         );
 
+        if (!result) {
+          throw new Error("timed out");
+        }
+
+        const { page, cdp } = result;
+
         this.page = page;
         this.cdp = cdp;
         this.opts = {page: this.page, cdp: this.cdp, workerid};
@@ -128,8 +136,14 @@
       } catch (err) {
         logger.warn("Error getting new page", {"workerid": this.id, ...errJSON(err)}, "worker");
 
+        retry++;
+
+        if (retry >= MAX_REUSE) {
+          logger.fatal("Unable to get new page, browser likely crashed");
+        }
+
         await sleep(0.5);
-        logger.warn("Retry getting new page");
+        logger.warn("Retrying getting new page");
 
         if (this.crawler.healthChecker) {
           this.crawler.healthChecker.incError();
@@ -180,7 +194,7 @@ export class PageWorker
   async runLoop() {
     const crawlState = this.crawler.crawlState;
 
-    while (!this.crawler.interrupted && !await crawlState.isCrawlStopped()) {
+    while (await this.crawler.isCrawlRunning()) {
       const data = await crawlState.nextFromQueue();
 
       // see if any work data in the queue