improve crawl stopped check with unified isCrawlRunning() check which checks both interrupted + redis-based state (#356)

- handle browser crash: if getting a new page fails after 5 tries, assume the browser crashed and exit
- check that timedRun() returns a non-null value before destructuring it
- update timedRun() to rethrow any non-timeout exception, instead of just logging "Unknown exception", since it should be handled downstream (a caller-side sketch of the new contract follows the changed-files summary below)
ikreymer authored Aug 22, 2023
1 parent 212bff0 commit cf404ef
Showing 3 changed files with 33 additions and 6 deletions.
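
Taken together, the three fixes change timedRun()'s contract: a timeout is logged inside the helper and surfaces as an undefined result, while any other rejection now propagates to the caller. A minimal caller-side sketch of that contract; ok(), boom(), and the handling comments are illustrative stand-ins, not from this commit:

import { timedRun } from "./util/timing.js";

// Illustrative stand-ins, not part of the commit
const ok = () => new Promise((resolve) => setTimeout(() => resolve({page: 1, cdp: 2}), 50));
const boom = () => Promise.reject(new Error("browser connection lost"));

async function demo() {
  // Timeout path: timedRun() has already logged, and resolves to undefined,
  // so null-check before destructuring
  const result = await timedRun(ok(), 30, "Op Timed Out");
  if (!result) {
    return;
  }
  const { page, cdp } = result;
  console.log(page, cdp);

  // Non-timeout path: the error is rethrown instead of being swallowed
  // as "Unknown exception", so it can be handled downstream
  try {
    await timedRun(boom(), 30);
  } catch (err) {
    // e.g. count toward the browser-crash retry limit in worker.js
  }
}

demo();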
12 changes: 12 additions & 0 deletions crawler.js

@@ -679,6 +679,18 @@ self.__bx_behaviors.selectMainBehavior();
     process.exit(0);
   }
 
+  async isCrawlRunning() {
+    if (this.interrupted) {
+      return false;
+    }
+
+    if (await this.crawlState.isCrawlStopped()) {
+      return false;
+    }
+
+    return true;
+  }
+
   async crawl() {
     if (this.params.healthCheckPort) {
       this.healthChecker = new HealthChecker(this.params.healthCheckPort, this.params.workers);
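
isCrawlRunning() folds the in-process interrupted flag and the Redis-backed stop flag into a single awaitable predicate, so every loop shares one condition instead of duplicating both checks. A simplified standalone sketch of the pattern; CrawlStateStub stands in for the real Redis-backed crawl state:

// CrawlStateStub is illustrative; the real crawlState checks shared Redis state
class CrawlStateStub {
  constructor() {
    this.stopped = false;
  }
  async isCrawlStopped() {
    return this.stopped;
  }
}

class CrawlerSketch {
  constructor() {
    this.interrupted = false;            // set locally, e.g. by signal handlers
    this.crawlState = new CrawlStateStub();
  }

  async isCrawlRunning() {
    if (this.interrupted) {              // cheap local check first
      return false;
    }
    if (await this.crawlState.isCrawlStopped()) {  // then the shared state
      return false;
    }
    return true;
  }
}

// Usage: loops poll the single predicate
// while (await crawler.isCrawlRunning()) { ... }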
5 changes: 3 additions & 2 deletions util/timing.js

@@ -1,4 +1,4 @@
-import { logger, errJSON } from "./logger.js";
+import { logger } from "./logger.js";
 
 export function sleep(seconds) {
   return new Promise(resolve => setTimeout(resolve, seconds * 1000));
@@ -19,7 +19,8 @@ export function timedRun(promise, seconds, message="Promise timed out", logDetai
     if (err == "timeout reached") {
       logger.error(message, {"seconds": seconds, ...logDetails}, context);
     } else {
-      logger.error("Unknown exception", {...errJSON(err), ...logDetails}, context);
+      //logger.error("Unknown exception", {...errJSON(err), ...logDetails}, context);
+      throw err;
     }
   });
 }
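
Only the catch branch of timedRun() appears in the hunk above. For context, a plausible reconstruction of the whole helper after this change; the Promise.race shape is inferred from the "timeout reached" string and is not shown in the diff:

import { logger } from "./logger.js";

// Reconstruction sketch; the race/timer shape is inferred, not from the diff
export function timedRun(promise, seconds, message = "Promise timed out", logDetails = {}, context = "general") {
  const rejectOnTimeout = (ms) =>
    new Promise((resolve, reject) => setTimeout(() => reject("timeout reached"), ms));

  return Promise.race([promise, rejectOnTimeout(seconds * 1000)])
    .catch((err) => {
      if (err == "timeout reached") {
        // timeout: log here and resolve to undefined; callers must null-check
        logger.error(message, {"seconds": seconds, ...logDetails}, context);
      } else {
        // any other failure is rethrown to be handled downstream
        throw err;
      }
    });
}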
22 changes: 18 additions & 4 deletions util/worker.js

@@ -94,17 +94,25 @@ export class PageWorker
     this.reuseCount = 1;
     const workerid = this.id;
 
-    while (true) {
+    let retry = 0;
+
+    while (await this.crawler.isCrawlRunning()) {
       try {
         logger.debug("Getting page in new window", {workerid}, "worker");
-        const { page, cdp } = await timedRun(
+        const result = await timedRun(
          this.crawler.browser.newWindowPageWithCDP(),
          NEW_WINDOW_TIMEOUT,
          "New Window Timed Out",
          {workerid},
          "worker"
        );
 
+        if (!result) {
+          throw new Error("timed out");
+        }
+
+        const { page, cdp } = result;
+
         this.page = page;
         this.cdp = cdp;
         this.opts = {page: this.page, cdp: this.cdp, workerid};
@@ -128,8 +136,14 @@ export class PageWorker
 
       } catch (err) {
         logger.warn("Error getting new page", {"workerid": this.id, ...errJSON(err)}, "worker");
+        retry++;
+
+        if (retry >= MAX_REUSE) {
+          logger.fatal("Unable to get new page, browser likely crashed");
+        }
+
         await sleep(0.5);
-        logger.warn("Retry getting new page");
+        logger.warn("Retrying getting new page");
 
         if (this.crawler.healthChecker) {
           this.crawler.healthChecker.incError();
@@ -180,7 +194,7 @@ export class PageWorker
   async runLoop() {
     const crawlState = this.crawler.crawlState;
 
-    while (!this.crawler.interrupted && !await crawlState.isCrawlStopped()) {
+    while (await this.crawler.isCrawlRunning()) {
       const data = await crawlState.nextFromQueue();
 
       // see if any work data in the queue
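
Per the commit message, the retry cap that triggers the crash exit is 5, reusing the existing MAX_REUSE constant, and logger.fatal() ends the process. A self-contained stub of that failure path; the logger, the exit code, and newWindowPageWithCDP() here are stand-ins, not the real implementations:

const MAX_REUSE = 5;

// Stand-in logger; the real logger.fatal() also exits the process
const logger = {
  warn: (msg) => console.warn(msg),
  fatal: (msg) => {
    console.error(msg);
    process.exit(1);  // stub exit code
  },
};

// Simulate a crashed browser: every attempt to open a page rejects
async function newWindowPageWithCDP() {
  throw new Error("browser connection lost");
}

async function initPage() {
  let retry = 0;
  while (true) {
    try {
      return await newWindowPageWithCDP();
    } catch (err) {
      logger.warn(`Error getting new page: ${err.message}`);
      retry++;
      if (retry >= MAX_REUSE) {
        logger.fatal("Unable to get new page, browser likely crashed");
      }
      await new Promise((resolve) => setTimeout(resolve, 500));
      logger.warn("Retrying getting new page");
    }
  }
}

// After five failed attempts this exits via logger.fatal()
initPage();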
