Skip to content

Commit

Permalink
various fixes regarding state restart: (#370)
Browse files Browse the repository at this point in the history
* additional fixes:
- use distinct exit code for subsequent interrupt (13) and fatal interrupt (17)
- if crawl has been stopped, mark for final exit for post crawl tasks
- stopped takes precedence over interrupted: if both, still exit with 0 (and marked for final exit)
- if no warcs found, crawl stopped, but previous pages found, don't consider failed!
- cleanup: remove unused code, rename to gracefulFinishOnInterrupt, separate from graceful finish via crawl stopped
  • Loading branch information
ikreymer authored Sep 13, 2023
1 parent 5bd4fed commit a3cfc55
Show file tree
Hide file tree
Showing 4 changed files with 30 additions and 31 deletions.
39 changes: 22 additions & 17 deletions crawler.js
Original file line number Diff line number Diff line change
Expand Up @@ -310,7 +310,10 @@ export class Crawler {
try {
await this.crawl();
const finished = await this.crawlState.isFinished();
if (this.interrupted && !finished) {
const stopped = await this.crawlState.isCrawlStopped();
if (stopped) {
status = "crawl gracefully stopped";
} else if (this.interrupted && !finished) {
status = "interrupted";
exitCode = 11;
}
Expand Down Expand Up @@ -671,33 +674,22 @@ self.__bx_behaviors.selectMainBehavior();

if (interrupt) {
this.uploadAndDeleteLocal = true;
this.gracefulFinish();
this.gracefulFinishOnInterrupt();
}
}

gracefulFinish() {
gracefulFinishOnInterrupt() {
this.interrupted = true;
logger.info("Crawler interrupted, gracefully finishing current pages");
if (!this.params.waitOnDone) {
this.finalExit = true;
}
}

prepareForExit(markDone = true) {
if (!markDone) {
this.params.waitOnDone = false;
this.uploadAndDeleteLocal = true;
logger.info("SIGNAL: Preparing for exit of this crawler instance only");
} else {
logger.info("SIGNAL: Preparing for final exit of all crawlers");
if (!this.params.waitOnDone && !this.params.restartsOnError) {
this.finalExit = true;
}
}

async serializeAndExit() {
await this.serializeConfig();
await this.closeLog();
process.exit(0);
process.exit(this.interrupted ? 13 : 0);
}

async isCrawlRunning() {
Expand All @@ -706,6 +698,7 @@ self.__bx_behaviors.selectMainBehavior();
}

if (await this.crawlState.isCrawlStopped()) {
logger.info("Crawler is stopped");
return false;
}

Expand Down Expand Up @@ -761,6 +754,7 @@ self.__bx_behaviors.selectMainBehavior();
return;
} else if (await this.crawlState.isCrawlStopped()) {
logger.info("crawl stopped, running post-crawl tasks");
this.finalExit = true;
await this.postCrawl();
return;
}
Expand Down Expand Up @@ -828,6 +822,11 @@ self.__bx_behaviors.selectMainBehavior();
// extra wait for all resources to land into WARCs
await this.awaitPendingClear();

// if crawl has been stopped, mark as final exit for post-crawl tasks
if (await this.crawlState.isCrawlStopped()) {
this.finalExit = true;
}

await this.postCrawl();
}

Expand Down Expand Up @@ -903,10 +902,16 @@ self.__bx_behaviors.selectMainBehavior();
if (isFinished) {
return;
}
// if stopped, won't get anymore data, so consider failed
// if stopped, won't get anymore data
if (await this.crawlState.isCrawlStopped()) {
// possibly restarted after committing, so assume done here!
if ((await this.crawlState.numDone()) > 0) {
return;
}
// stopped and no done pages, mark crawl as failed
await this.crawlState.setStatus("failed");
}
// fail for now, may restart to try again
logger.fatal("No WARC Files, assuming crawl failed");
}

Expand Down
14 changes: 1 addition & 13 deletions main.js
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ async function handleTerminate(signame) {
try {
if (!crawler.interrupted) {
logger.info("SIGNAL: gracefully finishing current pages...");
crawler.gracefulFinish();
crawler.gracefulFinishOnInterrupt();

} else if (forceTerm || (Date.now() - lastSigInt) > 200) {
logger.info("SIGNAL: stopping crawl now...");
Expand All @@ -49,18 +49,6 @@ process.on("SIGABRT", async () => {
forceTerm = true;
});

process.on("SIGUSR1", () => {
if (crawler) {
crawler.prepareForExit(true);
}
});

process.on("SIGUSR2", () => {
if (crawler) {
crawler.prepareForExit(false);
}
});

crawler = new Crawler();
crawler.run();

Expand Down
6 changes: 6 additions & 0 deletions util/argParser.js
Original file line number Diff line number Diff line change
Expand Up @@ -352,6 +352,12 @@ class ArgParser {
default: false
},

"restartsOnError": {
describe: "if set, assume will be restarted if interrupted, don't run post-crawl processes on interrupt",
type: "boolean",
default: false
},

"netIdleWait": {
describe: "if set, wait for network idle after page load and after behaviors are done (in seconds). if -1 (default), determine based on scope",
type: "number",
Expand Down
2 changes: 1 addition & 1 deletion util/logger.js
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ class Logger
}
}

fatal(message, data={}, context="general", exitCode=1) {
// Log a fatal-level message as structured JSON, then terminate the process.
// Default exit code 17 marks a fatal interrupt, kept distinct from code 13
// used by serializeAndExit for a subsequent graceful interrupt (per this
// commit's notes). Callers may pass an explicit exitCode to override.
fatal(message, data={}, context="general", exitCode=17) {
this.logAsJSON(`${message}. Quitting`, data, context, "fatal");
process.exit(exitCode);
}
Expand Down

0 comments on commit a3cfc55

Please sign in to comment.