Skip to content

Commit

Permalink
logging fixes:
Browse files Browse the repository at this point in the history
- avoid duplicate logging for same error, if logging more specific message and rethrowing exception,
set e.detail to "logged" and worker exception handler will not log same error again
- add option to log timeouts as warnings instead of errors
- remove unneed async method in browser, get headers directly
- fix logging in screenshots to include page
  • Loading branch information
ikreymer committed Sep 13, 2023
1 parent 1c486ea commit 82ff0f4
Show file tree
Hide file tree
Showing 5 changed files with 28 additions and 19 deletions.
24 changes: 17 additions & 7 deletions crawler.js
Original file line number Diff line number Diff line change
Expand Up @@ -992,7 +992,7 @@ self.__bx_behaviors.selectMainBehavior();
logger.debug(stdout.join("\n"));
}
if (stderr.length && this.params.logging.includes("debug")) {
logger.error(stderr.join("\n"));
logger.debug(stderr.join("\n"));
}
resolve(code);
});
Expand Down Expand Up @@ -1050,7 +1050,9 @@ self.__bx_behaviors.selectMainBehavior();
this.isHTML(url),
FETCH_TIMEOUT_SECS,
"HEAD request to determine if URL is HTML page timed out",
logDetails
logDetails,
"fetch",
true
);

if (!isHTMLPage) {
Expand All @@ -1059,7 +1061,9 @@ self.__bx_behaviors.selectMainBehavior();
this.directFetchCapture(url),
FETCH_TIMEOUT_SECS,
"Direct fetch capture attempt timed out",
logDetails
logDetails,
"fetch",
true
);
if (captureResult) {
logger.info("Direct fetch successful", {url, ...logDetails}, "fetch");
Expand Down Expand Up @@ -1097,12 +1101,12 @@ self.__bx_behaviors.selectMainBehavior();
if (failCrawlOnError) {
logger.fatal("Seed Page Load Error, failing crawl", {statusCode, ...logDetails});
} else {
logger.error("Page Load Error, skipping page", {statusCode, ...logDetails});
throw new Error(`Page ${url} returned status code ${statusCode}`);
logger.error("Non-200 Status Code, skipping page", {statusCode, ...logDetails});
throw new Error("logged");
}
}

const contentType = await this.browser.responseHeader(resp, "content-type");
const contentType = resp.headers()["content-type"];

isHTMLPage = this.isHTMLContentType(contentType);

Expand All @@ -1115,6 +1119,7 @@ self.__bx_behaviors.selectMainBehavior();
logger.fatal("Seed Page Load Timeout, failing crawl", {msg, ...logDetails});
} else {
logger.error("Page Load Timeout, skipping page", {msg, ...logDetails});
e.message = "logged";
throw e;
}
} else {
Expand All @@ -1126,6 +1131,7 @@ self.__bx_behaviors.selectMainBehavior();
logger.fatal("Seed Page Load Timeout, failing crawl", {msg, ...logDetails});
} else {
logger.error("Page Load Error, skipping page", {msg, ...logDetails});
e.message = "logged";
throw e;
}
}
Expand Down Expand Up @@ -1267,7 +1273,11 @@ self.__bx_behaviors.selectMainBehavior();

while (await timedRun(
page.$("div.cf-browser-verification.cf-im-under-attack"),
PAGE_OP_TIMEOUT_SECS
PAGE_OP_TIMEOUT_SECS,
"Cloudflare check timed out",
logDetails,
"general",
true
)) {
logger.debug("Cloudflare Check Detected, waiting for reload...", logDetails);
await sleep(5.5);
Expand Down
4 changes: 0 additions & 4 deletions util/browser.js
Original file line number Diff line number Diff line change
Expand Up @@ -280,10 +280,6 @@ export class Browser extends BaseBrowser
return {page, cdp};
}

async responseHeader(resp, header) {
return await resp.headers()[header];
}

async evaluateWithCLI(_, frame, cdp, funcString, logData, contextName) {
const context = await frame.executionContext();
cdp = context._client;
Expand Down
8 changes: 4 additions & 4 deletions util/screenshots.js
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ import path from "path";
import * as warcio from "warcio";
import sharp from "sharp";

import { logger } from "./logger.js";
import { logger, errJSON } from "./logger.js";

// ============================================================================

Expand Down Expand Up @@ -47,7 +47,7 @@ export class Screenshots {
await this.writeBufferToWARC(screenshotBuffer, screenshotType, options.type);
logger.info(`Screenshot (type: ${screenshotType}) for ${this.url} written to ${this.warcName}`);
} catch (e) {
logger.error(`Taking screenshot (type: ${screenshotType}) failed for ${this.url}`, e.message);
logger.error("Taking screenshot failed", {"page": this.url, type: screenshotType, ...errJSON(e)}, "screenshots");
}
}

Expand All @@ -56,8 +56,8 @@ export class Screenshots {
}

async takeThumbnail() {
const screenshotType = "thumbnail";
try {
const screenshotType = "thumbnail";
await this.browser.setViewport(this.page, {width: 1920, height: 1080});
const options = screenshotTypes[screenshotType];
const screenshotBuffer = await this.page.screenshot(options);
Expand All @@ -68,7 +68,7 @@ export class Screenshots {
await this.writeBufferToWARC(thumbnailBuffer, screenshotType, options.type);
logger.info(`Screenshot (type: thumbnail) for ${this.url} written to ${this.warcName}`);
} catch (e) {
logger.error(`Taking screenshot (type: thumbnail) failed for ${this.url}`, e.message);
logger.error("Taking screenshot failed", {"page": this.url, type: screenshotType, ...errJSON(e)}, "screenshots");
}
}

Expand Down
5 changes: 3 additions & 2 deletions util/timing.js
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ export function sleep(seconds) {
return new Promise(resolve => setTimeout(resolve, seconds * 1000));
}

export function timedRun(promise, seconds, message="Promise timed out", logDetails={}, context="general") {
export function timedRun(promise, seconds, message="Promise timed out", logDetails={}, context="general", isWarn=false) {
// return Promise return value or log error if timeout is reached first
const timeout = seconds * 1000;

Expand All @@ -17,7 +17,8 @@ export function timedRun(promise, seconds, message="Promise timed out", logDetai
return Promise.race([promise, rejectPromiseOnTimeout(timeout)])
.catch((err) => {
if (err == "timeout reached") {
logger.error(message, {"seconds": seconds, ...logDetails}, context);
const logFunc = isWarn ? logger.warn : logger.error;
logFunc.call(logger, message, {"seconds": seconds, ...logDetails}, context);
} else {
//logger.error("Unknown exception", {...errJSON(err), ...logDetails}, context);
throw err;
Expand Down
6 changes: 4 additions & 2 deletions util/worker.js
Original file line number Diff line number Diff line change
Expand Up @@ -167,14 +167,16 @@ export class PageWorker
this.crawler.crawlPage(opts),
this.maxPageTime,
"Page Worker Timeout",
{workerid},
this.logDetails,
"worker"
),
this.crashBreak
]);

} catch (e) {
logger.error("Worker Exception", {...errJSON(e), ...this.logDetails}, "worker");
if (e.message !== "logged") {
logger.error("Worker Exception", {...errJSON(e), ...this.logDetails}, "worker");
}
} finally {
await this.crawler.pageFinished(data);
}
Expand Down

0 comments on commit 82ff0f4

Please sign in to comment.