Merge branch 'main' into recorder-work
ikreymer committed Jul 27, 2023
2 parents e332ee4 + 442f448 commit cf53a51
Showing 15 changed files with 441 additions and 203 deletions.
7 changes: 6 additions & 1 deletion README.md
@@ -69,7 +69,7 @@ Options:
--crawlId, --id A user provided ID for this crawl or
crawl configuration (can also be se
t via CRAWL_ID env var)
[string] [default: "97792ef37eaf"]
[string] [default: "7760c6c5f6ca"]
--newContext Deprecated as of 0.8.0, any values p
assed will be ignored
[string] [default: null]
@@ -176,6 +176,8 @@ Options:
econds) after behaviors before movin
g on to next page
[number] [default: 0]
--dedupPolicy Deduplication policy
[string] [choices: "skip", "revisit", "keep"] [default: "skip"]
--profile Path to tar.gz file which will be ex
tracted and used as the browser prof
ile [string]
@@ -247,6 +249,9 @@ Options:
[array] [default: []]
--logErrorsToRedis If set, write error messages to redi
s [boolean] [default: false]
--failOnFailedSeed If set, crawler will fail with exit
code 1 if any seed fails
[boolean] [default: false]
--config Path to YAML config file
```
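Taken together, the README additions document two new crawl options. A hypothetical invocation combining them (URLs and collection name are illustrative, following the `docker run` pattern used in the tests below):

```
docker run -v $PWD/crawls:/crawls/ webrecorder/browsertrix-crawler crawl \
  --url https://example.com/ \
  --dedupPolicy revisit \
  --failOnFailedSeed \
  --generateWACZ --collection example-crawl
```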
2 changes: 1 addition & 1 deletion config/config.yaml
@@ -8,7 +8,7 @@ proxy:
enable_wombat: true

recorder:
dedup_policy: skip
dedup_policy: ${DEDUP_POLICY}
source_coll: live
cache: always
rollover_size: ${ROLLOVER_SIZE}
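The hard-coded `skip` policy becomes an environment-variable template, presumably so the new `--dedupPolicy` CLI option (see util/argParser.js below) can reach the pywb recorder. A minimal sketch of the assumed wiring, which is not itself shown in this diff:

```
# assumed, not shown in this diff: the crawler exports the chosen policy
# before pywb reads config.yaml
export DEDUP_POLICY=revisit   # one of: skip, revisit, keep
# pywb then expands ${DEDUP_POLICY}, yielding: dedup_policy: revisit
```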
57 changes: 33 additions & 24 deletions crawler.js
@@ -12,14 +12,15 @@ import * as warcio from "warcio";

import { HealthChecker } from "./util/healthcheck.js";
import { TextExtract } from "./util/textextract.js";
import { initStorage, getFileSize, getDirSize, interpolateFilename, getDiskUsage } from "./util/storage.js";
import { initStorage, getFileSize, getDirSize, interpolateFilename, checkDiskUtilization } from "./util/storage.js";
import { ScreenCaster, WSTransport, RedisPubSubTransport } from "./util/screencaster.js";
import { Screenshots } from "./util/screenshots.js";
import { parseArgs } from "./util/argParser.js";
import { initRedis } from "./util/redis.js";
import { logger, errJSON } from "./util/logger.js";
import { runWorkers } from "./util/worker.js";
import { sleep, timedRun, secondsElapsed } from "./util/timing.js";
import { collectAllFileSources } from "./util/file_reader.js";

import { Browser } from "./util/browser.js";

@@ -120,6 +121,7 @@ export class Crawler {

this.done = false;

this.customBehaviors = "";
this.behaviorLastLine = null;

this.browser = new Browser();
@@ -251,6 +253,10 @@
}
}

if (this.params.customBehaviors) {
this.customBehaviors = this.loadCustomBehaviors(this.params.customBehaviors);
}

this.headers = {"User-Agent": this.configureUA()};

process.on("exit", () => {
@@ -336,6 +342,10 @@
}
break;

case "error":
logger.error(message, details, "behaviorScript");
break;

case "debug":
default:
logger.debug(message, details, "behaviorScript");
@@ -388,8 +398,26 @@

if (this.params.behaviorOpts) {
await page.exposeFunction(BEHAVIOR_LOG_FUNC, (logdata) => this._behaviorLog(logdata, page.url(), workerid));
await this.browser.addInitScript(page, behaviors + `;\nself.__bx_behaviors.init(${this.params.behaviorOpts});`);
await this.browser.addInitScript(page, behaviors);

const initScript = `
self.__bx_behaviors.init(${this.params.behaviorOpts}, false);
${this.customBehaviors}
self.__bx_behaviors.selectMainBehavior();
`;

await this.browser.addInitScript(page, initScript);
}
}

loadCustomBehaviors(filename) {
let str = "";

for (const source of collectAllFileSources(filename, ".js")) {
str += `self.__bx_behaviors.load(${source});\n`;
}

return str;
}

async crawlPage(opts) {
@@ -604,28 +632,9 @@
}

if (this.params.diskUtilization) {
// Check that disk usage isn't already above threshold
const diskUsage = await getDiskUsage();
const usedPercentage = parseInt(diskUsage["Use%"].slice(0, -1));
if (usedPercentage >= this.params.diskUtilization) {
logger.info(`Disk utilization threshold reached ${usedPercentage}% > ${this.params.diskUtilization}%, stopping`);
interrupt = true;
}

// Check that disk usage isn't likely to cross threshold
const kbUsed = parseInt(diskUsage["Used"]);
const kbTotal = parseInt(diskUsage["1K-blocks"]);
let kbArchiveDirSize = Math.floor(size/1024);
if (this.params.combineWARC && this.params.generateWACZ) {
kbArchiveDirSize *= 4;
} else if (this.params.combineWARC || this.params.generateWACZ) {
kbArchiveDirSize *= 2;
}

const projectedTotal = kbUsed + kbArchiveDirSize;
const projectedUsedPercentage = Math.floor(kbTotal/projectedTotal);
if (projectedUsedPercentage >= this.params.diskUtilization) {
logger.info(`Disk utilization projected to reach threshold ${projectedUsedPercentage}% > ${this.params.diskUtilization}%, stopping`);
// Check that disk usage isn't already or soon to be above threshold
const diskUtil = await checkDiskUtilization(this.params, size);
if (diskUtil.stop === true) {
interrupt = true;
}
}
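Taken together, the behavior changes above assemble a single init script per page: the bundled behaviors load first, each collected custom behavior source is registered via `self.__bx_behaviors.load(...)`, and `selectMainBehavior()` then picks whichever registered behavior's `isMatch()` returns true for the current page. A sketch of the script a page ultimately evaluates, assuming one custom file and illustrative `behaviorOpts`:

```js
// hypothetical assembled initScript; the behaviorOpts values are placeholders
self.__bx_behaviors.init({"autofetch": true, "autoscroll": true}, false);

/* src: /custom-behaviors/custom.js */
self.__bx_behaviors.load(class TestBehavior {
  static get id() { return "TestBehavior"; }
  static isMatch() { return window.location.origin === "https://example.com"; }
  static init() { return { state: {} }; }
  async* run(ctx) { yield ctx.Lib.getState(ctx, "test-stat"); }
});

self.__bx_behaviors.selectMainBehavior();
```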
8 changes: 4 additions & 4 deletions package.json
@@ -1,6 +1,6 @@
{
"name": "browsertrix-crawler",
"version": "0.10.1",
"version": "0.11.0",
"main": "browsertrix-crawler",
"type": "module",
"repository": "https://github.com/webrecorder/browsertrix-crawler",
@@ -12,14 +12,14 @@
},
"dependencies": {
"@novnc/novnc": "^1.4.0",
"@webrecorder/wabac": "^2.15.3",
"browsertrix-behaviors": "github:webrecorder/browsertrix-behaviors#custom-fetch-header",
"@webrecorder/wabac": "^2.16.6",
"browsertrix-behaviors": "^0.5.1",
"get-folder-size": "^4.0.0",
"ioredis": "^4.27.1",
"js-yaml": "^4.1.0",
"minio": "7.0.26",
"p-queue": "^7.3.4",
"puppeteer-core": "^20.4.0",
"puppeteer-core": "^20.9.0",
"sharp": "^0.32.1",
"sitemapper": "^3.1.2",
"uuid": "8.3.2",
2 changes: 1 addition & 1 deletion requirements.txt
@@ -1,4 +1,4 @@
pywb>=2.7.4
uwsgi
wacz>=0.4.8
wacz>=0.4.9
requests[socks]
3 changes: 3 additions & 0 deletions test-setup.js
@@ -0,0 +1,3 @@
import { jest } from "@jest/globals";

global.jest = jest;
23 changes: 23 additions & 0 deletions tests/custom-behavior.test.js
@@ -0,0 +1,23 @@
import child_process from "child_process";

test("test custom behaviors", async () => {
const res = child_process.execSync("docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/custom-behaviors/:/custom-behaviors/ webrecorder/browsertrix-crawler crawl --url https://example.com/ --url https://example.org/ --url https://webrecorder.net/ --customBehaviors /custom-behaviors/ --scopeType page");

const log = res.toString();

// custom behavior ran for example.com
expect(log.indexOf("{\"state\":{},\"msg\":\"test-stat\",\"page\":\"https://example.com/\",\"workerid\":0}}") > 0).toBe(true);

// but not for example.org
expect(log.indexOf("{\"state\":{},\"msg\":\"test-stat\",\"page\":\"https://example.org/\",\"workerid\":0}}") > 0).toBe(false);

expect(log.indexOf("{\"state\":{\"segments\":1},\"msg\":\"Skipping autoscroll, page seems to not be responsive to scrolling events\",\"page\":\"https://example.org/\",\"workerid\":0}}") > 0).toBe(true);

// another custom behavior ran for webrecorder.net
expect(log.indexOf("{\"state\":{},\"msg\":\"test-stat-2\",\"page\":\"https://webrecorder.net/\",\"workerid\":0}}") > 0).toBe(true);



});


22 changes: 22 additions & 0 deletions tests/custom-behaviors/custom-2.js
@@ -0,0 +1,22 @@
class TestBehavior2
{
static init() {
return {
state: {}
};
}

static get id() {
return "TestBehavior2";
}

static isMatch() {
return window.location.origin === "https://webrecorder.net";
}


async* run(ctx) {
ctx.log("In Test Behavior 2!");
yield ctx.Lib.getState(ctx, "test-stat-2");
}
}
22 changes: 22 additions & 0 deletions tests/custom-behaviors/custom.js
@@ -0,0 +1,22 @@
class TestBehavior
{
static init() {
return {
state: {}
};
}

static get id() {
return "TestBehavior";
}

static isMatch() {
return window.location.origin === "https://example.com";
}


async* run(ctx) {
ctx.log("In Test Behavior!");
yield ctx.Lib.getState(ctx, "test-stat");
}
}
62 changes: 62 additions & 0 deletions tests/storage.test.js
@@ -0,0 +1,62 @@
import { calculatePercentageUsed, checkDiskUtilization } from "../util/storage.js";


test("ensure calculatePercentageUsed returns expected values", () => {
expect(calculatePercentageUsed(30, 100)).toEqual(30);

expect(calculatePercentageUsed(1507, 35750)).toEqual(4);

expect(calculatePercentageUsed(33819, 35750)).toEqual(95);

expect(calculatePercentageUsed(140, 70)).toEqual(200);

expect(calculatePercentageUsed(0, 5)).toEqual(0);
});


test("verify end-to-end disk utilization not exceeded threshold", async () => {

const params = {
diskUtilization: 90,
combineWARC: true,
generateWACZ: true
};

const mockDfOutput = `\
Filesystem 1K-blocks Used Available Use% Mounted on
grpcfuse 1000000 285000 715000 28% /crawls`;

// with combineWARC + generateWACZ, projected is 285k + 4 * 5k = 305k ≈ 31%
// does not exceed 90% threshold
const returnValue = await checkDiskUtilization(params, 5000 * 1024, mockDfOutput);
expect(returnValue).toEqual({
stop: false,
used: 28,
projected: 31,
threshold: 90
});
});


test("verify end-to-end disk utilization exceeds threshold", async () => {

const params = {
diskUtilization: 90,
combineWARC: false,
generateWACZ: true
};

const mockDfOutput = `\
Filesystem 1K-blocks Used Available Use% Mounted on
grpcfuse 100000 85000 15000 85% /crawls`;

// with generateWACZ, projected is 85k + 3k x 2 = 91k = 91%
// exceeds 90% threshold
const returnValue = await checkDiskUtilization(params, 3000 * 1024, mockDfOutput);
expect(returnValue).toEqual({
stop: true,
used: 85,
projected: 91,
threshold: 90
});
});
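These tests pin down the contract of the helper that replaced the inline crawler.js logic: current usage is read from df's Use% column, projected usage multiplies the pending archive size once per extra output format (2x for either combineWARC or generateWACZ, 4x for both, matching the multipliers in the removed code), and stop is set when either figure reaches the threshold. A sketch consistent with the tests, though not the verbatim util/storage.js implementation (which presumably runs `df` itself when no output is passed in):

```js
// sketch of checkDiskUtilization as implied by the tests above; illustrative only
export function calculatePercentageUsed(used, total) {
  return Math.round((used / total) * 100);
}

export async function checkDiskUtilization(params, archiveDirSizeBytes, dfOutput) {
  // second line of `df` output: Filesystem 1K-blocks Used Available Use% Mounted on
  const cols = dfOutput.trim().split("\n")[1].split(/\s+/);
  const kbTotal = parseInt(cols[1]);
  const kbUsed = parseInt(cols[2]);
  const used = parseInt(cols[4]); // "28%" -> 28

  // each extra output format (combined WARC, WACZ) copies the archive data once more
  let multiplier = 1;
  if (params.combineWARC) { multiplier *= 2; }
  if (params.generateWACZ) { multiplier *= 2; }

  const kbArchiveDirSize = Math.floor(archiveDirSizeBytes / 1024) * multiplier;
  const projected = calculatePercentageUsed(kbUsed + kbArchiveDirSize, kbTotal);

  const threshold = params.diskUtilization;
  return { stop: used >= threshold || projected >= threshold, used, projected, threshold };
}
```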
14 changes: 13 additions & 1 deletion util/argParser.js
@@ -250,6 +250,13 @@
type: "number",
},

"dedupPolicy": {
describe: "Deduplication policy",
default: "skip",
type: "string",
choices: ["skip", "revisit", "keep"],
},

"profile": {
describe: "Path to tar.gz file which will be extracted and used as the browser profile",
type: "string",
@@ -378,7 +385,12 @@
describe: "If set, crawler will fail with exit code 1 if any seed fails",
type: "boolean",
default: false
}
},

"customBehaviors": {
describe: "injects a custom behavior file or set of behavior files in a directory",
type: ["string"]
},
};
}

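Because any option can also be supplied through the YAML file named by `--config`, the new options should nest there as well; a hypothetical fragment (key names assumed to mirror the CLI flags):

```yaml
# hypothetical crawl config passed via --config; keys assumed to mirror CLI flags
seeds:
  - https://example.com/
dedupPolicy: revisit
failOnFailedSeed: true
customBehaviors:
  - /custom-behaviors/
```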
4 changes: 2 additions & 2 deletions util/browser.js
@@ -62,13 +62,13 @@ export class BaseBrowser
await this._init(launchOpts);
}

async setupPage({page, cdp}) {
async setupPage({page}) {
await this.addInitScript(page, "Object.defineProperty(navigator, \"webdriver\", {value: false});");

if (this.customProfile) {
logger.info("Disabling Service Workers for profile", {}, "browser");

await cdp.send("Network.setBypassServiceWorker", {bypass: true});
await page.setBypassServiceWorker(true);
}
}

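Note: `page.setBypassServiceWorker()` is Puppeteer's own wrapper around the same `Network.setBypassServiceWorker` CDP command, which is presumably why the `cdp` handle is no longer needed here and why package.json above bumps puppeteer-core to ^20.9.0.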
33 changes: 33 additions & 0 deletions util/file_reader.js
@@ -0,0 +1,33 @@
import fs from "fs";
import path from "path";

const MAX_DEPTH = 2;

export function collectAllFileSources(fileOrDir, ext = null, depth = 0) {
const resolvedPath = path.resolve(fileOrDir);

if (depth >= MAX_DEPTH) {
console.warn(`WARN: MAX_DEPTH of ${MAX_DEPTH} reached traversing "${resolvedPath}"`);
return [];
}

const stat = fs.statSync(resolvedPath);

if (stat.isFile() && (ext === null || path.extname(resolvedPath) === ext)) {
const contents = fs.readFileSync(resolvedPath);
return [`/* src: ${resolvedPath} */\n\n${contents}`];
}

if (stat.isDirectory()) {
const files = fs.readdirSync(resolvedPath);
return files.reduce((acc, next) => {
const nextPath = path.join(fileOrDir, next);
return [...acc, ...collectAllFileSources(nextPath, ext, depth + 1)];
}, []);
}

if (depth === 0) {
console.warn(`WARN: The provided path "${resolvedPath}" is not a .js file or directory.`);
}

// always return an array so recursive spreads never hit undefined
return [];
}
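A quick usage sketch of the new helper, mirroring how `loadCustomBehaviors()` in crawler.js consumes it (the path is illustrative):

```js
import { collectAllFileSources } from "./util/file_reader.js";

// each returned string is one behavior's source, prefixed with a
// `/* src: ... */` banner identifying the file it came from
let script = "";
for (const source of collectAllFileSources("/custom-behaviors/", ".js")) {
  script += `self.__bx_behaviors.load(${source});\n`;
}
```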