Skip to content

Commit

Permalink
Support loading array of custom behaviors from URL, file, or dir
Browse files Browse the repository at this point in the history
  • Loading branch information
tw4l committed Oct 21, 2024
1 parent 0d39ea3 commit c9e6d76
Show file tree
Hide file tree
Showing 5 changed files with 108 additions and 30 deletions.
2 changes: 2 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,8 @@ RUN ln -s /app/dist/main.js /usr/bin/crawl; \
ln -s /app/dist/main.js /usr/bin/qa; \
ln -s /app/dist/create-login-profile.js /usr/bin/create-login-profile

RUN mkdir -p /app/behaviors

WORKDIR /crawls

# enable to test custom behaviors build (from browsertrix-behaviors)
Expand Down
14 changes: 7 additions & 7 deletions src/crawler.ts
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ import {
runWorkers,
} from "./util/worker.js";
import { sleep, timedRun, secondsElapsed } from "./util/timing.js";
import { collectAllFileSources, getInfoString } from "./util/file_reader.js";
import { collectCustomBehaviors, getInfoString } from "./util/file_reader.js";

import { Browser } from "./util/browser.js";

Expand Down Expand Up @@ -510,7 +510,7 @@ export class Crawler {
}

if (this.params.customBehaviors) {
this.customBehaviors = this.loadCustomBehaviors(
this.customBehaviors = await this.loadCustomBehaviors(
this.params.customBehaviors,
);
}
Expand Down Expand Up @@ -800,24 +800,24 @@ self.__bx_behaviors.selectMainBehavior();
});
}

loadCustomBehaviors(filename: string) {
async loadCustomBehaviors(sources: string[]) {
let str = "";

for (const { contents } of collectAllFileSources(filename, ".js")) {
for (const { contents } of await collectCustomBehaviors(sources)) {
str += `self.__bx_behaviors.load(${contents});\n`;
}

return str;
}

async checkBehaviorScripts(cdp: CDPSession) {
const filename = this.params.customBehaviors;
const sources = this.params.customBehaviors;

if (!filename) {
if (!sources) {
return;
}

for (const { path, contents } of collectAllFileSources(filename, ".js")) {
for (const { path, contents } of await collectCustomBehaviors(sources)) {
await this.browser.checkScript(cdp, path, contents);
}
}
Expand Down
6 changes: 4 additions & 2 deletions src/util/argParser.ts
Original file line number Diff line number Diff line change
Expand Up @@ -551,8 +551,10 @@ class ArgParser {

customBehaviors: {
describe:
"injects a custom behavior file or set of behavior files in a directory",
type: "string",
"Custom behavior files to inject. Values can be URLs, paths to individual behavior files, or paths" +
"to a directory of behavior files.",
type: "array",
default: [],
},

debugAccessRedis: {
Expand Down
90 changes: 70 additions & 20 deletions src/util/file_reader.ts
Original file line number Diff line number Diff line change
@@ -1,27 +1,78 @@
import fs from "fs";
import fsp from "fs/promises";
import path from "path";
import crypto from "crypto";
import { fetch } from "undici";

import { logger } from "./logger.js";

const MAX_DEPTH = 2;

export function collectAllFileSources(
fileOrDir: string,
ext?: string,
depth = 0,
): { path: string; contents: string }[] {
// Add .ts to allowed extensions when we can support it
const ALLOWED_EXTS = [".js"];

/** A single collected behavior script. */
export type FileSource = {
  /** Location the script was read from. */
  path: string;
  /** Text of the script. */
  contents: string;
};

/** An ordered list of collected behavior scripts. */
export type FileSources = Array<FileSource>;

/**
 * Collect custom behavior scripts from a list of sources.
 *
 * Each entry is either an http(s) URL, which is downloaded first, or a local
 * path that is handed to collectLocalPathBehaviors.
 *
 * @param sources - URLs and/or local paths; processed one at a time, in order.
 * @returns The behaviors gathered from every source, in source order.
 */
export async function collectCustomBehaviors(
  sources: string[],
): Promise<FileSources> {
  const collectedSources: FileSources = [];

  for (const fileSource of sources) {
    // Require a full scheme prefix so that a relative local path which merely
    // begins with "http" (e.g. "http-behaviors/") is not mistaken for a URL.
    const isUrl = /^https?:\/\//i.test(fileSource);
    const newSources = isUrl
      ? await collectOnlineBehavior(fileSource)
      : await collectLocalPathBehaviors(fileSource);
    collectedSources.push(...newSources);
  }

  return collectedSources;
}

/**
 * Download a custom behavior script from a URL and load it from disk.
 *
 * The response body is saved under /app/behaviors with a random hex filename
 * and then read back through collectLocalPathBehaviors, so a downloaded script
 * is loaded the same way as a local one.
 *
 * @param url - URL of the behavior script to download.
 * @returns The downloaded behavior, or an empty array when the request,
 *   the write, or the subsequent load fails (the failure is logged).
 */
async function collectOnlineBehavior(url: string): Promise<FileSources> {
  // The ".js" suffix keeps the saved file loadable by the local-path collector,
  // which filters on file extension.
  const filename = crypto.randomBytes(4).toString("hex") + ".js";
  const behaviorFilepath = `/app/behaviors/${filename}`;

  try {
    const res = await fetch(url);

    // fetch() only rejects on network failure; without this check an HTTP
    // error page (404, 500, ...) would be saved and injected as a behavior.
    if (!res.ok) {
      logger.error(
        "Error downloading custom behavior from URL",
        { url, status: res.status },
        "behavior",
      );
      return [];
    }

    const fileContents = await res.text();
    await fsp.writeFile(behaviorFilepath, fileContents);
    logger.info(
      "Custom behavior file downloaded",
      { url, path: behaviorFilepath },
      "behavior",
    );
    return await collectLocalPathBehaviors(behaviorFilepath);
  } catch (e) {
    logger.error(
      "Error downloading custom behavior from URL",
      { url, error: e },
      "behavior",
    );
  }
  return [];
}

async function collectLocalPathBehaviors(fileOrDir: string, depth = 0) {

Check failure on line 60 in src/util/file_reader.ts

View workflow job for this annotation

GitHub Actions / build (20.x)

'collectLocalPathBehaviors' implicitly has return type 'any' because it does not have a return type annotation and is referenced directly or indirectly in one of its return expressions.
const resolvedPath = path.resolve(fileOrDir);

if (depth >= MAX_DEPTH) {
console.warn(
`WARN: MAX_DEPTH of ${MAX_DEPTH} reached traversing "${resolvedPath}"`,
loger.warn(

Check failure on line 64 in src/util/file_reader.ts

View workflow job for this annotation

GitHub Actions / build (20.x)

Cannot find name 'loger'. Did you mean 'logger'?
`Max depth of ${MAX_DEPTH} reached traversing "${resolvedPath}"`,
{},
"behavior",
);
return [];
}

const stat = fs.statSync(resolvedPath);
const stat = await fsp.stat(resolvedPath);

if (stat.isFile() && (ext === null || path.extname(resolvedPath) === ext)) {
const contents = fs.readFileSync(resolvedPath);
if (stat.isFile() && ALLOWED_EXTS.includes(path.extname(resolvedPath))) {
const contents = await fsp.readFile(resolvedPath);
return [
{
path: resolvedPath,
Expand All @@ -31,19 +82,18 @@ export function collectAllFileSources(
}

if (stat.isDirectory()) {
const files = fs.readdirSync(resolvedPath);
return files.reduce(
(acc: { path: string; contents: string }[], next: string) => {
const nextPath = path.join(fileOrDir, next);
return [...acc, ...collectAllFileSources(nextPath, ext, depth + 1)];
},
[],
);
const files = await fsp.readdir(resolvedPath);
return files.reduce((acc: FileSources, next: string) => {

Check failure on line 86 in src/util/file_reader.ts

View workflow job for this annotation

GitHub Actions / build (20.x)

Function implicitly has return type 'any' because it does not have a return type annotation and is referenced directly or indirectly in one of its return expressions.
const nextPath = path.join(fileOrDir, next);
return [...acc, ...collectLocalPathBehaviors(nextPath, depth + 1)];
}, []);
}

if (depth === 0) {
console.warn(
logger.warn(
`WARN: The provided path "${resolvedPath}" is not a .js file or directory.`,
{},
"behavior",
);
}

Expand Down
26 changes: 25 additions & 1 deletion tests/custom-behavior.test.js
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import child_process from "child_process";

test("test custom behaviors", async () => {
test("test custom behaviors from local filepath", async () => {
const res = child_process.execSync(
"docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/custom-behaviors/:/custom-behaviors/ webrecorder/browsertrix-crawler crawl --url https://example.com/ --url https://example.org/ --url https://webrecorder.net/ --customBehaviors /custom-behaviors/ --scopeType page",
);
Expand Down Expand Up @@ -35,6 +35,30 @@ test("test custom behaviors", async () => {
).toBe(true);
});

// Crawl a single page with one behavior supplied as a URL and confirm the
// crawler logged that the behavior file was downloaded.
test("test custom behavior from URL", async () => {
  const output = child_process.execSync(
    'docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://webrecorder.net/ --customBehaviors https://raw.githubusercontent.com/webrecorder/browsertrix-behaviors/main/dist/behaviors.js --scopeType page --behaviors ""',
  );

  const downloadedAt = output
    .toString()
    .indexOf("Custom behavior file downloaded");

  expect(downloadedAt).toBeGreaterThan(0);
});

// Crawl a single page with two behavior sources, one URL and one local file,
// and confirm that both were picked up.
test("test mixed custom behavior sources", async () => {
  // The local behavior path is resolved inside the container, so the host
  // directory holding it must be bind-mounted at /custom-behaviors/ (same
  // mount as the local-filepath test); without it the file does not exist.
  const res = child_process.execSync(
    'docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/custom-behaviors/:/custom-behaviors/ webrecorder/browsertrix-crawler crawl --url https://webrecorder.net/ --customBehaviors https://raw.githubusercontent.com/webrecorder/browsertrix-behaviors/main/dist/behaviors.js --customBehaviors /custom-behaviors/custom-2.js --scopeType page --behaviors ""',
  );

  const log = res.toString();

  // test custom behavior from url ran
  expect(log.indexOf("Custom behavior file downloaded") > 0).toBe(true);

  // test custom behavior from local file ran
  expect(
    log.indexOf(
      '{"state":{},"msg":"test-stat-2","page":"https://webrecorder.net/","workerid":0}}',
    ) > 0,
  ).toBe(true);
});

test("test invalid behavior exit", async () => {
let status = 0;

Expand Down

0 comments on commit c9e6d76

Please sign in to comment.