Skip to content

Commit

Permalink
crawler args typing (#680)
Browse files Browse the repository at this point in the history
- Refactors args parsing so that `Crawler.params` is properly typed with
CLI options + additions with `CrawlerArgs` type.
- also adds typing to create-login-profile CLI options
- validation still done w/o typing due to yargs limitations
- tests: exclude slow page from tests for faster test runs
  • Loading branch information
ikreymer authored Sep 6, 2024
1 parent 802a416 commit 9c9643c
Show file tree
Hide file tree
Showing 14 changed files with 751 additions and 707 deletions.
24 changes: 12 additions & 12 deletions docs/docs/user-guide/cli-options.md
Original file line number Diff line number Diff line change
Expand Up @@ -43,9 +43,9 @@ Options:
tom"]
--scopeIncludeRx, --include Regex of page URLs that should be in
cluded in the crawl (defaults to the
immediate directory of URL)
immediate directory of URL)[string]
--scopeExcludeRx, --exclude Regex of page URLs that should be ex
cluded from the crawl.
cluded from the crawl. [string]
--allowHashUrls Allow Hashtag URLs, useful for singl
e-page-application crawling or when
different hashtags load dynamic cont
Expand All @@ -56,14 +56,14 @@ Options:
an iframe [array] [default: []]
--blockMessage If specified, when a URL is blocked,
a record with this error message is
added instead [string]
added instead[string] [default: ""]
--blockAds, --blockads If set, block advertisements from be
ing loaded (based on Stephen Black's
blocklist)
[boolean] [default: false]
--adBlockMessage If specified, when an ad is blocked,
a record with this error message is
added instead [string]
added instead[string] [default: ""]
-c, --collection Collection name to crawl to (replay
will be accessible under this name i
n pywb preview)
Expand All @@ -79,7 +79,7 @@ Options:
ineWarc [boolean] [default: false]
--rolloverSize If set, declare the rollover size
[number] [default: 1000000000]
--generateWACZ, --generatewacz, --ge If set, generate wacz
--generateWACZ, --generatewacz, --ge If set, generate WACZ on disk
nerateWacz [boolean] [default: false]
--logging Logging options for crawler, can inc
lude: stats (enabled by default), js
Expand All @@ -94,15 +94,15 @@ Options:
, "state", "redis", "storage", "text", "exclusion", "screenshots", "screencast
", "originOverride", "healthcheck", "browser", "blocking", "behavior", "behavi
orScript", "jsError", "fetch", "pageStatus", "memoryStatus", "crawlStatus", "l
inks", "sitemap", "replay", "proxy"] [default: []]
inks", "sitemap", "wacz", "replay", "proxy"] [default: []]
--logExcludeContext Comma-separated list of contexts to
NOT include in logs
[array] [choices: "general", "worker", "recorder", "recorderNetwork", "writer"
, "state", "redis", "storage", "text", "exclusion", "screenshots", "screencast
", "originOverride", "healthcheck", "browser", "blocking", "behavior", "behavi
orScript", "jsError", "fetch", "pageStatus", "memoryStatus", "crawlStatus", "l
inks", "sitemap", "replay", "proxy"] [default: ["recorderNetwork","jsError","s
creencast"]]
inks", "sitemap", "wacz", "replay", "proxy"] [default: ["recorderNetwork","jsE
rror","screencast"]]
--text Extract initial (default) or final t
ext to pages.jsonl or WARC resource
record(s)
Expand All @@ -127,15 +127,15 @@ Options:
those greater than or equal to (>=)
provided ISO Date string (YYYY-MM-D
D or YYYY-MM-DDTHH:MM:SS or partial
date)
date) [string]
--sitemapToDate, --sitemapTo If set, filter URLs from sitemaps to
those less than or equal to (<=) pr
ovided ISO Date string (YYYY-MM-DD o
r YYYY-MM-DDTHH:MM:SS or partial dat
e)
e) [string]
--statsFilename If set, output stats as JSON to this
file. (Relative filename resolves t
o crawl working directory)
o crawl working directory) [string]
--behaviors Which background behaviors to enable
on each page
[array] [choices: "autoplay", "autofetch", "autoscroll", "siteSpecific"] [defa
Expand Down Expand Up @@ -304,7 +304,7 @@ Options:
--shutdownWait Shutdown browser in interactive after this many seco
nds, if no pings received [number] [default: 0]
--profile Path or HTTP(S) URL to tar.gz file which contains th
e browser profile directory [string]
e browser profile directory [string] [default: ""]
--windowSize Browser window dimensions, specified as: width,heigh
t [string] [default: "1360,1020"]
--cookieDays If >0, set all cookies, including session cookies, t
Expand Down
29 changes: 17 additions & 12 deletions src/crawler.ts
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ import {
WorkerId,
} from "./util/state.js";

import { parseArgs } from "./util/argParser.js";
import { CrawlerArgs, parseArgs } from "./util/argParser.js";

import yaml from "js-yaml";

Expand Down Expand Up @@ -52,7 +52,7 @@ import {
SITEMAP_INITIAL_FETCH_TIMEOUT_SECS,
} from "./util/constants.js";

import { AdBlockRules, BlockRules } from "./util/blockrules.js";
import { AdBlockRules, BlockRuleDecl, BlockRules } from "./util/blockrules.js";
import { OriginOverride } from "./util/originoverride.js";

import {
Expand Down Expand Up @@ -107,8 +107,7 @@ type PageEntry = {

// ============================================================================
export class Crawler {
// eslint-disable-next-line @typescript-eslint/no-explicit-any
params: any;
params: CrawlerArgs;
// eslint-disable-next-line @typescript-eslint/no-explicit-any
origConfig: any;

Expand Down Expand Up @@ -200,8 +199,8 @@ export class Crawler {

constructor() {
const args = this.parseArgs();
this.params = args.parsed;
this.origConfig = args.origConfig;
this.params = args as CrawlerArgs;
this.origConfig = this.params.origConfig;

// root collections dir
this.collDir = path.join(
Expand Down Expand Up @@ -872,7 +871,7 @@ self.__bx_behaviors.selectMainBehavior();

const result = await timedRun(
directFetchCapture({ url, headers, cdp }),
this.params.timeout,
this.params.pageLoadTimeout,
"Direct fetch of page URL timed out",
logDetails,
"fetch",
Expand Down Expand Up @@ -1396,7 +1395,7 @@ self.__bx_behaviors.selectMainBehavior();

if (this.params.blockRules && this.params.blockRules.length) {
this.blockRules = new BlockRules(
this.params.blockRules,
this.params.blockRules as BlockRuleDecl[],
this.captureBasePrefix,
this.params.blockMessage,
);
Expand All @@ -1405,7 +1404,9 @@ self.__bx_behaviors.selectMainBehavior();
this.screencaster = this.initScreenCaster();

if (this.params.originOverride && this.params.originOverride.length) {
this.originOverride = new OriginOverride(this.params.originOverride);
this.originOverride = new OriginOverride(
this.params.originOverride as string[],
);
}

await this._addInitialSeeds();
Expand Down Expand Up @@ -2183,7 +2184,7 @@ self.__bx_behaviors.selectMainBehavior();
id: "pages",
title,
};
header.hasText = this.params.text.includes("to-pages");
header.hasText = this.params.text.includes("to-pages") + "";
if (this.params.text.length) {
logger.debug("Text Extraction: " + this.params.text.join(","));
} else {
Expand Down Expand Up @@ -2290,8 +2291,12 @@ self.__bx_behaviors.selectMainBehavior();
return;
}

const fromDate = this.params.sitemapFromDate;
const toDate = this.params.sitemapToDate;
const fromDate = this.params.sitemapFromDate
? new Date(this.params.sitemapFromDate)
: undefined;
const toDate = this.params.sitemapToDate
? new Date(this.params.sitemapToDate)
: undefined;
const headers = this.headers;

logger.info(
Expand Down
197 changes: 102 additions & 95 deletions src/create-login-profile.ts
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ import http, { IncomingMessage, ServerResponse } from "http";
import readline from "readline";
import child_process from "child_process";

import yargs, { Options } from "yargs";
import yargs from "yargs";

import { logger } from "./util/logger.js";

Expand Down Expand Up @@ -35,96 +35,106 @@ const behaviors = fs.readFileSync(
{ encoding: "utf8" },
);

function cliOpts(): { [key: string]: Options } {
return {
url: {
describe: "The URL of the login page",
type: "string",
demandOption: true,
},

user: {
describe:
"The username for the login. If not specified, will be prompted",
},

password: {
describe:
"The password for the login. If not specified, will be prompted (recommended)",
},

filename: {
describe:
"The filename for the profile tarball, stored within /crawls/profiles if absolute path not provided",
default: "/crawls/profiles/profile.tar.gz",
},

debugScreenshot: {
describe:
"If specified, take a screenshot after login and save as this filename",
},

headless: {
describe: "Run in headless mode, otherwise start xvfb",
type: "boolean",
default: false,
},

automated: {
describe: "Start in automated mode, no interactive browser",
type: "boolean",
default: false,
},

interactive: {
describe: "Deprecated. Now the default option!",
type: "boolean",
default: false,
},

shutdownWait: {
describe:
"Shutdown browser in interactive after this many seconds, if no pings received",
type: "number",
default: 0,
},

profile: {
describe:
"Path or HTTP(S) URL to tar.gz file which contains the browser profile directory",
type: "string",
},

windowSize: {
type: "string",
describe: "Browser window dimensions, specified as: width,height",
default: getDefaultWindowSize(),
},

cookieDays: {
type: "number",
describe:
"If >0, set all cookies, including session cookies, to have this duration in days before saving profile",
default: 7,
},

proxyServer: {
describe:
"if set, will use specified proxy server. Takes precedence over any env var proxy settings",
type: "string",
},

sshProxyPrivateKeyFile: {
describe: "path to SSH private key for SOCKS5 over SSH proxy connection",
type: "string",
},

sshProxyKnownHostsFile: {
describe:
"path to SSH known hosts file for SOCKS5 over SSH proxy connection",
type: "string",
},
};
/**
 * Declares and parses the CLI options for the create-login-profile command.
 *
 * The option specs are kept inline in the `.options({...})` call so that
 * yargs can infer a precise type for the parsed argv from the literal
 * `type`/`default` fields (extracting them to a plain const would widen
 * the literals and lose that inference).
 *
 * @returns the synchronously parsed, typed argv object
 */
function initArgs() {
  return yargs(process.argv)
    .usage("browsertrix-crawler profile [options]")
    .options({
      url: {
        describe: "The URL of the login page",
        type: "string",
        demandOption: true,
      },

      user: {
        describe:
          "The username for the login. If not specified, will be prompted",
        type: "string",
      },

      password: {
        describe:
          "The password for the login. If not specified, will be prompted (recommended)",
        type: "string",
      },

      filename: {
        describe:
          "The filename for the profile tarball, stored within /crawls/profiles if absolute path not provided",
        type: "string",
        default: "/crawls/profiles/profile.tar.gz",
      },

      debugScreenshot: {
        describe:
          "If specified, take a screenshot after login and save as this filename",
        // Fix: this option holds a screenshot *filename*, per its description,
        // so it must be a string. Declaring it as boolean (default: false)
        // would make the saved-path value unusable.
        type: "string",
      },

      headless: {
        describe: "Run in headless mode, otherwise start xvfb",
        type: "boolean",
        default: false,
      },

      automated: {
        describe: "Start in automated mode, no interactive browser",
        type: "boolean",
        default: false,
      },

      interactive: {
        describe: "Deprecated. Now the default option!",
        type: "boolean",
        default: false,
      },

      shutdownWait: {
        describe:
          "Shutdown browser in interactive after this many seconds, if no pings received",
        type: "number",
        default: 0,
      },

      profile: {
        describe:
          "Path or HTTP(S) URL to tar.gz file which contains the browser profile directory",
        type: "string",
        default: "",
      },

      windowSize: {
        describe: "Browser window dimensions, specified as: width,height",
        type: "string",
        default: getDefaultWindowSize(),
      },

      cookieDays: {
        describe:
          "If >0, set all cookies, including session cookies, to have this duration in days before saving profile",
        type: "number",
        default: 7,
      },

      proxyServer: {
        describe:
          "if set, will use specified proxy server. Takes precedence over any env var proxy settings",
        type: "string",
      },

      sshProxyPrivateKeyFile: {
        describe:
          "path to SSH private key for SOCKS5 over SSH proxy connection",
        type: "string",
      },

      sshProxyKnownHostsFile: {
        describe:
          "path to SSH known hosts file for SOCKS5 over SSH proxy connection",
        type: "string",
      },
    })
    .parseSync();
}

function getDefaultWindowSize() {
Expand All @@ -140,10 +150,7 @@ function handleTerminate(signame: string) {
}

async function main() {
// eslint-disable-next-line @typescript-eslint/no-explicit-any
const params: any = yargs(process.argv)
.usage("browsertrix-crawler profile [options]")
.option(cliOpts()).argv;
const params = initArgs();

logger.setDebugLogging(true);

Expand Down
Loading

0 comments on commit 9c9643c

Please sign in to comment.