diff --git a/bin/cli.js b/bin/cli.js index 1b3caaf..3608e09 100755 --- a/bin/cli.js +++ b/bin/cli.js @@ -61,6 +61,9 @@ program.command('create') 'If not provided, js-wacz will reindex from WARCS. Must be used in combination ' + 'with --pages, since using this option will skip the step required to generate a ' + 'pages.jsonl file.') + .option( + '-l --log-directory <string>', + 'Path to a directory of log files to copy into WACZ.') .action(async (name, options, command) => { /** @type {Object} */ const values = options._optionValues @@ -111,8 +114,9 @@ program.command('create') description: values?.desc, signingUrl: values?.signingUrl, signingToken: values?.signingToken, - pages: values?.pages, - cdxj: values?.cdxj, + pagesDir: values?.pages, + cdxjDir: values?.cdxj, + logDir: values?.logDirectory, log }) } catch (err) { diff --git a/constants.js b/constants.js index ddd4fff..fb58e82 100644 --- a/constants.js +++ b/constants.js @@ -40,6 +40,18 @@ export const EXTRA_PAGES_FIXTURE_PATH = `${PAGES_DIR_FIXTURES_PATH}extraPages.js */ export const CDXJ_DIR_FIXTURES_PATH = `${FIXTURES_PATH}cdxj${sep}` +/** + * Path to the fixtures folder log directory sub-directory. + * @constant + */ +export const LOG_DIR_FIXTURES_PATH = `${FIXTURES_PATH}logs${sep}` + +/** + * Path to the sample.log fixture + * @constant + */ +export const LOG_FILE_FIXTURE_PATH = `${LOG_DIR_FIXTURES_PATH}sample.log` + /** * Colors scheme for log level. * @constant diff --git a/fixtures/logs/invalid.md b/fixtures/logs/invalid.md new file mode 100644 index 0000000..83d666d --- /dev/null +++ b/fixtures/logs/invalid.md @@ -0,0 +1,3 @@ +# Markdown file + +This shouldn't be copied into the WACZ due to file extension. 
diff --git a/fixtures/logs/sample.log b/fixtures/logs/sample.log new file mode 100644 index 0000000..2717cc6 --- /dev/null +++ b/fixtures/logs/sample.log @@ -0,0 +1,20 @@ +{"timestamp":"2024-08-13T19:53:20.782Z","logLevel":"info","context":"general","message":"Browsertrix-Crawler 1.2.6 (with warcio.js 2.2.1)","details":{}} +{"timestamp":"2024-08-13T19:53:20.784Z","logLevel":"info","context":"general","message":"Seeds","details":[{"url":"https://webrecorder.net/","scopeType":"prefix","include":["/^https?:\\/\\/webrecorder\\.net\\//"],"exclude":[],"allowHash":false,"depth":-1,"sitemap":null,"auth":null,"_authEncoded":null,"maxExtraHops":0,"maxDepth":1000000}]} +{"timestamp":"2024-08-13T19:53:20.785Z","logLevel":"info","context":"general","message":"Behavior Options","details":{"message":"{\"autoplay\":true,\"autofetch\":true,\"autoscroll\":true,\"siteSpecific\":true,\"log\":\"__bx_log\",\"startEarly\":true}"}} +{"timestamp":"2024-08-13T19:53:22.417Z","logLevel":"info","context":"worker","message":"Creating 1 workers","details":{}} +{"timestamp":"2024-08-13T19:53:22.419Z","logLevel":"info","context":"worker","message":"Worker starting","details":{"workerid":0}} +{"timestamp":"2024-08-13T19:53:22.654Z","logLevel":"info","context":"worker","message":"Starting page","details":{"workerid":0,"page":"https://webrecorder.net/"}} +{"timestamp":"2024-08-13T19:53:22.655Z","logLevel":"info","context":"crawlStatus","message":"Crawl statistics","details":{"crawled":0,"total":1,"pending":1,"failed":0,"limit":{"max":1,"hit":false},"pendingPages":["{\"seedId\":0,\"started\":\"2024-08-13T19:53:22.423Z\",\"extraHops\":0,\"url\":\"https:\\/\\/webrecorder.net\\/\",\"added\":\"2024-08-13T19:53:21.426Z\",\"depth\":0}"]}} +{"timestamp":"2024-08-13T19:53:22.928Z","logLevel":"info","context":"general","message":"Awaiting page load","details":{"page":"https://webrecorder.net/","workerid":0}} +{"timestamp":"2024-08-13T19:53:25.176Z","logLevel":"warn","context":"general","message":"Invalid Page 
- URL must start with http:// or https://","details":{"url":"mailto:info@webrecorder.net","page":"https://webrecorder.net/","workerid":0}} +{"timestamp":"2024-08-13T19:53:25.190Z","logLevel":"info","context":"behavior","message":"Running behaviors","details":{"frames":1,"frameUrls":["https://webrecorder.net/"],"page":"https://webrecorder.net/","workerid":0}} +{"timestamp":"2024-08-13T19:53:25.191Z","logLevel":"info","context":"behavior","message":"Run Script Started","details":{"frameUrl":"https://webrecorder.net/","page":"https://webrecorder.net/","workerid":0}} +{"timestamp":"2024-08-13T19:53:25.200Z","logLevel":"info","context":"behaviorScript","message":"Behavior log","details":{"state":{"segments":1},"msg":"Skipping autoscroll, page seems to not be responsive to scrolling events","page":"https://webrecorder.net/","workerid":0}} +{"timestamp":"2024-08-13T19:53:25.200Z","logLevel":"info","context":"behaviorScript","message":"Behavior log","details":{"state":{"segments":1},"msg":"done!","page":"https://webrecorder.net/","workerid":0}} +{"timestamp":"2024-08-13T19:53:25.702Z","logLevel":"info","context":"behavior","message":"Run Script Finished","details":{"frameUrl":"https://webrecorder.net/","page":"https://webrecorder.net/","workerid":0}} +{"timestamp":"2024-08-13T19:53:25.703Z","logLevel":"info","context":"behavior","message":"Behaviors finished","details":{"finished":1,"page":"https://webrecorder.net/","workerid":0}} +{"timestamp":"2024-08-13T19:53:26.716Z","logLevel":"info","context":"pageStatus","message":"Page Finished","details":{"loadState":4,"page":"https://webrecorder.net/","workerid":0}} +{"timestamp":"2024-08-13T19:53:26.734Z","logLevel":"info","context":"worker","message":"Worker done, all tasks complete","details":{"workerid":0}} +{"timestamp":"2024-08-13T19:53:26.829Z","logLevel":"info","context":"crawlStatus","message":"Crawl statistics","details":{"crawled":1,"total":1,"pending":0,"failed":0,"limit":{"max":1,"hit":true},"pendingPages":[]}} 
+{"timestamp":"2024-08-13T19:53:26.830Z","logLevel":"info","context":"general","message":"Crawling done","details":{}} +{"timestamp":"2024-08-13T19:53:26.831Z","logLevel":"info","context":"general","message":"Exiting, Crawl status: done","details":{}} diff --git a/index.js b/index.js index fbf80fe..b78ab4a 100644 --- a/index.js +++ b/index.js @@ -191,6 +191,12 @@ export class WACZ { */ cdxjDir = null + /** + * Path to directory of log files to copy into WACZ. + * @type {?string} + */ + logDir = null + /** * @param {WACZOptions} options - See {@link WACZOptions} for details. */ @@ -294,15 +300,15 @@ export class WACZ { this.indexFromWARCs = false } - if (options?.pages) { + if (options?.pagesDir) { this.detectPages = false - this.pagesDir = String(options?.pages).trim() + this.pagesDir = String(options?.pagesDir).trim() } - if (options?.cdxj) { + if (options?.cdxjDir) { this.detectPages = false this.indexFromWARCs = false // Added here for clarity, but implied by calls to `this.addCDXJ()` - this.cdxjDir = String(options?.cdxj).trim() + this.cdxjDir = String(options?.cdxjDir).trim() } if (options?.url) { @@ -340,6 +346,10 @@ export class WACZ { } } + if (options?.logDir) { + this.logDir = String(options?.logDir).trim() + } + if (options?.signingToken && this.signingUrl) { this.signingToken = String(options.signingToken) } @@ -398,6 +408,11 @@ export class WACZ { info('Writing WARCs to WACZ') await this.writeWARCsToZip() + if (this.logDir) { + info('Writing logs to WACZ') + await this.writeLogsToZip() + } + info('Writing datapackage.json to WACZ') await this.writeDatapackageToZip() @@ -727,6 +742,37 @@ export class WACZ { } } + /** + * Streams all the files listed in `this.logDir` to the output ZIP. 
+ * @returns {Promise<void>} + */ + writeLogsToZip = async () => { + this.stateCheck() + + const { logDir, addFileToZip, log } = this + + const allowedExts = ['log', 'txt'] + + const logFiles = await fs.readdir(logDir) + + for (const logFile of logFiles) { + const logFilepath = resolve(this.logDir, logFile) + + const ext = logFilepath.toLowerCase().split('.').pop() + if (!allowedExts.includes(ext)) { + log.warn(`Skipping log file ${logFile}, not in allowed extensions (txt, log).`) + continue + } + + try { + await addFileToZip(logFilepath, `logs/${logFile}`) + } catch (err) { + log.trace(err) + throw new Error(`An error occurred while writing "${logFile}" to ZIP.`) + } + } + } + /** * Creates `datapackage.json` out of `this.resources` and writes it to ZIP. * @returns {Promise<void>} */ diff --git a/index.test.js b/index.test.js index 42a5bc3..b15fab5 100644 --- a/index.test.js +++ b/index.test.js @@ -11,7 +11,7 @@ import StreamZip from 'node-stream-zip' import * as dotenv from 'dotenv' import { WACZ } from './index.js' -import { FIXTURES_PATH, PAGES_DIR_FIXTURES_PATH, PAGES_FIXTURE_PATH, EXTRA_PAGES_FIXTURE_PATH, CDXJ_DIR_FIXTURES_PATH } from './constants.js' +import { FIXTURES_PATH, PAGES_DIR_FIXTURES_PATH, PAGES_FIXTURE_PATH, EXTRA_PAGES_FIXTURE_PATH, LOG_DIR_FIXTURES_PATH, LOG_FILE_FIXTURE_PATH, CDXJ_DIR_FIXTURES_PATH } from './constants.js' import { assertSHA256WithPrefix, assertValidWACZSignatureFormat } from './utils/assertions.js' // see https://github.com/motdotla/dotenv#how-do-i-use-dotenv-with-import // Loads env vars from .env if provided @@ -74,8 +74,8 @@ test('WACZ constructor accounts for options.detectPages if valid.', async (_t) = assert.equal(archive.detectPages, false) }) -test('WACZ constructor accounts for options.pages if provided.', async (_t) => { - const archive = new WACZ({ input: FIXTURE_INPUT, pages: PAGES_DIR_FIXTURES_PATH }) +test('WACZ constructor accounts for options.pagesDir if provided.', async (_t) => { + const archive = new WACZ({ input: 
FIXTURE_INPUT, pagesDir: PAGES_DIR_FIXTURES_PATH }) assert.equal(archive.detectPages, false) assert.equal(archive.pagesDir, PAGES_DIR_FIXTURES_PATH) }) @@ -187,6 +187,11 @@ test('WACZ constructor accounts for options.datapackageExtras if provided.', asy assert.equal(archive.datapackageExtras, datapackageExtras) }) +test('WACZ constructor accounts for options.logDir if valid.', async (_t) => { + const archive = new WACZ({ input: FIXTURE_INPUT, logDir: LOG_DIR_FIXTURES_PATH }) + assert.equal(archive.logDir, LOG_DIR_FIXTURES_PATH) +}) + test('addPage adds entry to pagesTree and turns detectPages off.', async (_t) => { const archive = new WACZ({ input: FIXTURE_INPUT }) assert.equal(archive.detectPages, true) @@ -347,7 +352,8 @@ test('WACZ.process with pagesDir option creates valid WACZ with provided pages f url: 'https://lil.law.harvard.edu', title: 'WACZ Title', description: 'WACZ Description', - pages: PAGES_DIR_FIXTURES_PATH + pagesDir: PAGES_DIR_FIXTURES_PATH, + logDir: LOG_DIR_FIXTURES_PATH } const archive = new WACZ(options) @@ -356,10 +362,11 @@ test('WACZ.process with pagesDir option creates valid WACZ with provided pages f const zip = new StreamZip.async({ file: options.output }) // eslint-disable-line - // File in pages fixture directory that are invalid JSONL or have wrong extension + // Files in fixtures directories that are invalid JSONL or have wrong extensions // should not be copied into the WACZ. assert.rejects(async () => await zip.entryData('pages/invalid.jsonl')) assert.rejects(async () => await zip.entryData('pages/invalid.txt')) + assert.rejects(async () => await zip.entryData('logs/invalid.md')) // pages/pages.jsonl and pages/extraPages.jsonl should have same hash as fixtures // they were copied from. 
@@ -373,6 +380,11 @@ test('WACZ.process with pagesDir option creates valid WACZ with provided pages f const extraPagesFixtureHash = await archive.sha256(EXTRA_PAGES_FIXTURE_PATH) assert.equal(datapackageExtraPages.hash, extraPagesFixtureHash) + // log file provided in logDir option should have same hash as fixture + const datapackageLogFile = datapackage.resources.filter(entry => entry.path === 'logs/sample.log')[0] + const logFileFixtureHash = await archive.sha256(LOG_FILE_FIXTURE_PATH) + assert.equal(datapackageLogFile.hash, logFileFixtureHash) + // Delete temp file await fs.unlink(options.output) }) @@ -384,8 +396,8 @@ test('WACZ.process with cdxj option creates valid WACZ with index from provided url: 'https://lil.law.harvard.edu', title: 'WACZ Title', description: 'WACZ Description', - pages: PAGES_DIR_FIXTURES_PATH, - cdxj: CDXJ_DIR_FIXTURES_PATH + pagesDir: PAGES_DIR_FIXTURES_PATH, + cdxjDir: CDXJ_DIR_FIXTURES_PATH } const archive = new WACZ(options) diff --git a/types.js b/types.js index 7c025ec..94e7491 100644 --- a/types.js +++ b/types.js @@ -12,8 +12,9 @@ * @property {?string} signingUrl - If set, will be used to try and sign the resulting archive. * @property {?string} signingToken - Access token to be used in combination with `signingUrl`. * @property {?Object} datapackageExtras - If set, will be appended to datapackage.json under `extras`. - * @property {?string} cdxj - If set, skips indexing and allows for passing CDXJ files "as is". Path to a folder containing CDXJ files. - * @property {?string} pages - If set, allows for passing pre-set pages.jsonl file(s). Path to a folder containing pages files (pages.jsonl, extraPages.jsonl ...). Must be used in combination with `pages`, since using this option will skip the step required to generate a pages.jsonl file. + * @property {?string} cdxjDir - If set, skips indexing and allows for passing CDXJ files "as is". Path to a folder containing CDXJ files. 
+ * @property {?string} pagesDir - If set, allows for passing pre-set pages.jsonl file(s). Path to a folder containing pages files (pages.jsonl, extraPages.jsonl ...). Must be used in combination with `pages`, since using this option will skip the step required to generate a pages.jsonl file. + * @property {?string} logDir - If set, allows for passing existing crawler log files into the WACZ. Path to a folder containing log files. Only files with log or txt extensions in that folder will be copied. * @property {?any} log - Will be used instead of the Console API for logging, if compatible (i.e: loglevel). Defaults to globalThis.console. */