Skip to content

Commit

Permalink
Merge pull request #119 from harvard-lil/issue-118-log-directory
Browse files Browse the repository at this point in the history
Add logDirectory option to copy logs to WACZ
  • Loading branch information
matteocargnelutti authored Aug 28, 2024
2 parents 5014fff + 3ae73b0 commit e7db309
Show file tree
Hide file tree
Showing 7 changed files with 113 additions and 15 deletions.
8 changes: 6 additions & 2 deletions bin/cli.js
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,9 @@ program.command('create')
'If not provided, js-wacz will reindex from WARCS. Must be used in combination ' +
'with --pages, since using this option will skip the step required to generate a ' +
'pages.jsonl file.')
.option(
'-l --log-directory <string>',
'Path to a directory of log files to copy into WACZ.')
.action(async (name, options, command) => {
/** @type {Object} */
const values = options._optionValues
Expand Down Expand Up @@ -111,8 +114,9 @@ program.command('create')
description: values?.desc,
signingUrl: values?.signingUrl,
signingToken: values?.signingToken,
pages: values?.pages,
cdxj: values?.cdxj,
pagesDir: values?.pages,
cdxjDir: values?.cdxj,
logDir: values?.logDirectory,
log
})
} catch (err) {
Expand Down
12 changes: 12 additions & 0 deletions constants.js
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,18 @@ export const EXTRA_PAGES_FIXTURE_PATH = `${PAGES_DIR_FIXTURES_PATH}extraPages.js
*/
export const CDXJ_DIR_FIXTURES_PATH = `${FIXTURES_PATH}cdxj${sep}`

/**
 * Path to the "logs" sub-directory of the fixtures folder.
 * Contains sample crawler log files used by the logDir tests.
 * @constant
 */
export const LOG_DIR_FIXTURES_PATH = `${FIXTURES_PATH}logs${sep}`

/**
 * Path to the sample.log fixture file inside the logs fixtures directory.
 * @constant
 */
export const LOG_FILE_FIXTURE_PATH = `${LOG_DIR_FIXTURES_PATH}sample.log`

/**
* Colors scheme for log level.
* @constant
Expand Down
3 changes: 3 additions & 0 deletions fixtures/logs/invalid.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Markdown file

This shouldn't be copied into the WACZ due to file extension.
20 changes: 20 additions & 0 deletions fixtures/logs/sample.log
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
{"timestamp":"2024-08-13T19:53:20.782Z","logLevel":"info","context":"general","message":"Browsertrix-Crawler 1.2.6 (with warcio.js 2.2.1)","details":{}}
{"timestamp":"2024-08-13T19:53:20.784Z","logLevel":"info","context":"general","message":"Seeds","details":[{"url":"https://webrecorder.net/","scopeType":"prefix","include":["/^https?:\\/\\/webrecorder\\.net\\//"],"exclude":[],"allowHash":false,"depth":-1,"sitemap":null,"auth":null,"_authEncoded":null,"maxExtraHops":0,"maxDepth":1000000}]}
{"timestamp":"2024-08-13T19:53:20.785Z","logLevel":"info","context":"general","message":"Behavior Options","details":{"message":"{\"autoplay\":true,\"autofetch\":true,\"autoscroll\":true,\"siteSpecific\":true,\"log\":\"__bx_log\",\"startEarly\":true}"}}
{"timestamp":"2024-08-13T19:53:22.417Z","logLevel":"info","context":"worker","message":"Creating 1 workers","details":{}}
{"timestamp":"2024-08-13T19:53:22.419Z","logLevel":"info","context":"worker","message":"Worker starting","details":{"workerid":0}}
{"timestamp":"2024-08-13T19:53:22.654Z","logLevel":"info","context":"worker","message":"Starting page","details":{"workerid":0,"page":"https://webrecorder.net/"}}
{"timestamp":"2024-08-13T19:53:22.655Z","logLevel":"info","context":"crawlStatus","message":"Crawl statistics","details":{"crawled":0,"total":1,"pending":1,"failed":0,"limit":{"max":1,"hit":false},"pendingPages":["{\"seedId\":0,\"started\":\"2024-08-13T19:53:22.423Z\",\"extraHops\":0,\"url\":\"https:\\/\\/webrecorder.net\\/\",\"added\":\"2024-08-13T19:53:21.426Z\",\"depth\":0}"]}}
{"timestamp":"2024-08-13T19:53:22.928Z","logLevel":"info","context":"general","message":"Awaiting page load","details":{"page":"https://webrecorder.net/","workerid":0}}
{"timestamp":"2024-08-13T19:53:25.176Z","logLevel":"warn","context":"general","message":"Invalid Page - URL must start with http:// or https://","details":{"url":"mailto:[email protected]","page":"https://webrecorder.net/","workerid":0}}
{"timestamp":"2024-08-13T19:53:25.190Z","logLevel":"info","context":"behavior","message":"Running behaviors","details":{"frames":1,"frameUrls":["https://webrecorder.net/"],"page":"https://webrecorder.net/","workerid":0}}
{"timestamp":"2024-08-13T19:53:25.191Z","logLevel":"info","context":"behavior","message":"Run Script Started","details":{"frameUrl":"https://webrecorder.net/","page":"https://webrecorder.net/","workerid":0}}
{"timestamp":"2024-08-13T19:53:25.200Z","logLevel":"info","context":"behaviorScript","message":"Behavior log","details":{"state":{"segments":1},"msg":"Skipping autoscroll, page seems to not be responsive to scrolling events","page":"https://webrecorder.net/","workerid":0}}
{"timestamp":"2024-08-13T19:53:25.200Z","logLevel":"info","context":"behaviorScript","message":"Behavior log","details":{"state":{"segments":1},"msg":"done!","page":"https://webrecorder.net/","workerid":0}}
{"timestamp":"2024-08-13T19:53:25.702Z","logLevel":"info","context":"behavior","message":"Run Script Finished","details":{"frameUrl":"https://webrecorder.net/","page":"https://webrecorder.net/","workerid":0}}
{"timestamp":"2024-08-13T19:53:25.703Z","logLevel":"info","context":"behavior","message":"Behaviors finished","details":{"finished":1,"page":"https://webrecorder.net/","workerid":0}}
{"timestamp":"2024-08-13T19:53:26.716Z","logLevel":"info","context":"pageStatus","message":"Page Finished","details":{"loadState":4,"page":"https://webrecorder.net/","workerid":0}}
{"timestamp":"2024-08-13T19:53:26.734Z","logLevel":"info","context":"worker","message":"Worker done, all tasks complete","details":{"workerid":0}}
{"timestamp":"2024-08-13T19:53:26.829Z","logLevel":"info","context":"crawlStatus","message":"Crawl statistics","details":{"crawled":1,"total":1,"pending":0,"failed":0,"limit":{"max":1,"hit":true},"pendingPages":[]}}
{"timestamp":"2024-08-13T19:53:26.830Z","logLevel":"info","context":"general","message":"Crawling done","details":{}}
{"timestamp":"2024-08-13T19:53:26.831Z","logLevel":"info","context":"general","message":"Exiting, Crawl status: done","details":{}}
54 changes: 50 additions & 4 deletions index.js
Original file line number Diff line number Diff line change
Expand Up @@ -191,6 +191,12 @@ export class WACZ {
*/
cdxjDir = null

/**
* Path to directory of log files to copy into WACZ.
* @type {?string}
*/
logDir = null

/**
* @param {WACZOptions} options - See {@link WACZOptions} for details.
*/
Expand Down Expand Up @@ -294,15 +300,15 @@ export class WACZ {
this.indexFromWARCs = false
}

if (options?.pages) {
if (options?.pagesDir) {
this.detectPages = false
this.pagesDir = String(options?.pages).trim()
this.pagesDir = String(options?.pagesDir).trim()
}

if (options?.cdxj) {
if (options?.cdxjDir) {
this.detectPages = false
this.indexFromWARCs = false // Added here for clarity, but implied by calls to `this.addCDXJ()`
this.cdxjDir = String(options?.cdxj).trim()
this.cdxjDir = String(options?.cdxjDir).trim()
}

if (options?.url) {
Expand Down Expand Up @@ -340,6 +346,10 @@ export class WACZ {
}
}

if (options?.logDir) {
this.logDir = String(options?.logDir).trim()
}

if (options?.signingToken && this.signingUrl) {
this.signingToken = String(options.signingToken)
}
Expand Down Expand Up @@ -398,6 +408,11 @@ export class WACZ {
info('Writing WARCs to WACZ')
await this.writeWARCsToZip()

if (this.logDir) {
info('Writing logs to WACZ')
await this.writeLogsToZip()
}

info('Writing datapackage.json to WACZ')
await this.writeDatapackageToZip()

Expand Down Expand Up @@ -727,6 +742,37 @@ export class WACZ {
}
}

/**
 * Streams eligible files from `this.logDir` into the output ZIP under `logs/`.
 * Only files whose extension is `.log` or `.txt` (case-insensitive) are copied;
 * anything else is skipped with a warning.
 * @returns {Promise<void>}
 * @throws {Error} If a matching log file cannot be written to the ZIP.
 */
writeLogsToZip = async () => {
  this.stateCheck()

  const { logDir, addFileToZip, log } = this

  // Only plain-text log formats are copied into the WACZ.
  const allowedExts = ['log', 'txt']

  const logFiles = await fs.readdir(logDir)

  for (const logFile of logFiles) {
    // Use the destructured `logDir` consistently (was `this.logDir`).
    const logFilepath = resolve(logDir, logFile)

    // A filename without a dot yields the whole lowercased path here,
    // which never matches an allowed extension — so it is skipped.
    const ext = logFilepath.toLowerCase().split('.').pop()
    if (!allowedExts.includes(ext)) {
      log.warn(`Skipping log file ${logFile}, not in allowed extensions (txt, log).`)
      continue
    }

    try {
      await addFileToZip(logFilepath, `logs/${logFile}`)
    } catch (err) {
      log.trace(err)
      // Preserve the underlying error for callers via `cause`.
      throw new Error(`An error occurred while writing "${logFile}" to ZIP.`, { cause: err })
    }
  }
}

/**
* Creates `datapackage.json` out of `this.resources` and writes it to ZIP.
* @returns {Promise<void>}
Expand Down
26 changes: 19 additions & 7 deletions index.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ import StreamZip from 'node-stream-zip'
import * as dotenv from 'dotenv'

import { WACZ } from './index.js'
import { FIXTURES_PATH, PAGES_DIR_FIXTURES_PATH, PAGES_FIXTURE_PATH, EXTRA_PAGES_FIXTURE_PATH, CDXJ_DIR_FIXTURES_PATH } from './constants.js'
import { FIXTURES_PATH, PAGES_DIR_FIXTURES_PATH, PAGES_FIXTURE_PATH, EXTRA_PAGES_FIXTURE_PATH, LOG_DIR_FIXTURES_PATH, LOG_FILE_FIXTURE_PATH, CDXJ_DIR_FIXTURES_PATH } from './constants.js'
import { assertSHA256WithPrefix, assertValidWACZSignatureFormat } from './utils/assertions.js' // see https://github.com/motdotla/dotenv#how-do-i-use-dotenv-with-import

// Loads env vars from .env if provided
Expand Down Expand Up @@ -74,8 +74,8 @@ test('WACZ constructor accounts for options.detectPages if valid.', async (_t) =
assert.equal(archive.detectPages, false)
})

test('WACZ constructor accounts for options.pages if provided.', async (_t) => {
const archive = new WACZ({ input: FIXTURE_INPUT, pages: PAGES_DIR_FIXTURES_PATH })
test('WACZ constructor accounts for options.pagesDir if provided.', async (_t) => {
const archive = new WACZ({ input: FIXTURE_INPUT, pagesDir: PAGES_DIR_FIXTURES_PATH })
assert.equal(archive.detectPages, false)
assert.equal(archive.pagesDir, PAGES_DIR_FIXTURES_PATH)
})
Expand Down Expand Up @@ -187,6 +187,11 @@ test('WACZ constructor accounts for options.datapackageExtras if provided.', asy
assert.equal(archive.datapackageExtras, datapackageExtras)
})

test('WACZ constructor accounts for options.logDir if valid.', async (_t) => {
  // A valid logDir passed at construction time should be stored on the instance.
  const options = { input: FIXTURE_INPUT, logDir: LOG_DIR_FIXTURES_PATH }
  const archive = new WACZ(options)
  assert.equal(archive.logDir, LOG_DIR_FIXTURES_PATH)
})

test('addPage adds entry to pagesTree and turns detectPages off.', async (_t) => {
const archive = new WACZ({ input: FIXTURE_INPUT })
assert.equal(archive.detectPages, true)
Expand Down Expand Up @@ -347,7 +352,8 @@ test('WACZ.process with pagesDir option creates valid WACZ with provided pages f
url: 'https://lil.law.harvard.edu',
title: 'WACZ Title',
description: 'WACZ Description',
pages: PAGES_DIR_FIXTURES_PATH
pagesDir: PAGES_DIR_FIXTURES_PATH,
logDir: LOG_DIR_FIXTURES_PATH
}

const archive = new WACZ(options)
Expand All @@ -356,10 +362,11 @@ test('WACZ.process with pagesDir option creates valid WACZ with provided pages f

const zip = new StreamZip.async({ file: options.output }) // eslint-disable-line

// File in pages fixture directory that are invalid JSONL or have wrong extension
// Files in fixtures directories that are invalid JSONL or have wrong extensions
// should not be copied into the WACZ.
assert.rejects(async () => await zip.entryData('pages/invalid.jsonl'))
assert.rejects(async () => await zip.entryData('pages/invalid.txt'))
assert.rejects(async () => await zip.entryData('logs/invalid.md'))

// pages/pages.jsonl and pages/extraPages.jsonl should have same hash as fixtures
// they were copied from.
Expand All @@ -373,6 +380,11 @@ test('WACZ.process with pagesDir option creates valid WACZ with provided pages f
const extraPagesFixtureHash = await archive.sha256(EXTRA_PAGES_FIXTURE_PATH)
assert.equal(datapackageExtraPages.hash, extraPagesFixtureHash)

// log file provided in logDir option should have same hash as fixture
const datapackageLogFile = datapackage.resources.filter(entry => entry.path === 'logs/sample.log')[0]
const logFileFixtureHash = await archive.sha256(LOG_FILE_FIXTURE_PATH)
assert.equal(datapackageLogFile.hash, logFileFixtureHash)

// Delete temp file
await fs.unlink(options.output)
})
Expand All @@ -384,8 +396,8 @@ test('WACZ.process with cdxj option creates valid WACZ with index from provided
url: 'https://lil.law.harvard.edu',
title: 'WACZ Title',
description: 'WACZ Description',
pages: PAGES_DIR_FIXTURES_PATH,
cdxj: CDXJ_DIR_FIXTURES_PATH
pagesDir: PAGES_DIR_FIXTURES_PATH,
cdxjDir: CDXJ_DIR_FIXTURES_PATH
}

const archive = new WACZ(options)
Expand Down
5 changes: 3 additions & 2 deletions types.js
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,9 @@
* @property {?string} signingUrl - If set, will be used to try and sign the resulting archive.
* @property {?string} signingToken - Access token to be used in combination with `signingUrl`.
* @property {?Object} datapackageExtras - If set, will be appended to datapackage.json under `extras`.
* @property {?string} cdxj - If set, skips indexing and allows for passing CDXJ files "as is". Path to a folder containing CDXJ files.
* @property {?string} pages - If set, allows for passing pre-set pages.jsonl file(s). Path to a folder containing pages files (pages.jsonl, extraPages.jsonl ...). Must be used in combination with `pages`, since using this option will skip the step required to generate a pages.jsonl file.
 * @property {?string} cdxjDir - If set, skips indexing and allows for passing CDXJ files "as is". Path to a folder containing CDXJ files. Must be used in combination with `pagesDir`, since skipping indexing also skips the step required to generate a pages.jsonl file.
 * @property {?string} pagesDir - If set, allows for passing pre-set pages.jsonl file(s). Path to a folder containing pages files (pages.jsonl, extraPages.jsonl ...).
* @property {?string} logDir - If set, allows for passing existing crawler log files into the WACZ. Path to a folder containing log files. Only files with log or txt extensions in that folder will be copied.
* @property {?any} log - Will be used instead of the Console API for logging, if compatible (i.e: loglevel). Defaults to globalThis.console.
*/

Expand Down

0 comments on commit e7db309

Please sign in to comment.