From 45380b3ee59d5707651f362e1ba98c55fedb53de Mon Sep 17 00:00:00 2001 From: metalwarrior665 Date: Mon, 24 Apr 2023 20:40:13 +0200 Subject: [PATCH 1/4] feat(cheerio): expose transformRequestFunction as hidden input --- .../cheerio-scraper/src/internals/consts.ts | 1 + .../src/internals/crawler_setup.ts | 31 ++++++++++++------- 2 files changed, 21 insertions(+), 11 deletions(-) diff --git a/packages/actor-scraper/cheerio-scraper/src/internals/consts.ts b/packages/actor-scraper/cheerio-scraper/src/internals/consts.ts index f324876790..b406d5bfc4 100644 --- a/packages/actor-scraper/cheerio-scraper/src/internals/consts.ts +++ b/packages/actor-scraper/cheerio-scraper/src/internals/consts.ts @@ -9,6 +9,7 @@ export interface Input { globs: GlobInput[]; regexps: RegExpInput[]; excludes: GlobInput[]; + transformRequestFunction?: string; pseudoUrls: PseudoUrlInput[]; keepUrlFragments: boolean; linkSelector?: string; diff --git a/packages/actor-scraper/cheerio-scraper/src/internals/crawler_setup.ts b/packages/actor-scraper/cheerio-scraper/src/internals/crawler_setup.ts index 8061759e48..16b3c0d644 100644 --- a/packages/actor-scraper/cheerio-scraper/src/internals/crawler_setup.ts +++ b/packages/actor-scraper/cheerio-scraper/src/internals/crawler_setup.ts @@ -19,6 +19,8 @@ import { log, Dictionary, Awaitable, + RequestOptions, + RequestTransform, } from '@crawlee/cheerio'; import { Actor, ApifyEnv } from 'apify'; import { load } from 'cheerio'; @@ -54,6 +56,7 @@ export class CrawlerSetup implements CrawlerSetupOptions { evaledPageFunction: (...args: unknown[]) => unknown; evaledPreNavigationHooks: ((...args: unknown[]) => Awaitable)[]; evaledPostNavigationHooks: ((...args: unknown[]) => Awaitable)[]; + evaledTransformRequestFunction?: RequestTransform; datasetName?: string; keyValueStoreName?: string; requestQueueName?: string; @@ -97,6 +100,10 @@ export class CrawlerSetup implements CrawlerSetupOptions { // Functions need to be evaluated. this.evaledPageFunction = tools.evalFunctionOrThrow(this.input.pageFunction); + if (this.input.transformRequestFunction) { + this.evaledTransformRequestFunction = tools.evalFunctionOrThrow(this.input.transformRequestFunction); + } + if (this.input.preNavigationHooks) { this.evaledPreNavigationHooks = tools.evalFunctionArrayOrThrow(this.input.preNavigationHooks, 'preNavigationHooks'); } else { @@ -335,22 +342,24 @@ export class CrawlerSetup implements CrawlerSetupOptions { return; } + const baseTransformRequestFunction = (requestOptions: RequestOptions) => { + requestOptions.userData ??= {}; + requestOptions.userData[META_KEY] = { + parentRequestId: request.id || request.uniqueKey, + depth: currentDepth + 1, + }; + + requestOptions.useExtendedUniqueKey = true; + requestOptions.keepUrlFragment = this.input.keepUrlFragments; + return requestOptions; + } + await enqueueLinks({ selector: this.input.linkSelector, pseudoUrls: this.input.pseudoUrls, globs: this.input.globs, exclude: this.input.excludes, - transformRequestFunction: (requestOptions) => { - requestOptions.userData ??= {}; - requestOptions.userData[META_KEY] = { - parentRequestId: request.id || request.uniqueKey, - depth: currentDepth + 1, - }; - - requestOptions.useExtendedUniqueKey = true; - requestOptions.keepUrlFragment = this.input.keepUrlFragments; - return requestOptions; - }, + transformRequestFunction: this.evaledTransformRequestFunction || baseTransformRequestFunction, }); } From 8ea92ba8e9f04670187be9ce0db224236153234d Mon Sep 17 00:00:00 2001 From: metalwarrior665 Date: Mon, 24 Apr 2023 20:45:03 +0200 Subject: [PATCH 2/4] fix: force RequestTransform type from eval --- .../cheerio-scraper/src/internals/crawler_setup.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/actor-scraper/cheerio-scraper/src/internals/crawler_setup.ts b/packages/actor-scraper/cheerio-scraper/src/internals/crawler_setup.ts index 16b3c0d644..c6af29e74c 100644 --- a/packages/actor-scraper/cheerio-scraper/src/internals/crawler_setup.ts +++ b/packages/actor-scraper/cheerio-scraper/src/internals/crawler_setup.ts @@ -101,7 +101,7 @@ export class CrawlerSetup implements CrawlerSetupOptions { this.evaledPageFunction = tools.evalFunctionOrThrow(this.input.pageFunction); if (this.input.transformRequestFunction) { - this.evaledTransformRequestFunction = tools.evalFunctionOrThrow(this.input.transformRequestFunction); + this.evaledTransformRequestFunction = tools.evalFunctionOrThrow(this.input.transformRequestFunction) as RequestTransform; } if (this.input.preNavigationHooks) { From dfa51ed37c01473ad08cb57d793bb0430cda3b0a Mon Sep 17 00:00:00 2001 From: metalwarrior665 Date: Mon, 24 Apr 2023 22:02:11 +0200 Subject: [PATCH 3/4] fix(cheerio): run baseTransformRequest before user transformRequest --- .../src/internals/crawler_setup.ts | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/packages/actor-scraper/cheerio-scraper/src/internals/crawler_setup.ts b/packages/actor-scraper/cheerio-scraper/src/internals/crawler_setup.ts index c6af29e74c..bc9f51d636 100644 --- a/packages/actor-scraper/cheerio-scraper/src/internals/crawler_setup.ts +++ b/packages/actor-scraper/cheerio-scraper/src/internals/crawler_setup.ts @@ -354,12 +354,25 @@ export class CrawlerSetup implements CrawlerSetupOptions { return requestOptions; } + let transformRequestFunction: RequestTransform; + + if (this.evaledTransformRequestFunction) { + transformRequestFunction = (requestOptions: RequestOptions) => { + const updatedOptions = this.evaledTransformRequestFunction!(requestOptions); + if (updatedOptions) { + return baseTransformRequestFunction(requestOptions); + } + } + } else { + transformRequestFunction = baseTransformRequestFunction; + } + await enqueueLinks({ selector: this.input.linkSelector, pseudoUrls: this.input.pseudoUrls, globs: this.input.globs, exclude: this.input.excludes, - transformRequestFunction: this.evaledTransformRequestFunction || baseTransformRequestFunction, + transformRequestFunction, }); } From 14748d07612acf387f7a6d115e6346b9d693a1a5 Mon Sep 17 00:00:00 2001 From: metalwarrior665 Date: Mon, 24 Apr 2023 22:03:52 +0200 Subject: [PATCH 4/4] fix(cheerio): wrong type --- .../actor-scraper/cheerio-scraper/src/internals/crawler_setup.ts | 1 + 1 file changed, 1 insertion(+) diff --git a/packages/actor-scraper/cheerio-scraper/src/internals/crawler_setup.ts b/packages/actor-scraper/cheerio-scraper/src/internals/crawler_setup.ts index bc9f51d636..9c56c1420b 100644 --- a/packages/actor-scraper/cheerio-scraper/src/internals/crawler_setup.ts +++ b/packages/actor-scraper/cheerio-scraper/src/internals/crawler_setup.ts @@ -362,6 +362,7 @@ export class CrawlerSetup implements CrawlerSetupOptions { if (updatedOptions) { return baseTransformRequestFunction(requestOptions); } + return updatedOptions; } } else { transformRequestFunction = baseTransformRequestFunction;