diff --git a/Dockerfile b/Dockerfile index f5ef85b08..8abc7c530 100644 --- a/Dockerfile +++ b/Dockerfile @@ -47,6 +47,9 @@ RUN ln -s /app/main.js /usr/bin/crawl; ln -s /app/create-login-profile.js /usr/b WORKDIR /crawls +# enable to test custom behaviors build (from browsertrix-behaviors) +# COPY behaviors.js /app/node_modules/browsertrix-behaviors/dist/behaviors.js + ADD docker-entrypoint.sh /docker-entrypoint.sh ENTRYPOINT ["/docker-entrypoint.sh"] diff --git a/crawler.js b/crawler.js index af00c31cb..8d747ae1b 100644 --- a/crawler.js +++ b/crawler.js @@ -607,11 +607,11 @@ self.__bx_behaviors.selectMainBehavior(); const frameUrl = frame.url(); - const frameElem = await frame.frameElement(); + // this is all designed to detect and skip PDFs, and other frames that are actually EMBEDs + // if there's no tag, or an iframe/frame tag, then assume it's a regular frame + const tagName = await frame.evaluate("window.frameElement && window.frameElement.tagName"); - const tagName = await frame.evaluate(e => e.tagName, frameElem); - - if (tagName !== "IFRAME" && tagName !== "FRAME") { + if (tagName && tagName !== "IFRAME" && tagName !== "FRAME") { logger.debug("Skipping processing non-frame object", {tagName, frameUrl, ...logDetails}, "behavior"); return null; }