add queue improvements:
- change error state to differentiate limit hit vs dupe url
- add QueueState enum to indicate success, limit hit, or dupe url
ikreymer committed Sep 15, 2023
1 parent d458756 commit 57150c7
Showing 2 changed files with 33 additions and 10 deletions.
crawler.js: 24 changes (17 additions, 7 deletions)
@@ -4,7 +4,7 @@ import fs from "fs";
 import os from "os";
 import fsp from "fs/promises";
 
-import { RedisCrawlState, LoadState } from "./util/state.js";
+import { RedisCrawlState, LoadState, QueueState } from "./util/state.js";
 import Sitemapper from "sitemapper";
 import { v4 as uuidv4 } from "uuid";
 import yaml from "js-yaml";
@@ -1305,7 +1305,7 @@ self.__bx_behaviors.selectMainBehavior();
         const {url, isOOS} = res;
 
         if (url) {
-          await this.queueUrl(seedId, url, depth, isOOS ? newExtraHops : extraHops);
+          await this.queueUrl(seedId, url, depth, isOOS ? newExtraHops : extraHops, logDetails);
         }
       }
     } catch (e) {
@@ -1333,19 +1333,29 @@ self.__bx_behaviors.selectMainBehavior();
     }
   }
 
-  async queueUrl(seedId, url, depth, extraHops = 0) {
+  async queueUrl(seedId, url, depth, extraHops, logDetails = {}) {
     if (this.limitHit) {
       return false;
     }
 
-    if (!await this.crawlState.addToQueue({url, seedId, depth, extraHops}, this.pageLimit)) {
+    const result = await this.crawlState.addToQueue({url, seedId, depth, extraHops}, this.pageLimit);
+
+    switch (result) {
+    case QueueState.ADDED:
+      logger.debug("Queued new page url", {url, ...logDetails}, "links");
+      return true;
+
+    case QueueState.LIMIT_HIT:
+      logger.debug("Not queued page url, at page limit", {url, ...logDetails}, "links");
       this.limitHit = true;
       return false;
-    } else {
-      logger.debug(`Queued url ${url}`);
+
+    case QueueState.DUPE_URL:
+      logger.debug("Not queued page url, already seen", {url, ...logDetails}, "links");
+      return false;
     }
 
-    return true;
+    return false;
   }
 
   async initPages() {
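For illustration only (not part of the commit): a minimal sketch of the three outcomes a caller of addToQueue can now tell apart. The in-memory fakeCrawlState is an assumption that mimics the order of checks in the Lua script; only the QueueState names and values are taken from the diff. Run as an ES module (the project uses ESM).

// sketch.mjs -- hypothetical demo file, not in the repo
// Values mirror the QueueState enum added to util/state.js in this commit.
const QueueState = { ADDED: 0, LIMIT_HIT: 1, DUPE_URL: 2 };

// In-memory stand-in for RedisCrawlState that mirrors the Lua script's
// order of checks: page limit first, then duplicate detection, then add.
const fakeCrawlState = {
  seen: new Set(),

  async addToQueue({url}, limit) {
    if (limit > 0 && this.seen.size >= limit) {
      return QueueState.LIMIT_HIT;
    }
    if (this.seen.has(url)) {
      return QueueState.DUPE_URL;
    }
    this.seen.add(url);
    return QueueState.ADDED;
  }
};

// With a page limit of 2:
console.log(await fakeCrawlState.addToQueue({url: "https://example.com/"}, 2));  // 0 -> ADDED
console.log(await fakeCrawlState.addToQueue({url: "https://example.com/"}, 2));  // 2 -> DUPE_URL
console.log(await fakeCrawlState.addToQueue({url: "https://example.com/a"}, 2)); // 0 -> ADDED
console.log(await fakeCrawlState.addToQueue({url: "https://example.com/b"}, 2)); // 1 -> LIMIT_HIT

In the actual crawler, addToQueue delegates these checks to the Lua script changed in util/state.js below, so its numeric results line up with the enum values.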
util/state.js: 19 changes (16 additions, 3 deletions)
@@ -12,6 +12,15 @@ export const LoadState = {
   BEHAVIORS_DONE: 4,
 };
 
+
+// ============================================================================
+export const QueueState = {
+  ADDED: 0,
+  LIMIT_HIT: 1,
+  DUPE_URL: 2,
+};
+
+
 // ============================================================================
 export class PageState
 {
@@ -70,14 +79,14 @@ export class RedisCrawlState
 local size = redis.call('scard', KEYS[3]);
 local limit = tonumber(ARGV[4]);
 if limit > 0 and size >= limit then
-return 0;
+return 1;
 end
 if redis.call('sadd', KEYS[3], ARGV[1]) == 0 then
-return 0;
+return 2;
 end
 redis.call('zadd', KEYS[2], ARGV[2], ARGV[3]);
 redis.call('hdel', KEYS[1], ARGV[1]);
-return 1;
+return 0;
 `
 });
 
@@ -248,6 +257,10 @@ return 0;
       data.extraHops = extraHops;
     }
 
+    // return codes
+    // 0 - url queued successfully
+    // 1 - url queue size limit reached
+    // 2 - url is a dupe
     return await this.redis.addqueue(this.pkey, this.qkey, this.skey, url, this._getScore(data), JSON.stringify(data), limit);
   }
 
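As a reading aid only (an assumed restatement, not code from the commit), the updated Lua return codes in plain JavaScript. Because the limit check runs before the dedup check, a duplicate URL submitted once the limit is reached reports LIMIT_HIT rather than DUPE_URL.

// Illustrative restatement of the addqueue Lua script's return codes.
// seenSize, limit, and alreadySeen stand in for the Redis scard/sadd lookups.
function addQueueResultSketch(seenSize, limit, alreadySeen) {
  if (limit > 0 && seenSize >= limit) {
    return 1;   // QueueState.LIMIT_HIT -- checked first
  }
  if (alreadySeen) {
    return 2;   // QueueState.DUPE_URL
  }
  // the real script also zadds the queue entry and hdels any pending copy
  return 0;     // QueueState.ADDED
}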
