Skip to content

Commit

Permalink
update crawler_worker to index urls when they've been downloaded
Browse files Browse the repository at this point in the history
  • Loading branch information
nattvara committed Mar 25, 2024
1 parent df5cd9f commit 773b07a
Showing 1 changed file with 13 additions and 0 deletions.
13 changes: 13 additions & 0 deletions jobs/crawl/start_crawler_worker.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from services.download.download import DownloadService
from cache.mutex import LockAlreadyAcquiredException
from services.crawler.crawler import CrawlerService
from services.index.index import IndexService
from config.logger import log
from db.models import Url
import cache.redis
Expand All @@ -22,6 +23,10 @@ def get_download_service(browser: Browser, context: BrowserContext, page: Page)
return DownloadService(browser, context, page)


def get_index_service() -> IndexService:
return IndexService()


def start_job_again_in(seconds: int):
from jobs.schedule import schedule_job_start_crawler_worker
schedule_job_start_crawler_worker(seconds)
Expand All @@ -34,6 +39,7 @@ async def run_worker():
redis = await cache.redis.get_redis_connection()
crawler_service = get_crawler_service(redis, browser, context, page)
download_service = get_download_service(browser, context, page)
index_service = get_index_service()

while crawler_service.has_next():
try:
Expand All @@ -43,6 +49,13 @@ async def run_worker():
url.refresh()
if url.state == Url.States.VISITED:
await download_service.save_url_content(url)
url.refresh()
if not url.content_is_duplicate:
if url.response_was_ok:
index_service.index_url(url)
elif url.is_download:
index_service.index_url(url)

except LockAlreadyAcquiredException:
log().debug("Found a lock on the url. Skipping.")

Expand Down

0 comments on commit 773b07a

Please sign in to comment.