feat: async crawl in parallel
indrajithi committed Aug 27, 2024
1 parent 0fc8d4a commit 15111f1
Showing 2 changed files with 13 additions and 7 deletions.
18 changes: 11 additions & 7 deletions src/datacrawl/core/crawler.py

@@ -178,14 +178,18 @@ async def start(self) -> Dict[str, Dict[str, List[str]]]:
             await self.crawl(session, self.settings.root_url)
 
             while self.link_count < self.settings.max_links and self.crawl_set:
-                url = self.crawl_set.pop()  # Pop the URL from crawl_set
-                tasks = [self.crawl(session, url)]
-                await asyncio.gather(*tasks)  # Use asyncio.gather to run the tasks
+                tasks: List[asyncio.Task] = []
+
+                while self.crawl_set and len(tasks) < self.settings.max_workers:
+                    url = self.crawl_set.pop()
+                    tasks.append(asyncio.create_task(self.crawl(session, url)))
+
+                await asyncio.gather(*tasks)
 
                 await asyncio.sleep(self.settings.delay)
 
-        if self.settings.save_to_file:
-            await self.save_results()
+            if self.settings.save_to_file:
+                await self.save_results()
 
-        logger.debug("Exiting....")
-        return self.crawl_result
+            logger.debug("Exiting....")
+            return self.crawl_result
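
Before this change, each pass of the outer loop popped a single URL and awaited it through asyncio.gather, so pages were fetched one at a time. The new inner loop drains up to settings.max_workers URLs from crawl_set, schedules each with asyncio.create_task, and gathers the whole batch, so up to max_workers pages are fetched concurrently per iteration, with settings.delay applied between batches. Below is a minimal, self-contained sketch of the same batching pattern; the URL set, the crawl body, and the MAX_WORKERS/DELAY constants are illustrative stand-ins, not datacrawl's actual API.

    import asyncio
    from typing import List, Set

    MAX_WORKERS = 5  # stand-in for settings.max_workers
    DELAY = 0.25     # stand-in for settings.delay

    async def crawl(url: str) -> None:
        # Stand-in for the real aiohttp fetch + link extraction.
        await asyncio.sleep(0.1)
        print(f"crawled {url}")

    async def start(crawl_set: Set[str]) -> None:
        while crawl_set:
            # Drain up to MAX_WORKERS URLs and schedule them as concurrent tasks.
            tasks: List[asyncio.Task] = []
            while crawl_set and len(tasks) < MAX_WORKERS:
                tasks.append(asyncio.create_task(crawl(crawl_set.pop())))

            # Wait for the whole batch, then pause politely before the next one.
            await asyncio.gather(*tasks)
            await asyncio.sleep(DELAY)

    asyncio.run(start({f"https://example.com/page{i}" for i in range(12)}))

One consequence of gathering per batch is that each round waits for its slowest page before the next batch starts; a worker pool fed from a shared asyncio.Queue would keep every slot busy continuously, at the cost of a more involved loop.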
2 changes: 2 additions & 0 deletions tests/core/test_crawller.py

@@ -28,6 +28,8 @@ def crawler(crawl_settings: CrawlSettings) -> Datacrawl:
 
 @pytest.mark.asyncio
 async def test_crawl(crawler: Datacrawl) -> None:
+    crawler.settings.respect_robots_txt = False
+
     with aioresponses() as m:
         m.get(
             root_url,
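
The functional change here is switching off robots.txt handling before the crawl runs, presumably so the aioresponses mock only has to stub the page responses themselves rather than an extra robots.txt fetch. A hedged sketch of the overall test shape this hunk implies follows; the root_url value, the mocked HTML body, and the final assertion are illustrative assumptions, while the crawler fixture (defined earlier in the file as def crawler(crawl_settings: CrawlSettings) -> Datacrawl:) and start() come from the files shown above.

    import pytest
    from aioresponses import aioresponses

    root_url = "http://example.com"  # assumed value; defined elsewhere in the test module

    @pytest.mark.asyncio
    async def test_crawl(crawler) -> None:
        # Skip robots.txt checks so only the page itself needs to be mocked.
        crawler.settings.respect_robots_txt = False

        with aioresponses() as m:
            # Serve a canned page for the root URL; no real network I/O happens.
            m.get(
                root_url,
                status=200,
                body="<html><body>ok</body></html>",
            )
            result = await crawler.start()
            assert root_url in result  # illustrative assertion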
