feat: async crawl in parallel
indrajithi committed Aug 27, 2024
1 parent 0fc8d4a commit 15111f1
Showing 2 changed files with 13 additions and 7 deletions.
18 changes: 11 additions & 7 deletions src/datacrawl/core/crawler.py

@@ -178,14 +178,18 @@ async def start(self) -> Dict[str, Dict[str, List[str]]]:
             await self.crawl(session, self.settings.root_url)
 
             while self.link_count < self.settings.max_links and self.crawl_set:
-                url = self.crawl_set.pop()  # Pop the URL from crawl_set
-                tasks = [self.crawl(session, url)]
-                await asyncio.gather(*tasks)  # Use asyncio.gather to run the tasks
+                tasks: List[asyncio.Task] = []
+
+                while self.crawl_set and len(tasks) < self.settings.max_workers:
+                    url = self.crawl_set.pop()
+                    tasks.append(asyncio.create_task(self.crawl(session, url)))
+
+                await asyncio.gather(*tasks)
 
                 await asyncio.sleep(self.settings.delay)
 
-        if self.settings.save_to_file:
-            await self.save_results()
+            if self.settings.save_to_file:
+                await self.save_results()
 
-        logger.debug("Exiting....")
-        return self.crawl_result
+            logger.debug("Exiting....")
+            return self.crawl_result
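
Before this change, each pass of the outer loop popped a single URL and awaited it through asyncio.gather, so pages were fetched one at a time. The new inner loop drains up to settings.max_workers URLs from crawl_set, schedules each with asyncio.create_task, and gathers the whole batch, so up to max_workers pages are fetched concurrently per iteration, with settings.delay applied between batches. Below is a minimal, self-contained sketch of the same batching pattern; the URL set, the crawl body, and the MAX_WORKERS/DELAY constants are illustrative stand-ins, not datacrawl's actual API.

    import asyncio
    from typing import List, Set

    MAX_WORKERS = 5  # stand-in for settings.max_workers
    DELAY = 0.25     # stand-in for settings.delay

    async def crawl(url: str) -> None:
        # Stand-in for the real aiohttp fetch + link extraction.
        await asyncio.sleep(0.1)
        print(f"crawled {url}")

    async def start(crawl_set: Set[str]) -> None:
        while crawl_set:
            # Drain up to MAX_WORKERS URLs and schedule them as concurrent tasks.
            tasks: List[asyncio.Task] = []
            while crawl_set and len(tasks) < MAX_WORKERS:
                tasks.append(asyncio.create_task(crawl(crawl_set.pop())))

            # Wait for the whole batch, then pause politely before the next one.
            await asyncio.gather(*tasks)
            await asyncio.sleep(DELAY)

    asyncio.run(start({f"https://example.com/page{i}" for i in range(12)}))

One consequence of gathering per batch is that each round waits for its slowest page before the next batch starts; a worker pool fed from a shared asyncio.Queue would keep every slot busy continuously, at the cost of a more involved loop.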
2 changes: 2 additions & 0 deletions tests/core/test_crawller.py

@@ -28,6 +28,8 @@ def crawler(crawl_settings: CrawlSettings) -> Datacrawl:
 
 @pytest.mark.asyncio
 async def test_crawl(crawler: Datacrawl) -> None:
+    crawler.settings.respect_robots_txt = False
+
     with aioresponses() as m:
         m.get(
             root_url,
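
The functional change here is switching off robots.txt handling before the crawl runs, presumably so the aioresponses mock only has to stub the page responses themselves rather than an extra robots.txt fetch. A hedged sketch of the overall test shape this hunk implies follows; the root_url value, the mocked HTML body, and the final assertion are illustrative assumptions, while the crawler fixture (defined earlier in the file as def crawler(crawl_settings: CrawlSettings) -> Datacrawl:) and start() come from the files shown above.

    import pytest
    from aioresponses import aioresponses

    root_url = "http://example.com"  # assumed value; defined elsewhere in the test module

    @pytest.mark.asyncio
    async def test_crawl(crawler) -> None:
        # Skip robots.txt checks so only the page itself needs to be mocked.
        crawler.settings.respect_robots_txt = False

        with aioresponses() as m:
            # Serve a canned page for the root URL; no real network I/O happens.
            m.get(
                root_url,
                status=200,
                body="<html><body>ok</body></html>",
            )
            result = await crawler.start()
            assert root_url in result  # illustrative assertion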
