add support for concurrent workers, custom delay and optional verbose
indrajithi committed Jun 12, 2024
1 parent e65ef69 commit 3c815be
Showing 3 changed files with 55 additions and 21 deletions.
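Taken together, these changes add `max_workers`, `delay`, and `verbose` options to the `Spider` constructor. Below is a minimal usage sketch based on the new signature in `crawler.py`; the import path and target URL are assumptions for illustration only.

```python
from tiny_web_crawler.crawler import Spider  # assumed import path

# Crawl up to 10 links with 5 worker threads, a 1-second delay between
# scheduled crawls, and progress output silenced.
spider = Spider(
    'https://github.com',  # root_url
    10,                    # max_links
    max_workers=5,         # number of threads in the ThreadPoolExecutor
    delay=1,               # seconds to sleep between submitted crawls
    verbose=False,         # suppress colored console output
)
results = spider.start()  # returns {url: {'urls': [...]}}
```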
5 changes: 4 additions & 1 deletion .github/workflows/ci.yml
@@ -18,9 +18,12 @@ jobs:
- name: Install dependencies
run: |
poetry install
- name: Run pylint
- name: Run linter :pylint
run: |
poetry run pylint tiny_web_crawler
- name: Run mypy :type_checking
run: |
poetry run mypy tiny_web_crawler
test:
needs: lint
11 changes: 9 additions & 2 deletions README.md
@@ -6,10 +6,10 @@ A simple and efficient web crawler in Python.

## Features

- Crawl web pages and extract links starting from a root URL and extract all the links found on each page
- Crawl web pages and extract links starting from a root URL recursively
- Handle relative and absolute URLs
- Save the results of your crawl in a structured JSON format for easy analysis and processing
- Designed with simplicity in mind, making it easy to use and extend for various web crawling tasks
- Set concurrent workers and custom delay

## Installation

@@ -29,6 +29,13 @@ max_links = 2

spider = Spider(root_url, max_links)
spider.start()


# Set concurrent workers, delay, and verbosity (defaults: max_workers=1, delay=0.5 s, verbose=True)

crawl = Spider('https://github.com', 5, max_workers=5, delay=1, verbose=False)
crawl.start()

```
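`start()` returns the accumulated crawl results, and the same structure is written to a JSON file when `save_to_file` is passed. A rough sketch of inspecting the return value, with the key layout inferred from `crawler.py` in this commit:

```python
# Each crawled page maps to the list of links discovered on it:
# {"https://github.com": {"urls": ["https://github.com/features", ...]}}
results = spider.start()
for page, data in results.items():
    print(f"{page}: {len(data['urls'])} links found")
```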


60 changes: 42 additions & 18 deletions tiny_web_crawler/crawler.py
@@ -2,15 +2,16 @@
import json
import urllib.parse
from typing import Dict, List, Optional, Set
from concurrent.futures import ThreadPoolExecutor, as_completed

import time
import requests
import validators
from bs4 import BeautifulSoup
from colorama import Fore, Style, init

init(autoreset=True)


DEFAULT_SCHEME: str = 'http://'


@@ -27,7 +28,13 @@ class Spider():
save_to_file (Optional[str]): The file path to save the crawl results.
"""

def __init__(self, root_url: str, max_links: int = 5, save_to_file: Optional[str] = None) -> None:
def __init__(self,
root_url: str,
max_links: int = 5,
save_to_file: Optional[str] = None,
max_workers: int = 1,
delay: float = 0.5,
verbose: bool = True) -> None:
"""
Initializes the Spider class.
@@ -43,6 +50,9 @@ def __init__(self, root_url: str, max_links: int = 5, save_to_file: Optional[str
self.link_count: int = 0
self.save_to_file: Optional[str] = save_to_file
self.scheme: str = DEFAULT_SCHEME
self.max_workers: int = max_workers
self.delay: float = delay
self.verbose: bool = verbose

def fetch_url(self, url: str) -> Optional[BeautifulSoup]:
"""
@@ -54,7 +64,6 @@ def fetch_url(self, url: str) -> Optional[BeautifulSoup]:
Returns:
Optional[BeautifulSoup]: A BeautifulSoup object if the URL is fetched successfully, None otherwise.
"""

try:
response = requests.get(url, timeout=10)
response.raise_for_status() # Raise an HTTPError for bad responses (4xx and 5xx)
@@ -83,6 +92,10 @@ def is_valid_url(url: str) -> bool:
"""
return bool(validators.url(url))

def verbose_print(self, content: str) -> None:
"""Prints the given content only when verbose output is enabled."""
if self.verbose:
print(content)

def save_results(self) -> None:
"""
Saves the crawl results into a JSON file.
@@ -127,37 +140,38 @@ def crawl(self, url: str) -> None:
url (str): The URL to crawl.
"""
if not self.is_valid_url(url):
print(Fore.RED + f"Invalid url to crawl: {url}")
self.verbose_print(Fore.RED + f"Invalid url to crawl: {url}")
return

if url in self.crawl_result:
print(Fore.YELLOW + f"URL already crawled: {url}")
self.verbose_print(Fore.YELLOW + f"URL already crawled: {url}")
return

print(Fore.GREEN + f"Crawling: {url}")
self.verbose_print(Fore.GREEN + f"Crawling: {url}")
soup = self.fetch_url(url)
if not soup:
return

links = soup.body.find_all('a', href=True)
links = soup.body.find_all('a', href=True) if soup.body else []
self.crawl_result[url] = {'urls': []}

for link in links:
pretty_url = self.format_url(link['href'].lstrip(), url)
if not self.is_valid_url(pretty_url):
print(Fore.RED + f"Invalid url: {pretty_url}")
self.verbose_print(Fore.RED + f"Invalid url: {pretty_url}")
continue

if pretty_url in self.crawl_result[url]['urls']:
continue

self.crawl_result[url]['urls'].append(pretty_url)
self.crawl_set.add(pretty_url)
print(Fore.BLUE + f"Link found: {pretty_url}")
self.verbose_print(Fore.BLUE + f"Link found: {pretty_url}")

if self.link_count < self.max_links:
self.link_count += 1
print(Fore.GREEN + f"Links crawled: {self.link_count}")
self.verbose_print(
Fore.GREEN + f"Links crawled: {self.link_count}")

def start(self) -> Dict[str, Dict[str, List[str]]]:
"""
@@ -166,26 +180,36 @@ def start(self) -> Dict[str, Dict[str, List[str]]]:
Returns:
Dict[str, Dict[str, List[str]]]: The crawl results.
"""
self.crawl(self.root_url)

while self.crawl_set and self.link_count < self.max_links:
self.crawl(self.crawl_set.pop())
with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
futures = {executor.submit(self.crawl, self.root_url)}

while self.link_count < self.max_links and futures:
for future in as_completed(futures):
futures.remove(future)
if future.exception() is None:
while self.link_count < self.max_links and self.crawl_set:
url = self.crawl_set.pop()
if url not in self.crawl_result:
futures.add(executor.submit(self.crawl, url))
time.sleep(self.delay)
break # Break to check the next future

if self.save_to_file:
self.save_results()
print(Style.BRIGHT + Fore.MAGENTA + "Exiting....")
self.verbose_print(Style.BRIGHT + Fore.MAGENTA + "Exiting....")
return self.crawl_result


def main() -> None:
"""
The main function to initialize and start the crawler.
"""
root_url = 'http://github.com'
max_links = 2
root_url = 'https://pypi.org/'
max_links = 5

crawler = Spider(root_url, max_links, save_to_file='out.json')
crawler.start()
print(Fore.GREEN + f"Crawling: {root_url}")
print(crawler.start().keys())


if __name__ == '__main__':
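The reworked start() method replaces the earlier sequential loop with a ThreadPoolExecutor: it seeds the pool with the root URL and, as each future completes, pops the next unvisited URL from crawl_set, submits it, and sleeps for the configured delay. A stripped-down sketch of that submit/as_completed pattern, with a dummy function standing in for Spider.crawl:

```python
import time
from concurrent.futures import ThreadPoolExecutor, as_completed

def fake_crawl(url: str) -> str:
    # Stand-in for Spider.crawl(); just simulates some work.
    time.sleep(0.1)
    return url

pending = {'https://example.com/a', 'https://example.com/b', 'https://example.com/c'}
max_workers, delay = 2, 0.5

with ThreadPoolExecutor(max_workers=max_workers) as executor:
    futures = {executor.submit(fake_crawl, pending.pop())}
    while futures:
        for future in as_completed(futures):
            futures.remove(future)
            if future.exception() is None:
                print('crawled:', future.result())
            # Schedule the next pending URL (throttled by the delay), then
            # break so as_completed() is re-entered with the updated set.
            if pending:
                futures.add(executor.submit(fake_crawl, pending.pop()))
                time.sleep(delay)
            break
```

Because new work is only scheduled as earlier futures finish, the delay effectively throttles how quickly additional URLs enter the pool.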
