From 3dc4c6440da5482858561003fb5fe70787f43628 Mon Sep 17 00:00:00 2001
From: Indrajith Indraprastham
Date: Thu, 13 Jun 2024 02:06:55 +0530
Subject: [PATCH] update package name

---
 crawler/crawler.py    | 200 ------------------------------------------
 tests/test_crawler.py |   2 +-
 2 files changed, 1 insertion(+), 201 deletions(-)
 delete mode 100644 crawler/crawler.py

diff --git a/crawler/crawler.py b/crawler/crawler.py
deleted file mode 100644
index 5bb114f..0000000
--- a/crawler/crawler.py
+++ /dev/null
@@ -1,200 +0,0 @@
-from __future__ import annotations
-
-import json
-import urllib.parse
-from typing import Dict, List, Optional, Set
-
-import requests
-import validators
-from bs4 import BeautifulSoup
-from colorama import Fore, Style, init
-
-# Initialize colorama
-init(autoreset=True)
-
-
-class SpiderConfig:
-    def __init__(self, root_url: str, max_links: int, save_to_file: Optional[str] = None) -> None:
-        self.root_url: str = root_url
-        self.max_links: int = max_links
-        self.default_scheme: str = 'http://'
-        self.save_to_file: Optional[str] = save_to_file
-        self.scheme: str = self.default_scheme
-
-
-class Spider:
-    """
-    A simple web crawler class.
-
-    Attributes:
-        root_url (str): The root URL to start crawling from.
-        max_links (int): The maximum number of links to crawl.
-        crawl_result (Dict[str, Dict[str, List[str]]]): The dictionary storing the crawl results.
-        crawl_set (Set[str]): A set of URLs to be crawled.
-        link_count (int): The current count of crawled links.
-        default_scheme (str): The default URL scheme (e.g., 'http://').
-        save_to_file (Optional[str]): The file path to save the crawl results.
-        scheme (str): The current URL scheme being used.
-    """
-
-    def __init__(self, root_url: str, max_links: int, save_to_file: Optional[str] = None) -> None:
-        """
-        Initializes the Spider class.
-
-        Args:
-            root_url (str): The root URL to start crawling from.
-            max_links (int): The maximum number of links to crawl.
-            save_to_file (Optional[str]): The file to save the crawl results to.
-        """
-        self.root_url: str = root_url
-        self.max_links: int = max_links
-        self.crawl_result: Dict[str, Dict[str, List[str]]] = {}
-        self.crawl_set: Set[str] = set()
-        self.link_count: int = 0
-        self.default_scheme: str = 'http://'
-        self.save_to_file: Optional[str] = save_to_file
-        self.scheme: str = self.default_scheme
-
-    @staticmethod
-    def is_valid_url(url: str) -> bool:
-        """
-        Returns True for a valid url, False for an invalid url.
-        """
-        return bool(validators.url(url))
-
-    def fetch_url(self, url: str) -> Optional[BeautifulSoup]:
-        """
-        Reads the content of a URL and parses it using BeautifulSoup with lxml parser.
-
-        Args:
-            url (str): The URL to fetch and parse.
-
-        Returns:
-            Optional[BeautifulSoup]: A BeautifulSoup object if the URL is fetched successfully,
-            None otherwise.
-        """
-        try:
-            response = requests.get(url, timeout=10)
-            response.raise_for_status()  # Raise an HTTPError for bad responses (4xx and 5xx)
-            data = response.text
-            return BeautifulSoup(data, 'lxml')
-        except requests.exceptions.HTTPError as http_err:
-            print(Fore.RED + f"HTTP error occurred: {http_err}")
-        except requests.exceptions.ConnectionError as conn_err:
-            print(Fore.RED + f"Connection error occurred: {conn_err}")
-        except requests.exceptions.Timeout as timeout_err:
-            print(Fore.RED + f"Timeout error occurred: {timeout_err}")
-        except requests.exceptions.RequestException as req_err:
-            print(Fore.RED + f"Request error occurred: {req_err}")
-        except Exception as e:  # pylint: disable=broad-exception-caught
-            # Catch all other exceptions
-            print(Fore.RED + f"An unexpected error occurred: {e}")
-        return None
-
-    def save_results(self) -> None:
-        """
-        Saves the crawl results into a JSON file.
-        """
-        if self.save_to_file:
-            with open(self.save_to_file, 'w', encoding='utf-8') as file:
-                json.dump(self.crawl_result, file, indent=4)
-
-    def format_url(self, url: str, base_url: str) -> str:
-        """
-        Formats a URL to ensure it is absolute and removes any query parameters or fragments.
-
-        Args:
-            url (str): The URL to format.
-            base_url (str): The base URL to resolve relative URLs.
-
-        Returns:
-            str: The formatted URL.
-        """
-        parsed_url = urllib.parse.urlparse(url)
-        base_url = base_url.rstrip('/')
-
-        if parsed_url.scheme:
-            self.scheme = parsed_url.scheme
-
-        if not parsed_url.scheme and not parsed_url.netloc:
-            if self.is_valid_url(self.default_scheme + parsed_url.path):
-                return self.default_scheme + parsed_url.path
-
-            if parsed_url.path.startswith('/'):
-                return base_url + parsed_url.path
-            else:
-                return f"{base_url}/{parsed_url.path}"
-
-        return f"{self.scheme}://{parsed_url.netloc}{parsed_url.path}"
-
-    def crawl(self, url: str) -> None:
-        """
-        Crawls a given URL, extracts links, and adds them to the crawl results.
-
-        Args:
-            url (str): The URL to crawl.
-        """
-        if not self.is_valid_url(url):
-            print(Fore.RED + f"Invalid url to crawl: {url}")
-            return
-
-        if url in self.crawl_result:
-            print(Fore.YELLOW + f"URL already crawled: {url}")
-            return
-
-        print(Fore.GREEN + f"Crawling: {url}")
-        soup = self.fetch_url(url)
-        if not soup:
-            return
-
-        links = soup.body.find_all('a', href=True)
-        self.crawl_result[url] = {'urls': []}
-
-        for link in links:
-            pretty_url = self.format_url(link['href'].lstrip(), url)
-            if not self.is_valid_url(pretty_url):
-                print(Fore.RED + f"Invalid url: {pretty_url}")
-                continue
-
-            if pretty_url in self.crawl_result[url]['urls']:
-                continue
-
-            self.crawl_result[url]['urls'].append(pretty_url)
-            self.crawl_set.add(pretty_url)
-            print(Fore.BLUE + f"Link found: {pretty_url}")
-
-        if self.link_count < self.max_links:
-            self.link_count += 1
-            print(Fore.GREEN + f"Links crawled: {self.link_count}")
-
-    def start(self) -> Dict[str, Dict[str, List[str]]]:
-        """
-        Starts the crawling process from the root URL. Crawls up to max_links URLs.
-
-        Returns:
-            Dict[str, Dict[str, List[str]]]: The crawl results.
-        """
-        self.crawl(self.root_url)
-
-        while self.crawl_set and self.link_count < self.max_links:
-            self.crawl(self.crawl_set.pop())
-
-        if self.save_to_file:
-            self.save_results()
-        print(Style.BRIGHT + Fore.MAGENTA + "Exiting....")
-        return self.crawl_result
-
-
-def main() -> None:
-    """
-    The main function to initialize and start the crawler.
-    """
-    root_url = 'http://github.com'
-    max_links = 2
-
-    crawler = Spider(root_url, max_links, save_to_file='out.json')
-    crawler.start()
-
-
-if __name__ == '__main__':
-    main()
diff --git a/tests/test_crawler.py b/tests/test_crawler.py
index 1578369..a48d8f6 100644
--- a/tests/test_crawler.py
+++ b/tests/test_crawler.py
@@ -2,7 +2,7 @@
 
 import responses
 
-from crawler.crawler import Spider
+from tiny_web_crawler.crawler import Spider
 
 
 def test_is_valid_url() -> None: