add support for concurrent workers, custom delay and optional verbose
indrajithi committed Jun 12, 2024
1 parent e65ef69 commit 3c815be
Showing 3 changed files with 55 additions and 21 deletions.
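Taken together, these changes add `max_workers`, `delay`, and `verbose` options to the `Spider` constructor. Below is a minimal usage sketch based on the new signature in `crawler.py`; the import path and target URL are assumptions for illustration only.

```python
from tiny_web_crawler.crawler import Spider  # assumed import path

# Crawl up to 10 links with 5 worker threads, a 1-second delay between
# scheduled crawls, and progress output silenced.
spider = Spider(
    'https://github.com',  # root_url
    10,                    # max_links
    max_workers=5,         # number of threads in the ThreadPoolExecutor
    delay=1,               # seconds to sleep between submitted crawls
    verbose=False,         # suppress colored console output
)
results = spider.start()  # returns {url: {'urls': [...]}}
```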
5 changes: 4 additions & 1 deletion .github/workflows/ci.yml
@@ -18,9 +18,12 @@ jobs:
- name: Install dependencies
run: |
poetry install
- name: Run pylint
- name: Run linter :pylint
run: |
poetry run pylint tiny_web_crawler
- name: Run mypy :type_checking
run: |
poetry run mypy tiny_web_crawler
test:
needs: lint
11 changes: 9 additions & 2 deletions README.md
@@ -6,10 +6,10 @@ A simple and efficient web crawler in Python.

## Features

- Crawl web pages and extract links starting from a root URL and extract all the links found on each page
- Crawl web pages and extract links starting from a root URL recursively
- Handle relative and absolute URLs
- Save the results of your crawl in a structured JSON format for easy analysis and processing
- Designed with simplicity in mind, making it easy to use and extend for various web crawling tasks
- Set concurrent workers and custom delay

## Installation

@@ -29,6 +29,13 @@ max_links = 2

spider = Spider(root_url, max_links)
spider.start()


# Set concurrent workers, delay, and verbosity (defaults: max_workers=1, delay=0.5 s, verbose=True)

crawl = Spider('https://github.com', 5, max_workers=5, delay=1, verbose=False)
crawl.start()

```
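`start()` returns the accumulated crawl results, and the same structure is written to a JSON file when `save_to_file` is passed. A rough sketch of inspecting the return value, with the key layout inferred from `crawler.py` in this commit:

```python
# Each crawled page maps to the list of links discovered on it:
# {"https://github.com": {"urls": ["https://github.com/features", ...]}}
results = spider.start()
for page, data in results.items():
    print(f"{page}: {len(data['urls'])} links found")
```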


60 changes: 42 additions & 18 deletions tiny_web_crawler/crawler.py
@@ -2,15 +2,16 @@
import json
import urllib.parse
from typing import Dict, List, Optional, Set
from concurrent.futures import ThreadPoolExecutor, as_completed

import time
import requests
import validators
from bs4 import BeautifulSoup
from colorama import Fore, Style, init

init(autoreset=True)


DEFAULT_SCHEME: str = 'http://'


@@ -27,7 +28,13 @@ class Spider():
save_to_file (Optional[str]): The file path to save the crawl results.
"""

def __init__(self, root_url: str, max_links: int = 5, save_to_file: Optional[str] = None) -> None:
def __init__(self,
root_url: str,
max_links: int = 5,
save_to_file: Optional[str] = None,
max_workers: int = 1,
delay: float = 0.5,
verbose: bool = True) -> None:
"""
Initializes the Spider class.
@@ -43,6 +50,9 @@ def __init__(self, root_url: str, max_links: int = 5, save_to_file: Optional[str
self.link_count: int = 0
self.save_to_file: Optional[str] = save_to_file
self.scheme: str = DEFAULT_SCHEME
self.max_workers: int = max_workers
self.delay: float = delay
self.verbose: bool = verbose

def fetch_url(self, url: str) -> Optional[BeautifulSoup]:
"""
@@ -54,7 +64,6 @@ def fetch_url(self, url: str) -> Optional[BeautifulSoup]:
Returns:
Optional[BeautifulSoup]: A BeautifulSoup object if the URL is fetched successfully, None otherwise.
"""

try:
response = requests.get(url, timeout=10)
response.raise_for_status() # Raise an HTTPError for bad responses (4xx and 5xx)
@@ -83,6 +92,10 @@ def is_valid_url(url: str) -> bool:
"""
return bool(validators.url(url))

def verbose_print(self, content: str) -> None:
"""Prints the given content only when verbose output is enabled."""
if self.verbose:
print(content)

def save_results(self) -> None:
"""
Saves the crawl results into a JSON file.
@@ -127,37 +140,38 @@ def crawl(self, url: str) -> None:
url (str): The URL to crawl.
"""
if not self.is_valid_url(url):
print(Fore.RED + f"Invalid url to crawl: {url}")
self.verbose_print(Fore.RED + f"Invalid url to crawl: {url}")
return

if url in self.crawl_result:
print(Fore.YELLOW + f"URL already crawled: {url}")
self.verbose_print(Fore.YELLOW + f"URL already crawled: {url}")
return

print(Fore.GREEN + f"Crawling: {url}")
self.verbose_print(Fore.GREEN + f"Crawling: {url}")
soup = self.fetch_url(url)
if not soup:
return

links = soup.body.find_all('a', href=True)
links = soup.body.find_all('a', href=True) if soup.body else []
self.crawl_result[url] = {'urls': []}

for link in links:
pretty_url = self.format_url(link['href'].lstrip(), url)
if not self.is_valid_url(pretty_url):
print(Fore.RED + f"Invalid url: {pretty_url}")
self.verbose_print(Fore.RED + f"Invalid url: {pretty_url}")
continue

if pretty_url in self.crawl_result[url]['urls']:
continue

self.crawl_result[url]['urls'].append(pretty_url)
self.crawl_set.add(pretty_url)
print(Fore.BLUE + f"Link found: {pretty_url}")
self.verbose_print(Fore.BLUE + f"Link found: {pretty_url}")

if self.link_count < self.max_links:
self.link_count += 1
print(Fore.GREEN + f"Links crawled: {self.link_count}")
self.verbose_print(
Fore.GREEN + f"Links crawled: {self.link_count}")

def start(self) -> Dict[str, Dict[str, List[str]]]:
"""
@@ -166,26 +180,36 @@ def start(self) -> Dict[str, Dict[str, List[str]]]:
Returns:
Dict[str, Dict[str, List[str]]]: The crawl results.
"""
self.crawl(self.root_url)

while self.crawl_set and self.link_count < self.max_links:
self.crawl(self.crawl_set.pop())
with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
futures = {executor.submit(self.crawl, self.root_url)}

while self.link_count < self.max_links and futures:
for future in as_completed(futures):
futures.remove(future)
if future.exception() is None:
while self.link_count < self.max_links and self.crawl_set:
url = self.crawl_set.pop()
if url not in self.crawl_result:
futures.add(executor.submit(self.crawl, url))
time.sleep(self.delay)
break # Break to check the next future

if self.save_to_file:
self.save_results()
print(Style.BRIGHT + Fore.MAGENTA + "Exiting....")
self.verbose_print(Style.BRIGHT + Fore.MAGENTA + "Exiting....")
return self.crawl_result


def main() -> None:
"""
The main function to initialize and start the crawler.
"""
root_url = 'http://github.com'
max_links = 2
root_url = 'https://pypi.org/'
max_links = 5

crawler = Spider(root_url, max_links, save_to_file='out.json')
crawler.start()
print(Fore.GREEN + f"Crawling: {root_url}")
print(crawler.start().keys())


if __name__ == '__main__':
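The reworked start() method replaces the earlier sequential loop with a ThreadPoolExecutor: it seeds the pool with the root URL and, as each future completes, pops the next unvisited URL from crawl_set, submits it, and sleeps for the configured delay. A stripped-down sketch of that submit/as_completed pattern, with a dummy function standing in for Spider.crawl:

```python
import time
from concurrent.futures import ThreadPoolExecutor, as_completed

def fake_crawl(url: str) -> str:
    # Stand-in for Spider.crawl(); just simulates some work.
    time.sleep(0.1)
    return url

pending = {'https://example.com/a', 'https://example.com/b', 'https://example.com/c'}
max_workers, delay = 2, 0.5

with ThreadPoolExecutor(max_workers=max_workers) as executor:
    futures = {executor.submit(fake_crawl, pending.pop())}
    while futures:
        for future in as_completed(futures):
            futures.remove(future)
            if future.exception() is None:
                print('crawled:', future.result())
            # Schedule the next pending URL (throttled by the delay), then
            # break so as_completed() is re-entered with the updated set.
            if pending:
                futures.add(executor.submit(fake_crawl, pending.pop()))
                time.sleep(delay)
            break
```

Because new work is only scheduled as earlier futures finish, the delay effectively throttles how quickly additional URLs enter the pool.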
