Merge pull request #49 from Mews/transient-error-retry
Feature: Retry mechanism for transient errors
indrajithi authored Jul 3, 2024
2 parents d52dfa1 + 1fe33fa commit 8ed15c5
Showing 5 changed files with 165 additions and 9 deletions.
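The retry cap is surfaced as a regular SpiderSettings field, so it can be tuned per spider. A minimal usage sketch assembled only from names visible in this diff (the URLs are placeholders, not from the commit):

    from tiny_web_crawler import Spider, SpiderSettings

    # max_retry_attempts bounds how many times a transient HTTP error
    # (408, 502, 503, 504) is retried before fetch_url gives up; default is 5.
    spider = Spider(
        SpiderSettings(
            root_url="http://example.com",
            max_retry_attempts=3,
        )
    )
    spider.crawl("http://example.com")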
2 changes: 1 addition & 1 deletion src/tiny_web_crawler/core/spider.py
@@ -85,7 +85,7 @@ def crawl(self, url: str) -> None:
return

logger.debug("Crawling: %s", url)
soup = fetch_url(url)
soup = fetch_url(url, retries=self.settings.max_retry_attempts)
if not soup:
return

1 change: 1 addition & 0 deletions src/tiny_web_crawler/core/spider_settings.py
@@ -40,6 +40,7 @@ class CrawlSettings:
internal_links_only: bool = False
external_links_only: bool = False
respect_robots_txt: bool = True
max_retry_attempts: int = 5

@dataclass
class SpiderSettings(GeneralSettings, CrawlSettings):
14 changes: 13 additions & 1 deletion src/tiny_web_crawler/networking/fetcher.py
@@ -1,20 +1,32 @@
from typing import Optional
import time

import requests
from bs4 import BeautifulSoup

from tiny_web_crawler.logging import get_logger

TRANSIENT_ERRORS = [408, 502, 503, 504]

logger = get_logger()

def fetch_url(url: str) -> Optional[BeautifulSoup]:
def is_transient_error(status_code: int) -> bool:
return status_code in TRANSIENT_ERRORS

def fetch_url(url: str, retries: int, attempts: int = 0) -> Optional[BeautifulSoup]:
try:
response = requests.get(url, timeout=10)
response.raise_for_status()
data = response.text
return BeautifulSoup(data, 'lxml')
except requests.exceptions.HTTPError as http_err:
if response.status_code and is_transient_error(response.status_code) and retries > 0:
logger.error("Transient HTTP error occurred: %s. Retrying...", http_err)
time.sleep(attempts + 1)
return fetch_url(url, retries - 1, attempts + 1)

logger.error("HTTP error occurred: %s", http_err)
return None
except requests.exceptions.ConnectionError as conn_err:
logger.error("Connection error occurred: %s", conn_err)
except requests.exceptions.Timeout as timeout_err:
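The backoff above is linear: before the k-th retry the fetcher sleeps attempts + 1 = k seconds, so with n retries a persistently failing URL is requested n + 1 times and sleeps 1 + 2 + ... + n = n(n + 1)/2 seconds in total. A small self-contained sketch of that worst case (pure arithmetic, no network; the helper name is illustrative):

    # Worst case for fetch_url(url, retries=n): every attempt hits a
    # transient error, giving n + 1 requests and n sleeps of growing length.
    def worst_case(retries: int) -> tuple[int, list[int], int]:
        delays = [attempt + 1 for attempt in range(retries)]  # 1, 2, ..., n
        return retries + 1, delays, sum(delays)

    print(worst_case(5))  # (6, [1, 2, 3, 4, 5], 15)

For the default of 5 retries that is 6 requests and 15 seconds of total sleep, which is exactly what the tests below assert.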
98 changes: 92 additions & 6 deletions tests/networking/test_fetcher.py
@@ -1,3 +1,4 @@
from unittest.mock import patch

import responses
import requests
@@ -15,7 +16,7 @@ def test_fetch_url() -> None:
status=200
)

resp = fetch_url("http://example.com")
resp = fetch_url("http://example.com", 1)

assert resp is not None
assert resp.text == "link"
@@ -26,15 +27,15 @@ def test_fetch_url_connection_error(caplog) -> None: # type: ignore

with caplog.at_level(ERROR):
# Fetch url whose response isn't mocked, so requests raises ConnectionError
resp = fetch_url("http://connection.error")
resp = fetch_url("http://connection.error", 1)

assert "Connection error occurred:" in caplog.text
assert resp is None


@responses.activate
def test_fetch_url_http_error(caplog) -> None: # type: ignore
error_codes = [403, 404, 408]
error_codes = [403, 404, 412]

for error_code in error_codes:
setup_mock_response(
@@ -44,7 +45,7 @@ def test_fetch_url_http_error(caplog) -> None: # type: ignore
)

with caplog.at_level(ERROR):
resp = fetch_url(f"http://http.error/{error_code}")
resp = fetch_url(f"http://http.error/{error_code}", 1)

assert "HTTP error occurred:" in caplog.text
assert resp is None
@@ -60,7 +61,7 @@ def test_fetch_url_timeout_error(caplog) -> None: # type: ignore

with caplog.at_level(ERROR):
# Fetch url whose response is mocked to raise Timeout
resp = fetch_url("http://timeout.error")
resp = fetch_url("http://timeout.error", 1)

assert "Timeout error occurred:" in caplog.text
assert resp is None
@@ -76,7 +77,92 @@ def test_fetch_url_requests_exception(caplog) -> None: # type: ignore

with caplog.at_level(ERROR):
# Fetch url whose response is mocked to raise RequestException
resp = fetch_url("http://requests.exception")
resp = fetch_url("http://requests.exception", 1)

assert "Request error occurred:" in caplog.text
assert resp is None


@patch("time.sleep")
@responses.activate
def test_fetch_url_transient_error_retry_5(mock_sleep, caplog) -> None: # type: ignore
setup_mock_response(
url="http://transient.error",
body="<html><body><a href='http://transient.error'>link</a></body></html>",
status=503
)

max_retry_attempts = 5

with caplog.at_level(ERROR):
resp = fetch_url("http://transient.error", max_retry_attempts)

assert resp is None

# Assert url was fetched once, then retried max_retry_attempts times
assert len(responses.calls) == max_retry_attempts + 1

# Assert sleep time grew with every request
expected_delays = [1, 2, 3, 4, 5]
actual_delays = [call.args[0] for call in mock_sleep.call_args_list]
assert actual_delays == expected_delays

assert "Transient HTTP error occurred:" in caplog.text


@patch("time.sleep")
@responses.activate
def test_fetch_url_transient_error_retry_10(mock_sleep, caplog) -> None: # type: ignore
setup_mock_response(
url="http://transient.error",
body="<html><body><a href='http://transient.error'>link</a></body></html>",
status=503
)

max_retry_attempts = 10

with caplog.at_level(ERROR):
resp = fetch_url("http://transient.error", max_retry_attempts)

assert resp is None

# Assert url was fetched once, then retried max_retry_attempts times
assert len(responses.calls) == max_retry_attempts + 1

# Assert sleep time grew with every request
expected_delays = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
actual_delays = [call.args[0] for call in mock_sleep.call_args_list]
assert actual_delays == expected_delays

assert "Transient HTTP error occurred:" in caplog.text


@patch("time.sleep")
@responses.activate
def test_fetch_url_transient_error_retry_success(mock_sleep, caplog) -> None: # type: ignore
setup_mock_response(
url="http://transient.error",
body="<html><body><a href='http://transient.error'>link</a></body></html>",
status=503
)
setup_mock_response(
url="http://transient.error",
body="<html><body><a href='http://transient.error'>link</a></body></html>",
status=200
)

max_retry_attempts = 1

with caplog.at_level(ERROR):
resp = fetch_url("http://transient.error", max_retry_attempts)

assert resp is not None
assert resp.text == "link"

# Assert url was fetched 2 times
assert len(responses.calls) == 2

# Assert time.sleep was called
mock_sleep.assert_called_once_with(1)

assert "Transient HTTP error occurred:" in caplog.text
59 changes: 58 additions & 1 deletion tests/test_crawler.py
@@ -8,7 +8,7 @@

from tiny_web_crawler import Spider
from tiny_web_crawler import SpiderSettings
from tiny_web_crawler.logging import DEBUG, WARNING
from tiny_web_crawler.logging import DEBUG, WARNING, ERROR
from tests.utils import setup_mock_response

@responses.activate
@@ -490,3 +490,60 @@ def test_respect_robots_txt_crawl_delay(mock_sleep, mock_urlopen, caplog) -> None:
def test_crawl_no_root_url() -> None:
with pytest.raises(ValueError):
Spider(SpiderSettings(verbose=False))


@patch("time.sleep")
@responses.activate
def test_crawl_url_transient_retry(mock_sleep, caplog) -> None: # type: ignore
setup_mock_response(
url="http://transient.error",
body="<html><body><a href='http://transient.error'>link</a></body></html>",
status=503
)

spider = Spider(
SpiderSettings(root_url="http://transient.error",
respect_robots_txt=False)
)

with caplog.at_level(ERROR):
spider.crawl("http://transient.error")

assert spider.crawl_result == {}

assert len(responses.calls) == 6

expected_delays = [1, 2, 3, 4, 5]
actual_delays = [call.args[0] for call in mock_sleep.call_args_list]
assert actual_delays == expected_delays

assert "Transient HTTP error occurred:" in caplog.text


@patch("time.sleep")
@responses.activate
def test_crawl_url_transient_retry_custom_retry_amount(mock_sleep, caplog) -> None: # type: ignore
setup_mock_response(
url="http://transient.error",
body="<html><body><a href='http://transient.error'>link</a></body></html>",
status=503
)

spider = Spider(
SpiderSettings(root_url="http://transient.error",
max_retry_attempts=10,
respect_robots_txt=False)
)

with caplog.at_level(ERROR):
spider.crawl("http://transient.error")

assert spider.crawl_result == {}

assert len(responses.calls) == 11

expected_delays = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
actual_delays = [call.args[0] for call in mock_sleep.call_args_list]
assert actual_delays == expected_delays

assert "Transient HTTP error occurred:" in caplog.text
