From b2cf88ffea8d75808c9210850a03fcc70b0b9e3d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Josef=20Proch=C3=A1zka?=
Date: Mon, 11 Nov 2024 12:57:53 +0100
Subject: [PATCH] feat: Add BeautifulSoupParser type alias (#674)

To avoid repeating the same Literal definitions.
---
 src/crawlee/beautifulsoup_crawler/__init__.py               | 4 ++--
 src/crawlee/beautifulsoup_crawler/_beautifulsoup_crawler.py | 4 +++-
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/src/crawlee/beautifulsoup_crawler/__init__.py b/src/crawlee/beautifulsoup_crawler/__init__.py
index 9fa733cc13..58a8e98deb 100644
--- a/src/crawlee/beautifulsoup_crawler/__init__.py
+++ b/src/crawlee/beautifulsoup_crawler/__init__.py
@@ -1,5 +1,5 @@
 try:
-    from ._beautifulsoup_crawler import BeautifulSoupCrawler
+    from ._beautifulsoup_crawler import BeautifulSoupCrawler, BeautifulSoupParser
     from ._beautifulsoup_crawling_context import BeautifulSoupCrawlingContext
 except ImportError as exc:
     raise ImportError(
@@ -7,4 +7,4 @@
         "For example, if you use pip, run `pip install 'crawlee[beautifulsoup]'`.",
     ) from exc
 
-__all__ = ['BeautifulSoupCrawler', 'BeautifulSoupCrawlingContext']
+__all__ = ['BeautifulSoupCrawler', 'BeautifulSoupCrawlingContext', 'BeautifulSoupParser']
diff --git a/src/crawlee/beautifulsoup_crawler/_beautifulsoup_crawler.py b/src/crawlee/beautifulsoup_crawler/_beautifulsoup_crawler.py
index 551bbbd87e..43c7959b43 100644
--- a/src/crawlee/beautifulsoup_crawler/_beautifulsoup_crawler.py
+++ b/src/crawlee/beautifulsoup_crawler/_beautifulsoup_crawler.py
@@ -21,6 +21,8 @@
 if TYPE_CHECKING:
     from crawlee._types import BasicCrawlingContext, EnqueueLinksKwargs
 
+BeautifulSoupParser = Literal['html.parser', 'lxml', 'xml', 'html5lib']
+
 
 class BeautifulSoupCrawler(BasicCrawler[BeautifulSoupCrawlingContext]):
     """A web crawler for performing HTTP requests and parsing HTML/XML content.
@@ -61,7 +63,7 @@ async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
     def __init__(
         self,
         *,
-        parser: Literal['html.parser', 'lxml', 'xml', 'html5lib'] = 'lxml',
+        parser: BeautifulSoupParser = 'lxml',
        additional_http_error_status_codes: Iterable[int] = (),
        ignore_http_error_status_codes: Iterable[int] = (),
        **kwargs: Unpack[BasicCrawlerOptions[BeautifulSoupCrawlingContext]],
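
A minimal sketch (not part of the patch) of how downstream code could reuse the newly exported BeautifulSoupParser alias instead of repeating the Literal; the make_crawler helper is hypothetical and only illustrates the annotation.

    # Hypothetical usage: annotate with the alias rather than re-typing the Literal.
    from crawlee.beautifulsoup_crawler import BeautifulSoupCrawler, BeautifulSoupParser

    def make_crawler(parser: BeautifulSoupParser = 'lxml') -> BeautifulSoupCrawler:
        # The alias covers the same parser names the crawler accepts:
        # 'html.parser', 'lxml', 'xml', 'html5lib'.
        return BeautifulSoupCrawler(parser=parser)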