diff --git a/Makefile b/Makefile
index 2e34e4d..25e8686 100644
--- a/Makefile
+++ b/Makefile
@@ -1,5 +1,5 @@
 init:
-	pip3 install -e .[dev]
+	pip3 install -e .[dev,async]
 
 test:
 	pytest -p no:cacheprovider
diff --git a/README.md b/README.md
index 85857af..6ea71fd 100644
--- a/README.md
+++ b/README.md
@@ -1,9 +1,10 @@
 # ScrapingAnt API client for Python
+
 [![PyPI version](https://badge.fury.io/py/scrapingant-client.svg)](https://badge.fury.io/py/scrapingant-client)
 
-`scrapingant-client` is the official library to access [ScrapingAnt API](https://docs.scrapingant.com) from your
-Python applications. It provides useful features like parameters encoding to improve the ScrapingAnt usage experience.
-Requires python 3.6+.
+`scrapingant-client` is the official library to access [ScrapingAnt API](https://docs.scrapingant.com) from your Python
+applications. It provides useful features like parameter encoding to improve the ScrapingAnt usage experience. Requires
+Python 3.6+.
@@ -17,6 +18,7 @@ Requires python 3.6+.
 
 ## Quick Start
+
 ```python3
 from scrapingant_client import ScrapingAntClient
 
@@ -26,15 +28,29 @@ result = client.general_request('https://example.com')
 print(result.content)
 ```
 
+## Install
+
+```shell
+pip install scrapingant-client
+```
+
+If you need async support:
+
+```shell
+pip install scrapingant-client[async]
+```
+
 ## API token
+
 In order to get an API token, you'll need to register at [ScrapingAnt Service](https://app.scrapingant.com)
 
 ## API Reference
+
 All public classes, methods and their parameters can be inspected in this API reference.
 
 #### ScrapingAntClient(token)
 
-Main class of this library.
+The main class of this library.
 
 | Param | Type |
 | --- | --- |
@@ -42,7 +58,7 @@ Main class of this library.
 
 * * *
 
-#### ScrapingAntClient.general_request
+#### ScrapingAntClient.general_request and ScrapingAntClient.general_request_async
 
 https://docs.scrapingant.com/request-response-format#available-parameters
@@ -63,6 +79,7 @@ https://docs.scrapingant.com/request-response-format#available-parameters
 * * *
 
 #### Cookie
+
 Class defining a cookie. Currently, it supports only name and value.
 
 | Param | Type |
@@ -73,7 +90,8 @@ Class defining cookie. Currently it supports only name and value
 * * *
 
 #### Response
-Class defining response from API.
+
+Class defining the response from the API.
 
 | Param | Type |
 | --- | --- |
@@ -83,11 +101,11 @@ Class defining response from API.
 
 ## Exceptions
 
-`ScrapingantClientException` is base Exception class, used for all errors.
+`ScrapingantClientException` is the base exception class, used for all errors.
 
 | Exception | Reason |
 | --- | --- |
-| ScrapingantInvalidTokenException | The API token is wrong or you have exceeded the API calls request limit
+| ScrapingantInvalidTokenException | The API token is wrong or you have exceeded the API calls request limit |
 | ScrapingantInvalidInputException | Invalid value provided. Please, look into error message for more info |
 | ScrapingantInternalException | Something went wrong with the server side code. Try again later or contact ScrapingAnt support |
 | ScrapingantSiteNotReachableException | The requested URL is not reachable. Please, check it locally |
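Since every error in the table above derives from `ScrapingantClientException`, callers can use a single handler for all of them. A minimal sketch (assuming the base class is exported at the package top level, like the concrete exceptions):

```python
from scrapingant_client import ScrapingAntClient, ScrapingantClientException

client = ScrapingAntClient(token='<YOUR-SCRAPINGANT-API-TOKEN>')

try:
    result = client.general_request('https://example.com')
    print(result.status_code, len(result.content))
except ScrapingantClientException as e:
    # Any of the errors listed in the table above lands here.
    print(f'ScrapingAnt request failed: {repr(e)}')
```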
@@ -106,7 +124,7 @@ from scrapingant_client import Cookie
 client = ScrapingAntClient(token='<YOUR-SCRAPINGANT-API-TOKEN>')
 
 result = client.general_request(
-    'https://httpbin.org/cookies', 
+    'https://httpbin.org/cookies',
     cookies=[
         Cookie(name='cookieName1', value='cookieVal1'),
         Cookie(name='cookieName2', value='cookieVal2'),
@@ -122,6 +140,7 @@ response_cookies = result.cookies
 
 ```python
 from scrapingant_client import ScrapingAntClient
+
 client = ScrapingAntClient(token='<YOUR-SCRAPINGANT-API-TOKEN>')
 
 customJsSnippet = """
@@ -130,7 +149,7 @@ var htmlElement = document.getElementsByTagName('html')[0];
 htmlElement.innerHTML = str;
 """
 result = client.general_request(
-    'https://example.com', 
+    'https://example.com',
     js_snippet=customJsSnippet,
 )
 print(result.content)
@@ -145,14 +164,16 @@ client = ScrapingAntClient(token='<YOUR-SCRAPINGANT-API-TOKEN>')
 
 RETRIES_COUNT = 3
 
+
 def parse_html(html: str):
     ...  # Implement your data extraction here
 
+
 parsed_data = None
 for retry_number in range(RETRIES_COUNT):
     try:
         scrapingant_response = client.general_request(
-            'https://example.com', 
+            'https://example.com',
         )
     except ScrapingantInvalidInputException as e:
         print(f'Got invalid input exception: {repr(e)}')
@@ -167,7 +188,6 @@ for retry_number in range(RETRIES_COUNT):
             break  # Data is parsed successfully, so we don't need to retry
         except Exception as e:
             print(f'Got exception while parsing data {repr(e)}')
-
 if parsed_data is None:
     print(f'Failed to retrieve and parse data after {RETRIES_COUNT} tries')
@@ -184,7 +204,7 @@ from scrapingant_client import ScrapingAntClient
 client = ScrapingAntClient(token='<YOUR-SCRAPINGANT-API-TOKEN>')
 
 result = client.general_request(
-    'https://httpbin.org/headers', 
+    'https://httpbin.org/headers',
     headers={
         'test-header': 'test-value'
     }
@@ -193,13 +213,32 @@ print(result.content)
 
 # HTTP basic auth example
 result = client.general_request(
-    'https://jigsaw.w3.org/HTTP/Basic/', 
+    'https://jigsaw.w3.org/HTTP/Basic/',
     headers={'Authorization': 'Basic Z3Vlc3Q6Z3Vlc3Q='}
 )
 print(result.content)
 ```
 
+### Simple async example
+
+```python3
+import asyncio
+
+from scrapingant_client import ScrapingAntClient
+
+client = ScrapingAntClient(token='<YOUR-SCRAPINGANT-API-TOKEN>')
+
+
+async def main():
+    # Scrape the example.com site
+    result = await client.general_request_async('https://example.com')
+    print(result.content)
+
+
+asyncio.run(main())
+```
+
 ## Useful links
+
 - [ScrapingAnt API documentation](https://docs.scrapingant.com)
 - [ScrapingAnt JS client](https://github.com/scrapingant/scrapingant-client-js)
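The new `general_request_async` entry point also composes with `asyncio.gather` for concurrent scraping. A minimal sketch (the URL list is illustrative):

```python
import asyncio

from scrapingant_client import ScrapingAntClient

client = ScrapingAntClient(token='<YOUR-SCRAPINGANT-API-TOKEN>')


async def main():
    urls = ['https://example.com', 'https://httpbin.org/headers']  # illustrative
    # Each call runs independently, so slow pages don't serialize the batch.
    results = await asyncio.gather(*[client.general_request_async(url) for url in urls])
    for url, result in zip(urls, results):
        print(url, result.status_code)


asyncio.run(main())
```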
diff --git a/scrapingant_client/__init__.py b/scrapingant_client/__init__.py
index ca457a3..1152584 100644
--- a/scrapingant_client/__init__.py
+++ b/scrapingant_client/__init__.py
@@ -1,4 +1,4 @@
-__version__ = "0.3.9"
+__version__ = "1.0.0"
 
 from scrapingant_client.client import ScrapingAntClient
 from scrapingant_client.cookie import Cookie
diff --git a/scrapingant_client/client.py b/scrapingant_client/client.py
index 2c03ecc..949f773 100644
--- a/scrapingant_client/client.py
+++ b/scrapingant_client/client.py
@@ -25,24 +25,23 @@ def __init__(self, token: str):
         self.token = token
         self.requests_session = requests.Session()
         version = scrapingant_client.__version__
-        user_agent = f'ScrapingAnt Client/{version} ({sys.platform}; Python/{platform.python_version()});'
+        self.user_agent = f'ScrapingAnt Client/{version} ({sys.platform}; Python/{platform.python_version()});'
         self.requests_session.headers.update({
             'x-api-key': self.token,
-            'User-Agent': user_agent,
+            'User-Agent': self.user_agent,
         })
 
-    def general_request(
+    def _form_payload(
             self,
             url: str,
             cookies: Optional[List[Cookie]] = None,
-            headers: Optional[Dict[str, str]] = None,
             js_snippet: Optional[str] = None,
             proxy_type: ProxyType = ProxyType.datacenter,
             proxy_country: Optional[str] = None,
             return_text: bool = False,
             wait_for_selector: Optional[str] = None,
             browser: bool = True,
-    ) -> Response:
+    ) -> Dict:
         request_data = {'url': url}
         if cookies is not None:
             request_data['cookies'] = cookies_list_to_string(cookies)
@@ -56,29 +55,97 @@ def general_request(
             request_data['wait_for_selector'] = wait_for_selector
         request_data['return_text'] = return_text
         request_data['browser'] = browser
+        return request_data
 
-        response = self.requests_session.post(
-            SCRAPINGANT_API_BASE_URL + '/general',
-            json=request_data,
-            headers=convert_headers(headers),
-        )
-        if response.status_code == 403:
+    def _parse_response(self, response_status_code: int, response_data: Dict, url: str) -> Response:
+        if response_status_code == 403:
             raise ScrapingantInvalidTokenException()
-        elif response.status_code == 404:
+        elif response_status_code == 404:
             raise ScrapingantSiteNotReachableException(url)
-        elif response.status_code == 422:
-            raise ScrapingantInvalidInputException(response.text)
-        elif response.status_code == 423:
+        elif response_status_code == 422:
+            raise ScrapingantInvalidInputException(response_data)
+        elif response_status_code == 423:
             raise ScrapingantDetectedException()
-        elif response.status_code == 500:
+        elif response_status_code == 500:
             raise ScrapingantInternalException()
-        json_response = response.json()
-        content = json_response['content']
-        cookies_string = json_response['cookies']
-        status_code = json_response['status_code']
+        content = response_data['content']
+        cookies_string = response_data['cookies']
+        status_code = response_data['status_code']
         cookies_list = cookies_list_from_string(cookies_string)
         return Response(
             content=content,
             cookies=cookies_list,
             status_code=status_code
         )
+
+    def general_request(
+            self,
+            url: str,
+            cookies: Optional[List[Cookie]] = None,
+            headers: Optional[Dict[str, str]] = None,
+            js_snippet: Optional[str] = None,
+            proxy_type: ProxyType = ProxyType.datacenter,
+            proxy_country: Optional[str] = None,
+            return_text: bool = False,
+            wait_for_selector: Optional[str] = None,
+            browser: bool = True,
+    ) -> Response:
+        request_data = self._form_payload(
+            url=url,
+            cookies=cookies,
+            js_snippet=js_snippet,
+            proxy_type=proxy_type,
+            proxy_country=proxy_country,
+            return_text=return_text,
+            wait_for_selector=wait_for_selector,
+            browser=browser,
+        )
+        response = self.requests_session.post(
+            SCRAPINGANT_API_BASE_URL + '/general',
+            json=request_data,
+            headers=convert_headers(headers),
+        )
+        response_status_code = response.status_code
+        response_data = response.json()
+        parsed_response: Response = self._parse_response(response_status_code, response_data, url)
+        return parsed_response
+
+    async def general_request_async(
+            self,
+            url: str,
+            cookies: Optional[List[Cookie]] = None,
+            headers: Optional[Dict[str, str]] = None,
+            js_snippet: Optional[str] = None,
+            proxy_type: ProxyType = ProxyType.datacenter,
+            proxy_country: Optional[str] = None,
+            return_text: bool = False,
+            wait_for_selector: Optional[str] = None,
+            browser: bool = True,
+    ) -> Response:
+        import httpx
+
+        request_data = self._form_payload(
+            url=url,
+            cookies=cookies,
+            js_snippet=js_snippet,
+            proxy_type=proxy_type,
+            proxy_country=proxy_country,
+            return_text=return_text,
+            wait_for_selector=wait_for_selector,
+            browser=browser,
+        )
+        async with httpx.AsyncClient(
+                headers={
+                    'x-api-key': self.token,
+                    'User-Agent': self.user_agent,
+                }
+        ) as client:
+            response = await client.post(
+                SCRAPINGANT_API_BASE_URL + '/general',
+                json=request_data,
+                headers=convert_headers(headers),
+            )
+        response_status_code = response.status_code
+        response_data = response.json()
+        parsed_response: Response = self._parse_response(response_status_code, response_data, url)
+        return parsed_response
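Note that `httpx` is imported lazily inside `general_request_async`, so a base install (which only pins `requests`) keeps working without the `async` extra; the trade-off is that a missing extra only surfaces as a plain `ModuleNotFoundError` at call time. One possible hardening, a sketch and not part of this diff, is to wrap the import with a pointer to the extra:

```python
try:
    import httpx
except ImportError as e:
    # Turn the bare ModuleNotFoundError into an actionable message.
    raise ImportError(
        "general_request_async requires the 'async' extra: "
        "pip install scrapingant-client[async]"
    ) from e
```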
diff --git a/setup.py b/setup.py
index a580816..030613e 100644
--- a/setup.py
+++ b/setup.py
@@ -38,9 +38,14 @@
     install_requires=['requests>=2,<3'],
     extras_require={
         'dev': [
-            'pytest>=6,<7',
-            'flake8>=3,<4',
-            'responses>=0,<1'
-        ]
+            'pytest>=7,<8',
+            'flake8>=4,<5',
+            'responses>=0,<1',
+            'pytest-httpx>=0,<1',
+            'pytest-asyncio>=0,<1',
+        ],
+        'async': [
+            'httpx<1',
+        ],
     },
 )
diff --git a/tests/test_exceptions.py b/tests/test_exceptions.py
index 26d886f..a342b9f 100644
--- a/tests/test_exceptions.py
+++ b/tests/test_exceptions.py
@@ -28,7 +28,7 @@ def test_invalid_input():
     client = ScrapingAntClient(token='some_token')
     with pytest.raises(ScrapingantInvalidInputException) as e:
         client.general_request('bad_url')
-    assert '{"detail": "wrong url"}' in str(e)
+    assert 'wrong url' in str(e)
 
 
 @responses.activate
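The loosened assertion tracks the `client.py` change above: a 422 response now raises `ScrapingantInvalidInputException(response_data)` with the decoded JSON dict, so `str(e)` contains the dict's repr rather than the raw JSON text. A quick illustration (assuming the exception class is exported at the package top level and stores its argument like a plain `Exception` subclass):

```python
from scrapingant_client import ScrapingantInvalidInputException

e = ScrapingantInvalidInputException({'detail': 'wrong url'})
assert 'wrong url' in str(e)  # the new, representation-agnostic check
assert '{"detail": "wrong url"}' not in str(e)  # the old assertion would now fail
```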
diff --git a/tests/test_integration.py b/tests/test_integration.py
new file mode 100644
index 0000000..4179521
--- /dev/null
+++ b/tests/test_integration.py
@@ -0,0 +1,106 @@
+import json
+
+import pytest
+import responses
+from pytest_httpx import HTTPXMock
+
+from scrapingant_client import ScrapingAntClient, Cookie, ProxyType
+from scrapingant_client.constants import SCRAPINGANT_API_BASE_URL
+
+
+@responses.activate
+def test_integration():
+    client = ScrapingAntClient(token='test_token')
+    responses.add(
+        responses.POST,
+        url=SCRAPINGANT_API_BASE_URL + '/general',
+        json={
+            "content": "test_content",
+            "cookies": "test_key1=test_value1;test_key2=test_value2",
+            "status_code": 200,
+        },
+        status=200,
+    )
+    response = client.general_request(
+        url='http://example.com',
+        cookies=[Cookie('test_name', 'test_value')],
+        headers={'testheader': 'test_header_value'},
+        js_snippet='test_js_string',
+        proxy_type=ProxyType.datacenter,
+        proxy_country='test_country',
+        return_text=True,
+        wait_for_selector='test_selector',
+        browser=True,
+    )
+    expected = {
+        'content': 'test_content',
+        'cookies': [Cookie('test_key1', 'test_value1'), Cookie('test_key2', 'test_value2')],
+        'status_code': 200
+    }
+    assert response.__dict__ == expected
+    assert len(responses.calls) == 1
+
+    expected_body = {
+        'browser': True,
+        'cookies': 'test_name=test_value',
+        'js_snippet': 'dGVzdF9qc19zdHJpbmc=',
+        'proxy_country': 'test_country',
+        'proxy_type': 'datacenter',
+        'return_text': True,
+        'url': 'http://example.com',
+        'wait_for_selector': 'test_selector',
+    }
+    assert json.loads(responses.calls[0].request.body) == expected_body
+
+    headers = responses.calls[0].request.headers
+    assert headers['ant-testheader'] == 'test_header_value'
+    assert headers['x-api-key'] == 'test_token'
+
+
+@pytest.mark.asyncio
+async def test_integration_async(httpx_mock: HTTPXMock):
+    client = ScrapingAntClient(token='test_token')
+    httpx_mock.add_response(
+        method="POST",
+        url=SCRAPINGANT_API_BASE_URL + '/general',
+        json={
+            "content": "test_content",
+            "cookies": "test_key1=test_value1;test_key2=test_value2",
+            "status_code": 200,
+        },
+        status_code=200,
+    )
+    response = await client.general_request_async(
+        url='http://example.com',
+        cookies=[Cookie('test_name', 'test_value')],
+        headers={'testheader': 'test_header_value'},
+        js_snippet='test_js_string',
+        proxy_type=ProxyType.datacenter,
+        proxy_country='test_country',
+        return_text=True,
+        wait_for_selector='test_selector',
+        browser=True,
+    )
+    expected = {
+        'content': 'test_content',
+        'cookies': [Cookie('test_key1', 'test_value1'), Cookie('test_key2', 'test_value2')],
+        'status_code': 200
+    }
+    assert response.__dict__ == expected
+    assert len(httpx_mock.get_requests()) == 1
+
+    expected_body = {
+        'browser': True,
+        'cookies': 'test_name=test_value',
+        'js_snippet': 'dGVzdF9qc19zdHJpbmc=',
+        'proxy_country': 'test_country',
+        'proxy_type': 'datacenter',
+        'return_text': True,
+        'url': 'http://example.com',
+        'wait_for_selector': 'test_selector',
+    }
+    assert json.loads(httpx_mock.get_requests()[0].content) == expected_body
+
+    headers = httpx_mock.get_requests()[0].headers
+    assert headers['ant-testheader'] == 'test_header_value'
+    assert headers['x-api-key'] == 'test_token'
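The sync and async integration tests assert the same request payload and headers. If more transports are added, a shared helper would keep the two in lockstep; a sketch (the helper and constant names are illustrative):

```python
import json

EXPECTED_BODY = {
    'browser': True,
    'cookies': 'test_name=test_value',
    'js_snippet': 'dGVzdF9qc19zdHJpbmc=',
    'proxy_country': 'test_country',
    'proxy_type': 'datacenter',
    'return_text': True,
    'url': 'http://example.com',
    'wait_for_selector': 'test_selector',
}


def assert_expected_request(body, headers):
    # Works for both captures: responses exposes request.body (str/bytes),
    # httpx_mock exposes request.content (bytes); json.loads accepts either.
    assert json.loads(body) == EXPECTED_BODY
    assert headers['ant-testheader'] == 'test_header_value'
    assert headers['x-api-key'] == 'test_token'
```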