Merge pull request #27 from ScrapingAnt/feature/add-async-client-support

feature/add-async-client-support: done

megabotan authored Jun 30, 2022
2 parents 10694a2 + c8644f7 commit 05f3948

Showing 7 changed files with 258 additions and 41 deletions.
2 changes: 1 addition & 1 deletion Makefile
@@ -1,5 +1,5 @@
init:
	pip3 install -e .[dev]
	pip3 install -e .[dev,async]

test:
	pytest -p no:cacheprovider
67 changes: 53 additions & 14 deletions README.md
@@ -1,9 +1,10 @@
# ScrapingAnt API client for Python

[![PyPI version](https://badge.fury.io/py/scrapingant-client.svg)](https://badge.fury.io/py/scrapingant-client)

`scrapingant-client` is the official library to access [ScrapingAnt API](https://docs.scrapingant.com) from your Python
applications. It provides useful features like parameters encoding to improve the ScrapingAnt usage experience. Requires
Python 3.6+.

<!-- toc -->

@@ -17,6 +18,7 @@ Requires python 3.6+.
<!-- tocstop -->

## Quick Start

```python3
from scrapingant_client import ScrapingAntClient

@@ -26,23 +28,37 @@ result = client.general_request('https://example.com')
print(result.content)
```

## Install

```shell
pip install scrapingant-client
```

If you need async support:

```shell
pip install scrapingant-client[async]
```

## API token

In order to get an API token, you'll need to register at [ScrapingAnt Service](https://app.scrapingant.com).

## API Reference

All public classes, methods and their parameters can be inspected in this API reference.

#### ScrapingAntClient(token)

Main class of this library.

| Param | Type |
| --- | --- |
| token | <code>string</code> |

* * *

#### ScrapingAntClient.general_request
#### ScrapingAntClient.general_request and ScrapingAntClient.general_request_async

https://docs.scrapingant.com/request-response-format#available-parameters

@@ -63,6 +79,7 @@ https://docs.scrapingant.com/request-response-format#available-parameters
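Both `general_request` and `general_request_async` take the same keyword parameters, as listed in the linked documentation. A minimal sketch using a few of them (the CSS selector value is illustrative; `return_text` and `browser` are shown at their library defaults):

```python
from scrapingant_client import ScrapingAntClient

client = ScrapingAntClient(token='<YOUR-SCRAPINGANT-API-TOKEN>')

# Wait for an element to appear before the page content is returned;
# the selector here is purely illustrative.
result = client.general_request(
    'https://example.com',
    wait_for_selector='h1',
    return_text=False,  # library default
    browser=True,  # library default
)
print(result.status_code)
```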
* * *

#### Cookie

Class defining a cookie. Currently it supports only name and value.

| Param | Type |
@@ -73,7 +90,8 @@ Class defining cookie. Currently it supports only name and value
* * *

#### Response

Class defining response from API.

| Param | Type |
| --- | --- |
@@ -83,11 +101,11 @@ Class defining response from API.

## Exceptions

`ScrapingantClientException` is the base exception class, used for all errors.

| Exception | Reason |
| --- | --- |
| ScrapingantInvalidTokenException | The API token is wrong or you have exceeded the API calls request limit |
| ScrapingantInvalidInputException | Invalid value provided. Please, look into error message for more info |
| ScrapingantInternalException | Something went wrong with the server side code. Try again later or contact ScrapingAnt support |
| ScrapingantSiteNotReachableException | The requested URL is not reachable. Please, check it locally |
@@ -106,7 +124,7 @@ from scrapingant_client import Cookie
client = ScrapingAntClient(token='<YOUR-SCRAPINGANT-API-TOKEN>')

result = client.general_request(
    'https://httpbin.org/cookies',
    cookies=[
        Cookie(name='cookieName1', value='cookieVal1'),
        Cookie(name='cookieName2', value='cookieVal2'),
@@ -122,6 +140,7 @@ response_cookies = result.cookies

```python
from scrapingant_client import ScrapingAntClient

client = ScrapingAntClient(token='<YOUR-SCRAPINGANT-API-TOKEN>')

customJsSnippet = """
Expand All @@ -130,7 +149,7 @@ var htmlElement = document.getElementsByTagName('html')[0];
htmlElement.innerHTML = str;
"""
result = client.general_request(
    'https://example.com',
    js_snippet=customJsSnippet,
)
print(result.content)
@@ -145,14 +164,16 @@ client = ScrapingAntClient(token='<YOUR-SCRAPINGANT-API-TOKEN>')

RETRIES_COUNT = 3


def parse_html(html: str):
    ...  # Implement your data extraction here


parsed_data = None
for retry_number in range(RETRIES_COUNT):
    try:
        scrapingant_response = client.general_request(
            'https://example.com',
        )
    except ScrapingantInvalidInputException as e:
        print(f'Got invalid input exception: {repr(e)}')
@@ -167,7 +188,6 @@ for retry_number in range(RETRIES_COUNT):
        break  # Data is parsed successfully, so we don't need to retry
    except Exception as e:
        print(f'Got exception while parsing data {repr(e)}')


if parsed_data is None:
    print(f'Failed to retrieve and parse data after {RETRIES_COUNT} tries')
@@ -184,7 +204,7 @@ from scrapingant_client import ScrapingAntClient
client = ScrapingAntClient(token='<YOUR-SCRAPINGANT-API-TOKEN>')

result = client.general_request(
    'https://httpbin.org/headers',
    headers={
        'test-header': 'test-value'
    }
@@ -193,13 +213,32 @@ print(result.content)

# HTTP basic auth example
result = client.general_request(
    'https://jigsaw.w3.org/HTTP/Basic/',
    headers={'Authorization': 'Basic Z3Vlc3Q6Z3Vlc3Q='}
)
print(result.content)
```

### Simple async example

```python3
import asyncio

from scrapingant_client import ScrapingAntClient

client = ScrapingAntClient(token='<YOUR-SCRAPINGANT-API-TOKEN>')


async def main():
    # Scrape the example.com site.
    result = await client.general_request_async('https://example.com')
    print(result.content)


asyncio.run(main())
```
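
### Concurrent async requests

Each `general_request_async` call opens its own `httpx.AsyncClient`, so independent requests can run concurrently. A minimal sketch using `asyncio.gather` (the URL list is illustrative):

```python3
import asyncio

from scrapingant_client import ScrapingAntClient

client = ScrapingAntClient(token='<YOUR-SCRAPINGANT-API-TOKEN>')

URLS = ['https://example.com', 'https://httpbin.org/headers']


async def main():
    # Fire all requests at once and wait for every response.
    results = await asyncio.gather(
        *(client.general_request_async(url) for url in URLS)
    )
    for url, result in zip(URLS, results):
        print(url, result.status_code)


asyncio.run(main())
```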

## Useful links

- [ScrapingAnt API documentation](https://docs.scrapingant.com)
- [ScrapingAnt JS Client](https://github.com/scrapingant/scrapingant-client-js)
2 changes: 1 addition & 1 deletion scrapingant_client/__init__.py
@@ -1,4 +1,4 @@
__version__ = "0.3.9"
__version__ = "1.0.0"

from scrapingant_client.client import ScrapingAntClient
from scrapingant_client.cookie import Cookie
107 changes: 87 additions & 20 deletions scrapingant_client/client.py
@@ -25,24 +25,23 @@ def __init__(self, token: str):
        self.token = token
        self.requests_session = requests.Session()
        version = scrapingant_client.__version__
        user_agent = f'ScrapingAnt Client/{version} ({sys.platform}; Python/{platform.python_version()});'
        self.user_agent = f'ScrapingAnt Client/{version} ({sys.platform}; Python/{platform.python_version()});'
        self.requests_session.headers.update({
            'x-api-key': self.token,
            'User-Agent': user_agent,
            'User-Agent': self.user_agent,
        })

    def general_request(
    def _form_payload(
        self,
        url: str,
        cookies: Optional[List[Cookie]] = None,
        headers: Optional[Dict[str, str]] = None,
        js_snippet: Optional[str] = None,
        proxy_type: ProxyType = ProxyType.datacenter,
        proxy_country: Optional[str] = None,
        return_text: bool = False,
        wait_for_selector: Optional[str] = None,
        browser: bool = True,
    ) -> Response:
    ) -> Dict:
        request_data = {'url': url}
        if cookies is not None:
            request_data['cookies'] = cookies_list_to_string(cookies)
@@ -56,29 +55,97 @@ def general_request(
        request_data['wait_for_selector'] = wait_for_selector
        request_data['return_text'] = return_text
        request_data['browser'] = browser
        return request_data

        response = self.requests_session.post(
            SCRAPINGANT_API_BASE_URL + '/general',
            json=request_data,
            headers=convert_headers(headers),
        )
        if response.status_code == 403:
    def _parse_response(self, response_status_code: int, response_data: Dict, url: str) -> Response:
        if response_status_code == 403:
            raise ScrapingantInvalidTokenException()
        elif response.status_code == 404:
        elif response_status_code == 404:
            raise ScrapingantSiteNotReachableException(url)
        elif response.status_code == 422:
            raise ScrapingantInvalidInputException(response.text)
        elif response.status_code == 423:
        elif response_status_code == 422:
            raise ScrapingantInvalidInputException(response_data)
        elif response_status_code == 423:
            raise ScrapingantDetectedException()
        elif response.status_code == 500:
        elif response_status_code == 500:
            raise ScrapingantInternalException()
        json_response = response.json()
        content = json_response['content']
        cookies_string = json_response['cookies']
        status_code = json_response['status_code']
        content = response_data['content']
        cookies_string = response_data['cookies']
        status_code = response_data['status_code']
        cookies_list = cookies_list_from_string(cookies_string)
        return Response(
            content=content,
            cookies=cookies_list,
            status_code=status_code
        )

    def general_request(
        self,
        url: str,
        cookies: Optional[List[Cookie]] = None,
        headers: Optional[Dict[str, str]] = None,
        js_snippet: Optional[str] = None,
        proxy_type: ProxyType = ProxyType.datacenter,
        proxy_country: Optional[str] = None,
        return_text: bool = False,
        wait_for_selector: Optional[str] = None,
        browser: bool = True,
    ) -> Response:
        request_data = self._form_payload(
            url=url,
            cookies=cookies,
            js_snippet=js_snippet,
            proxy_type=proxy_type,
            proxy_country=proxy_country,
            return_text=return_text,
            wait_for_selector=wait_for_selector,
            browser=browser,
        )
        response = self.requests_session.post(
            SCRAPINGANT_API_BASE_URL + '/general',
            json=request_data,
            headers=convert_headers(headers),
        )
        response_status_code = response.status_code
        response_data = response.json()
        parsed_response: Response = self._parse_response(response_status_code, response_data, url)
        return parsed_response

    async def general_request_async(
        self,
        url: str,
        cookies: Optional[List[Cookie]] = None,
        headers: Optional[Dict[str, str]] = None,
        js_snippet: Optional[str] = None,
        proxy_type: ProxyType = ProxyType.datacenter,
        proxy_country: Optional[str] = None,
        return_text: bool = False,
        wait_for_selector: Optional[str] = None,
        browser: bool = True,
    ) -> Response:
        import httpx

        request_data = self._form_payload(
            url=url,
            cookies=cookies,
            js_snippet=js_snippet,
            proxy_type=proxy_type,
            proxy_country=proxy_country,
            return_text=return_text,
            wait_for_selector=wait_for_selector,
            browser=browser,
        )
        async with httpx.AsyncClient(
            headers={
                'x-api-key': self.token,
                'User-Agent': self.user_agent,
            }
        ) as client:
            response = await client.post(
                SCRAPINGANT_API_BASE_URL + '/general',
                json=request_data,
                headers=convert_headers(headers),
            )
        response_status_code = response.status_code
        response_data = response.json()
        parsed_response: Response = self._parse_response(response_status_code, response_data, url)
        return parsed_response
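
After this split, `general_request` and `general_request_async` differ only in transport: both build the body with `_form_payload` and funnel the status code and parsed JSON through `_parse_response`. A minimal sketch of the shared error mapping, calling the private helper directly purely for illustration (the cookie-string format passed in is an assumption):

```python
from scrapingant_client import ScrapingAntClient

client = ScrapingAntClient(token='some_token')

# 4xx/5xx status codes are mapped to the typed exceptions from the README...
try:
    client._parse_response(422, {'detail': 'wrong url'}, 'https://example.com')
except Exception as e:
    print(repr(e))  # ScrapingantInvalidInputException(...)

# ...while a success payload is unpacked into a Response object.
result = client._parse_response(
    200,
    {'content': '<html></html>', 'cookies': 'name=value', 'status_code': 200},
    'https://example.com',
)
print(result.status_code)  # 200
```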
13 changes: 9 additions & 4 deletions setup.py
@@ -38,9 +38,14 @@
    install_requires=['requests>=2,<3'],
    extras_require={
        'dev': [
            'pytest>=6,<7',
            'flake8>=3,<4',
            'responses>=0,<1'
        ]
            'pytest>=7,<8',
            'flake8>=4,<5',
            'responses>=0,<1',
            'pytest-httpx>=0,<1',
            'pytest-asyncio>=0,<1',
        ],
        'async': [
            'httpx<1',
        ],
    },
)
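
Since `httpx` sits behind the new `async` extra and `general_request_async` imports it lazily, a plain `pip install scrapingant-client` keeps working; the missing dependency only surfaces at call time. A small sketch of how downstream code might guard for this (the exit message is illustrative):

```python
try:
    import httpx  # noqa: F401 -- provided by `pip install scrapingant-client[async]`
except ImportError:
    raise SystemExit('async support missing; install scrapingant-client[async]')
```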
2 changes: 1 addition & 1 deletion tests/test_exceptions.py
@@ -28,7 +28,7 @@ def test_invalid_input():
    client = ScrapingAntClient(token='some_token')
    with pytest.raises(ScrapingantInvalidInputException) as e:
        client.general_request('bad_url')
    assert '{"detail": "wrong url"}' in str(e)
    assert 'wrong url' in str(e)


@responses.activate
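
The relaxed assertion matches the new client behavior: `ScrapingantInvalidInputException` is now raised with the parsed JSON dict (`response_data`) instead of the raw `response.text`, so the exact-JSON-string check no longer applies. A hedged sketch of such a test with the `responses` mock library, matching any URL so the endpoint constant stays out of the test (the top-level exception import is an assumption based on the README examples):

```python
import re

import pytest
import responses

from scrapingant_client import ScrapingAntClient, ScrapingantInvalidInputException


@responses.activate
def test_invalid_input_message():
    # Intercept the POST regardless of the exact API endpoint URL.
    responses.add(responses.POST, re.compile('.*'), json={'detail': 'wrong url'}, status=422)
    client = ScrapingAntClient(token='some_token')
    with pytest.raises(ScrapingantInvalidInputException) as e:
        client.general_request('bad_url')
    # The exception message contains the dict contents, so assert on a substring.
    assert 'wrong url' in str(e)
```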