
Commit

bug fix
liangbaika committed Jan 11, 2021
1 parent 95b0795 commit 6cce944
Showing 15 changed files with 123 additions and 4,397 deletions.
2 changes: 1 addition & 1 deletion LICENSE
@@ -186,7 +186,7 @@
       same "printed page" as the copyright notice for easier
       identification within third-party archives.
 
-   Copyright [yyyy] [name of copyright owner]
+   Copyright [2021] [smart-spider,liangbaikai]
 
    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
24 changes: 8 additions & 16 deletions Pipfile
@@ -4,30 +4,22 @@ url = "https://mirrors.163.com/pypi/simple/"
 verify_ssl = true
 
 [dev-packages]
-#mypy = "*"
-#fastapi = "*"
-#uvicorn = "*"
-#jinja2 = "*"
 pytest = "*"
 
+mkdocs = "*"
+pymysql = "*"
+aiomysql = "*"
+pyppeteer = "*"
+ruia = "*"
+ruia-ua = "*"
+requests = "*"
+fastapi = "*"
 [packages]
 aiohttp = "*"
 lxml = "*"
-#bitarray = "*"
-requests = "*"
-fastapi = "*"
 uvicorn = {extras = ["standard"],version = "*"}
 python-multipart = "*"
-ruia = "*"
-ruia-ua = "*"
 jsonpath = "*"
 parsel = "*"
-pytest = "*"
-pyppeteer = "*"
-pymysql = "*"
-aiomysql = "*"
-mkdocs = "*"
 cchardet = "*"
 
 [requires]
 python_version = "3.7"
5 changes: 3 additions & 2 deletions launcher.py
@@ -9,6 +9,7 @@
 from smart.runer import CrawStater
 from spiders.db.sanicdb import SanicDB
 from spiders.govs import GovsSpider, ArticelItem
+from spiders.image_spider import ImageSpider
 from spiders.ipspider2 import IpSpider3, GovSpider, IpSpider, ApiSpider
 from spiders.js.js_spider import JsSpider, Broswer
 from spiders.json_spider import JsonSpider
@@ -23,7 +24,7 @@ async def do_pip(spider_ins, item):
 
 
 @piplinestest.pipline(2)
-def do_pip2(spider_ins, item):
+def pip2(spider_ins, item):
     print(f"我是item2 {item.results}")
     return item
 
@@ -62,4 +63,4 @@ def start1():
     spider1 = GovsSpider()
     spider2 = JsonSpider()
     js_spider = JsSpider()
-    starter.run_many([IpSpider()], middlewire=middleware2, pipline=piplinestest)
+    starter.run_many([spider1], middlewire=middleware2, pipline=piplinestest)
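
Pulled out of the diff, the launch pattern looks like this — a minimal sketch, not from the commit. The import paths for Piplines and Middleware are assumptions inferred from the decorator and keyword names in launcher.py; only CrawStater and GovsSpider are confirmed by the diff above:

    from smart.middlewire import Middleware   # assumed location, not shown in this commit
    from smart.pipline import Piplines        # assumed location, not shown in this commit
    from smart.runer import CrawStater        # imported in launcher.py above
    from spiders.govs import GovsSpider       # imported in launcher.py above

    piplinestest = Piplines()
    middleware2 = Middleware()

    @piplinestest.pipline(1)
    async def do_pip(spider_ins, item):
        # priority 1 runs first; each stage must return the item for the next one
        print(f"item1 {item.results}")
        return item

    @piplinestest.pipline(2)
    def pip2(spider_ins, item):
        # priority 2 runs next; sync and async stages both appear in the diff
        print(f"item2 {item.results}")
        return item

    if __name__ == "__main__":
        starter = CrawStater()
        spider1 = GovsSpider()
        # the fix: run the GovsSpider instance built in start1(), not a stray IpSpider()
        starter.run_many([spider1], middlewire=middleware2, pipline=piplinestest)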
11 changes: 3 additions & 8 deletions smart/core.py
@@ -8,12 +8,9 @@
 import asyncio
 import importlib
 import inspect
-import time
-import traceback
 import uuid
 from asyncio import Lock
 from collections import deque
-from contextlib import suppress
 from typing import Dict
 
 from smart.log import log
@@ -101,9 +98,7 @@ def _check_complete_callback(self, task):
 
     async def start(self):
         self.spider.on_start()
-        # self.spider
         self.request_generator_queue.append((self.spider, iter(self.spider)))
-        # self.request_generator_queue.append( iter(self.spider))
         # core implenment
         while not self.stop:
             # paused
@@ -120,7 +115,6 @@
 
             request = self.scheduler.get()
             can_stop = self._check_can_stop(request)
-            # if request is None and not self.task_dict:
             if can_stop:
                 # there is no request and the task has been completed.so ended
                 self.log.debug(
@@ -134,7 +128,7 @@
 
             if resp is None:
                 # let the_downloader can be scheduled, test 0.001-0.0006 is better
-                await asyncio.sleep(0.0005)
+                await asyncio.sleep(0.005)
                 continue
 
             custome_callback = resp.request.callback
@@ -148,8 +142,9 @@
 
         self.spider.state = "closed"
         self.spider.on_close()
-        self.log.debug(f" engine stoped..")
+        # wait some resource to freed
         await asyncio.sleep(0.15)
+        self.log.debug(f" engine stoped..")
 
     def pause(self):
         self.log.info(f" out called pause.. so engine will pause.. ")
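
The 0.0005 → 0.005 change above is easy to misread: that sleep is the engine's only await on its idle path, so it is what hands control to the downloader coroutines. A self-contained sketch of the pattern (simplified names, no smart imports; the 0.005s figure is the commit's, everything else is illustrative):

    import asyncio

    async def engine_loop(queue: asyncio.Queue, state: dict) -> None:
        # Simplified version of smart/core.py's start(): poll for a response,
        # and when there is none, sleep briefly so other coroutines can run.
        # Stop only when the producer is done AND the queue is drained,
        # echoing "there is no request and the task has been completed".
        while not (state["done"] and queue.empty()):
            if queue.empty():
                # The commit raises this pause from 0.0005s to 0.005s:
                # slightly more latency per poll, far fewer wasted wake-ups.
                await asyncio.sleep(0.005)
                continue
            resp = queue.get_nowait()
            print("handled", resp)

    async def downloader(queue: asyncio.Queue, state: dict) -> None:
        # Stand-in for the real downloader: produce three fake responses.
        for i in range(3):
            await asyncio.sleep(0.01)   # simulate network I/O
            await queue.put(f"response {i}")
        state["done"] = True            # no more work: let the loop end

    async def main() -> None:
        queue, state = asyncio.Queue(), {"done": False}
        await asyncio.gather(engine_loop(queue, state),
                             downloader(queue, state))

    asyncio.run(main())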
37 changes: 15 additions & 22 deletions smart/downloader.py
@@ -29,31 +29,20 @@ def fetch(self, request: Request) -> Response:
         pass
 
 
-# class RequestsDown(BaseDown):
-#     def fetch(self, request: Request) -> Response:
-#         import requests
-#         res = requests.get(request.url,
-#                            timeout=request.timeout or 3,
-#                            )
-#         response = Response(body=res.content, request=request,
-#                             headers=res.headers,
-#                             cookies=res.cookies,
-#                             status=res.status_code)
-#         return response
-
-
 class AioHttpDown(BaseDown):
 
     async def fetch(self, request: Request) -> Response:
-        async with aiohttp.ClientSession() as clicnt:
-            resp = await clicnt.request(request.method,
-                                        request.url,
-                                        timeout=request.timeout or 10,
-                                        headers=request.header or {},
-                                        cookies=request.cookies or {},
-                                        data=request.data or {},
-                                        **request.extras or {}
-                                        )
+        session = None
+        try:
+            session = request.session or aiohttp.ClientSession()
+            resp = await session.request(request.method,
+                                         request.url,
+                                         timeout=request.timeout or 10,
+                                         headers=request.header or {},
+                                         cookies=request.cookies or {},
+                                         data=request.data or {},
+                                         **request.extras or {}
+                                         )
             byte_content = await resp.read()
             headers = {}
             if resp.headers:
@@ -63,6 +52,9 @@ async def fetch(self, request: Request) -> Response:
                                 headers=headers,
                                 cookies=resp.cookies
                                 )
+        finally:
+            if request.session is None and session:
+                await session.close()
         return response
@@ -78,6 +70,7 @@ def __init__(self, scheduler: Scheduler, middwire: Middleware = None, seq=100, d
         # the real to fetch resource from internet
         self.downer = downer
         self.log.info(f" downer loaded {self.downer.__class__.__name__}")
+
     async def download(self, request: Request):
         spider = request.__spider__
         max_retry = spider.cutome_setting_dict.get("req_max_retry") or gloable_setting_dict.get(
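
The rewritten AioHttpDown.fetch is a borrow-or-own resource pattern: reuse request.session when the caller supplies one, otherwise create a session and close it in the finally block. A stripped-down sketch of the same idea, assuming only that aiohttp is installed (the fetch signature here is illustrative, not the project's):

    import asyncio
    from typing import Optional

    import aiohttp

    async def fetch(url: str,
                    session: Optional[aiohttp.ClientSession] = None) -> bytes:
        # Borrow-or-own: reuse the caller's session if one is passed,
        # otherwise create one and make sure we close it ourselves.
        owned = None
        try:
            if session is None:
                owned = aiohttp.ClientSession()   # we own it, so we close it
                session = owned
            async with session.get(url,
                                   timeout=aiohttp.ClientTimeout(total=10)) as resp:
                return await resp.read()
        finally:
            if owned is not None:
                await owned.close()

    async def main() -> None:
        # one-shot call: fetch() owns (and closes) a throwaway session
        page = await fetch("https://example.com")
        # repeated calls: the caller keeps one session open for connection reuse
        async with aiohttp.ClientSession() as s:
            a = await fetch("https://example.com", session=s)
            b = await fetch("https://example.org", session=s)
        print(len(page), len(a), len(b))

    asyncio.run(main())

Closing only the session you own is exactly what the finally guard in the diff does: closing a caller-supplied session would destroy its connection pool for every later request.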