
Commit

bug fix
liangbaika committed Jan 11, 2021
1 parent 95b0795 commit 6cce944
Showing 15 changed files with 123 additions and 4,397 deletions.
2 changes: 1 addition & 1 deletion LICENSE
@@ -186,7 +186,7 @@
       same "printed page" as the copyright notice for easier
       identification within third-party archives.
 
-   Copyright [yyyy] [name of copyright owner]
+   Copyright [2021] [smart-spider,liangbaikai]
 
    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
24 changes: 8 additions & 16 deletions Pipfile
@@ -4,30 +4,22 @@ url = "https://mirrors.163.com/pypi/simple/"
 verify_ssl = true
 
 [dev-packages]
-#mypy = "*"
-#fastapi = "*"
-#uvicorn = "*"
-#jinja2 = "*"
 pytest = "*"
 
+mkdocs = "*"
+pymysql = "*"
+aiomysql = "*"
+pyppeteer = "*"
+ruia = "*"
+ruia-ua = "*"
+requests = "*"
+fastapi = "*"
 [packages]
 aiohttp = "*"
 lxml = "*"
-#bitarray = "*"
-requests = "*"
-fastapi = "*"
 uvicorn = {extras = ["standard"],version = "*"}
 python-multipart = "*"
-ruia = "*"
-ruia-ua = "*"
 jsonpath = "*"
 parsel = "*"
-pytest = "*"
-pyppeteer = "*"
-pymysql = "*"
-aiomysql = "*"
-mkdocs = "*"
 cchardet = "*"
 
 [requires]
 python_version = "3.7"
5 changes: 3 additions & 2 deletions launcher.py
@@ -9,6 +9,7 @@
 from smart.runer import CrawStater
 from spiders.db.sanicdb import SanicDB
 from spiders.govs import GovsSpider, ArticelItem
+from spiders.image_spider import ImageSpider
 from spiders.ipspider2 import IpSpider3, GovSpider, IpSpider, ApiSpider
 from spiders.js.js_spider import JsSpider, Broswer
 from spiders.json_spider import JsonSpider
@@ -23,7 +24,7 @@ async def do_pip(spider_ins, item):
 
 
 @piplinestest.pipline(2)
-def do_pip2(spider_ins, item):
+def pip2(spider_ins, item):
     print(f"我是item2 {item.results}")
     return item
 
@@ -62,4 +63,4 @@ def start1():
     spider1 = GovsSpider()
     spider2 = JsonSpider()
     js_spider = JsSpider()
-    starter.run_many([IpSpider()], middlewire=middleware2, pipline=piplinestest)
+    starter.run_many([spider1], middlewire=middleware2, pipline=piplinestest)
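
Pulled out of the diff, the launch pattern looks like this — a minimal sketch, not from the commit. The import paths for Piplines and Middleware are assumptions inferred from the decorator and keyword names in launcher.py; only CrawStater and GovsSpider are confirmed by the diff above:

    from smart.middlewire import Middleware   # assumed location, not shown in this commit
    from smart.pipline import Piplines        # assumed location, not shown in this commit
    from smart.runer import CrawStater        # imported in launcher.py above
    from spiders.govs import GovsSpider       # imported in launcher.py above

    piplinestest = Piplines()
    middleware2 = Middleware()

    @piplinestest.pipline(1)
    async def do_pip(spider_ins, item):
        # priority 1 runs first; each stage must return the item for the next one
        print(f"item1 {item.results}")
        return item

    @piplinestest.pipline(2)
    def pip2(spider_ins, item):
        # priority 2 runs next; sync and async stages both appear in the diff
        print(f"item2 {item.results}")
        return item

    if __name__ == "__main__":
        starter = CrawStater()
        spider1 = GovsSpider()
        # the fix: run the GovsSpider instance built in start1(), not a stray IpSpider()
        starter.run_many([spider1], middlewire=middleware2, pipline=piplinestest)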
11 changes: 3 additions & 8 deletions smart/core.py
@@ -8,12 +8,9 @@
 import asyncio
 import importlib
 import inspect
-import time
-import traceback
 import uuid
 from asyncio import Lock
 from collections import deque
-from contextlib import suppress
 from typing import Dict
 
 from smart.log import log
@@ -101,9 +98,7 @@ def _check_complete_callback(self, task):
 
     async def start(self):
         self.spider.on_start()
-        # self.spider
         self.request_generator_queue.append((self.spider, iter(self.spider)))
-        # self.request_generator_queue.append( iter(self.spider))
         # core implenment
         while not self.stop:
             # paused
@@ -120,7 +115,6 @@
 
             request = self.scheduler.get()
             can_stop = self._check_can_stop(request)
-            # if request is None and not self.task_dict:
             if can_stop:
                 # there is no request and the task has been completed.so ended
                 self.log.debug(
@@ -134,7 +128,7 @@
 
             if resp is None:
                 # let the_downloader can be scheduled, test 0.001-0.0006 is better
-                await asyncio.sleep(0.0005)
+                await asyncio.sleep(0.005)
                 continue
 
             custome_callback = resp.request.callback
@@ -148,8 +142,9 @@
 
         self.spider.state = "closed"
         self.spider.on_close()
-        self.log.debug(f" engine stoped..")
+        # wait some resource to freed
         await asyncio.sleep(0.15)
+        self.log.debug(f" engine stoped..")
 
     def pause(self):
         self.log.info(f" out called pause.. so engine will pause.. ")
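
The 0.0005 → 0.005 change above is easy to misread: that sleep is the engine's only await on its idle path, so it is what hands control to the downloader coroutines. A self-contained sketch of the pattern (simplified names, no smart imports; the 0.005s figure is the commit's, everything else is illustrative):

    import asyncio

    async def engine_loop(queue: asyncio.Queue, state: dict) -> None:
        # Simplified version of smart/core.py's start(): poll for a response,
        # and when there is none, sleep briefly so other coroutines can run.
        # Stop only when the producer is done AND the queue is drained,
        # echoing "there is no request and the task has been completed".
        while not (state["done"] and queue.empty()):
            if queue.empty():
                # The commit raises this pause from 0.0005s to 0.005s:
                # slightly more latency per poll, far fewer wasted wake-ups.
                await asyncio.sleep(0.005)
                continue
            resp = queue.get_nowait()
            print("handled", resp)

    async def downloader(queue: asyncio.Queue, state: dict) -> None:
        # Stand-in for the real downloader: produce three fake responses.
        for i in range(3):
            await asyncio.sleep(0.01)   # simulate network I/O
            await queue.put(f"response {i}")
        state["done"] = True            # no more work: let the loop end

    async def main() -> None:
        queue, state = asyncio.Queue(), {"done": False}
        await asyncio.gather(engine_loop(queue, state),
                             downloader(queue, state))

    asyncio.run(main())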
37 changes: 15 additions & 22 deletions smart/downloader.py
@@ -29,31 +29,20 @@ def fetch(self, request: Request) -> Response:
         pass
 
 
-# class RequestsDown(BaseDown):
-#     def fetch(self, request: Request) -> Response:
-#         import requests
-#         res = requests.get(request.url,
-#                            timeout=request.timeout or 3,
-#                            )
-#         response = Response(body=res.content, request=request,
-#                             headers=res.headers,
-#                             cookies=res.cookies,
-#                             status=res.status_code)
-#         return response
-
-
 class AioHttpDown(BaseDown):
 
     async def fetch(self, request: Request) -> Response:
-        async with aiohttp.ClientSession() as clicnt:
-            resp = await clicnt.request(request.method,
-                                        request.url,
-                                        timeout=request.timeout or 10,
-                                        headers=request.header or {},
-                                        cookies=request.cookies or {},
-                                        data=request.data or {},
-                                        **request.extras or {}
-                                        )
+        session = None
+        try:
+            session = request.session or aiohttp.ClientSession()
+            resp = await session.request(request.method,
+                                         request.url,
+                                         timeout=request.timeout or 10,
+                                         headers=request.header or {},
+                                         cookies=request.cookies or {},
+                                         data=request.data or {},
+                                         **request.extras or {}
+                                         )
             byte_content = await resp.read()
             headers = {}
             if resp.headers:
@@ -63,6 +52,9 @@ async def fetch(self, request: Request) -> Response:
                                 headers=headers,
                                 cookies=resp.cookies
                                 )
+        finally:
+            if request.session is None and session:
+                await session.close()
         return response
@@ -78,6 +70,7 @@ def __init__(self, scheduler: Scheduler, middwire: Middleware = None, seq=100, d
         # the real to fetch resource from internet
         self.downer = downer
         self.log.info(f" downer loaded {self.downer.__class__.__name__}")
+
     async def download(self, request: Request):
         spider = request.__spider__
         max_retry = spider.cutome_setting_dict.get("req_max_retry") or gloable_setting_dict.get(
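
The rewritten AioHttpDown.fetch is a borrow-or-own resource pattern: reuse request.session when the caller supplies one, otherwise create a session and close it in the finally block. A stripped-down sketch of the same idea, assuming only that aiohttp is installed (the fetch signature here is illustrative, not the project's):

    import asyncio
    from typing import Optional

    import aiohttp

    async def fetch(url: str,
                    session: Optional[aiohttp.ClientSession] = None) -> bytes:
        # Borrow-or-own: reuse the caller's session if one is passed,
        # otherwise create one and make sure we close it ourselves.
        owned = None
        try:
            if session is None:
                owned = aiohttp.ClientSession()   # we own it, so we close it
                session = owned
            async with session.get(url,
                                   timeout=aiohttp.ClientTimeout(total=10)) as resp:
                return await resp.read()
        finally:
            if owned is not None:
                await owned.close()

    async def main() -> None:
        # one-shot call: fetch() owns (and closes) a throwaway session
        page = await fetch("https://example.com")
        # repeated calls: the caller keeps one session open for connection reuse
        async with aiohttp.ClientSession() as s:
            a = await fetch("https://example.com", session=s)
            b = await fetch("https://example.org", session=s)
        print(len(page), len(a), len(b))

    asyncio.run(main())

Closing only the session you own is exactly what the finally guard in the diff does: closing a caller-supplied session would destroy its connection pool for every later request.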