diff --git a/LICENSE b/LICENSE
index 261eeb9..d1ef075 100644
--- a/LICENSE
+++ b/LICENSE
@@ -186,7 +186,7 @@
same "printed page" as the copyright notice for easier
identification within third-party archives.
- Copyright [yyyy] [name of copyright owner]
+ Copyright [2021] [smart-spider,liangbaikai]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
diff --git a/Pipfile b/Pipfile
index d815e4b..ab405cb 100644
--- a/Pipfile
+++ b/Pipfile
@@ -4,30 +4,22 @@ url = "https://mirrors.163.com/pypi/simple/"
verify_ssl = true
[dev-packages]
-#mypy = "*"
-#fastapi = "*"
-#uvicorn = "*"
-#jinja2 = "*"
pytest = "*"
-
+mkdocs = "*"
+pymysql = "*"
+aiomysql = "*"
+pyppeteer = "*"
+ruia = "*"
+ruia-ua = "*"
+requests = "*"
+fastapi = "*"
[packages]
aiohttp = "*"
lxml = "*"
-#bitarray = "*"
-requests = "*"
-fastapi = "*"
uvicorn = {extras = ["standard"],version = "*"}
python-multipart = "*"
-ruia = "*"
-ruia-ua = "*"
jsonpath = "*"
parsel = "*"
-pytest = "*"
-pyppeteer = "*"
-pymysql = "*"
-aiomysql = "*"
-mkdocs = "*"
cchardet = "*"
-
[requires]
python_version = "3.7"
diff --git a/launcher.py b/launcher.py
index db134b3..3442bb3 100644
--- a/launcher.py
+++ b/launcher.py
@@ -9,6 +9,7 @@
from smart.runer import CrawStater
from spiders.db.sanicdb import SanicDB
from spiders.govs import GovsSpider, ArticelItem
+from spiders.image_spider import ImageSpider
from spiders.ipspider2 import IpSpider3, GovSpider, IpSpider, ApiSpider
from spiders.js.js_spider import JsSpider, Broswer
from spiders.json_spider import JsonSpider
@@ -23,7 +24,7 @@ async def do_pip(spider_ins, item):
@piplinestest.pipline(2)
-def do_pip2(spider_ins, item):
+def pip2(spider_ins, item):
print(f"我是item2 {item.results}")
return item
@@ -62,4 +63,4 @@ def start1():
spider1 = GovsSpider()
spider2 = JsonSpider()
js_spider = JsSpider()
- starter.run_many([IpSpider()], middlewire=middleware2, pipline=piplinestest)
+ starter.run_many([spider1], middlewire=middleware2, pipline=piplinestest)
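For illustration only (not part of the patch): a minimal sketch of how a pipeline step and run_many fit together, mirroring the decorator and call shown above. piplinestest, middleware2 and the spider classes are assumed to be the objects already defined in launcher.py, and the keyword names middlewire/pipline follow the project's own spelling.

    # Sketch as it would appear inside launcher.py, where piplinestest and
    # middleware2 are already constructed.
    from smart.runer import CrawStater
    from spiders.govs import GovsSpider
    from spiders.json_spider import JsonSpider

    @piplinestest.pipline(3)  # the integer appears to order the pipeline steps (assumed)
    def log_item(spider_ins, item):
        print(f"pipeline step 3 got {item.results}")
        return item  # returning the item hands it on to the next registered step

    starter = CrawStater()
    starter.run_many([GovsSpider(), JsonSpider()], middlewire=middleware2, pipline=piplinestest)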
diff --git a/smart/core.py b/smart/core.py
index 1cf5b73..21a0b79 100644
--- a/smart/core.py
+++ b/smart/core.py
@@ -8,12 +8,9 @@
import asyncio
import importlib
import inspect
-import time
-import traceback
import uuid
from asyncio import Lock
from collections import deque
-from contextlib import suppress
from typing import Dict
from smart.log import log
@@ -101,9 +98,7 @@ def _check_complete_callback(self, task):
async def start(self):
self.spider.on_start()
- # self.spider
self.request_generator_queue.append((self.spider, iter(self.spider)))
- # self.request_generator_queue.append( iter(self.spider))
# core implementation
while not self.stop:
# paused
@@ -120,7 +115,6 @@ async def start(self):
request = self.scheduler.get()
can_stop = self._check_can_stop(request)
- # if request is None and not self.task_dict:
if can_stop:
# there is no pending request and all tasks have completed, so the engine ends
self.log.debug(
@@ -134,7 +128,7 @@ async def start(self):
if resp is None:
# sleep briefly so the downloader task can be scheduled (0.001-0.0006 tested as better)
- await asyncio.sleep(0.0005)
+ await asyncio.sleep(0.005)
continue
custome_callback = resp.request.callback
@@ -148,8 +142,9 @@ async def start(self):
self.spider.state = "closed"
self.spider.on_close()
- self.log.debug(f" engine stoped..")
+ # wait for some resources to be freed before logging the stop
await asyncio.sleep(0.15)
+ self.log.debug(f" engine stoped..")
def pause(self):
self.log.info(f" out called pause.. so engine will pause.. ")
diff --git a/smart/downloader.py b/smart/downloader.py
index 4be863f..ce16b70 100644
--- a/smart/downloader.py
+++ b/smart/downloader.py
@@ -29,31 +29,20 @@ def fetch(self, request: Request) -> Response:
pass
-# class RequestsDown(BaseDown):
-# def fetch(self, request: Request) -> Response:
-# import requests
-# res = requests.get(request.url,
-# timeout=request.timeout or 3,
-# )
-# response = Response(body=res.content, request=request,
-# headers=res.headers,
-# cookies=res.cookies,
-# status=res.status_code)
-# return response
-
-
class AioHttpDown(BaseDown):
async def fetch(self, request: Request) -> Response:
- async with aiohttp.ClientSession() as clicnt:
- resp = await clicnt.request(request.method,
- request.url,
- timeout=request.timeout or 10,
- headers=request.header or {},
- cookies=request.cookies or {},
- data=request.data or {},
- **request.extras or {}
- )
+ session = None
+ try:
+ session = request.session or aiohttp.ClientSession()
+ resp = await session.request(request.method,
+ request.url,
+ timeout=request.timeout or 10,
+ headers=request.header or {},
+ cookies=request.cookies or {},
+ data=request.data or {},
+ **request.extras or {}
+ )
byte_content = await resp.read()
headers = {}
if resp.headers:
@@ -63,6 +52,9 @@ async def fetch(self, request: Request) -> Response:
headers=headers,
cookies=resp.cookies
)
+ finally:
+ if request.session is None and session:
+ await session.close()
return response
@@ -78,6 +70,7 @@ def __init__(self, scheduler: Scheduler, middwire: Middleware = None, seq=100, d
# the component that actually fetches resources from the internet
self.downer = downer
self.log.info(f" downer loaded {self.downer.__class__.__name__}")
+
async def download(self, request: Request):
spider = request.__spider__
max_retry = spider.cutome_setting_dict.get("req_max_retry") or gloable_setting_dict.get(
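For illustration only (not part of the patch): the reworked AioHttpDown closes a session only when it created one itself, so a caller-supplied session can be reused across requests. A hedged sketch of that caller side, assuming the Request.session field added in the request.py hunk below; the URLs are placeholders.

    import asyncio
    import aiohttp

    from smart.downloader import AioHttpDown
    from smart.request import Request

    async def main():
        downer = AioHttpDown()
        async with aiohttp.ClientSession() as session:
            for url in ("http://httpbin.org/get", "http://httpbin.org/ip"):
                # the shared session is passed through and is NOT closed by fetch()
                resp = await downer.fetch(Request(url, session=session))
                print(url, len(resp.content or b""))

    asyncio.run(main())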
diff --git a/smart/field.py b/smart/field.py
index 14e77a7..78b0fb3 100644
--- a/smart/field.py
+++ b/smart/field.py
@@ -7,7 +7,6 @@
# ------------------------------------------------------------------
import json
import re
-from abc import abstractmethod, ABC
from typing import Union, Iterable, Callable, Any
import jsonpath
@@ -240,2091 +239,4 @@ def extract(self, html: Any):
if __name__ == '__main__':
- html = """
-
-
-
-
-
-
-
-武动乾坤小说_天蚕土豆_武动乾坤最新章节_武动乾坤无弹窗_新笔趣阁
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- """
- res = AttrField("href", css_select='#list > dl > dd:nth-child(1) > a').extract(html)
- print(res)
+ pass
diff --git a/smart/log.py b/smart/log.py
index a9a1230..1d6ad52 100644
--- a/smart/log.py
+++ b/smart/log.py
@@ -14,7 +14,7 @@
from smart.setting import gloable_setting_dict
LOG_FORMAT = "process %(process)d|thread %(threadName)s|%(asctime)s|%(filename)s|%(funcName)s|line:%(lineno)d|%(levelname)s: %(message)s"
-CONSOLE_LOG_FORMAT = "%(colorName)sprocess %(process)d|thread %(threadName)s|%(asctime)s|%(filename)s|%(funcName)s|line:%(lineno)d|%(levelname)s: %(message)s %(colorNameSuffix)s"
+CONSOLE_LOG_FORMAT = f"%(colorName)s{LOG_FORMAT} %(colorNameSuffix)s"
PRINT_EXCEPTION_DETAILS = True
@@ -78,10 +78,9 @@ class MyStreamHandler(logging.StreamHandler):
def emit(self, record):
if record.levelname in ["ERROR", "CRITICAL"]:
record.colorName = "\033[0;31m "
- record.colorNameSuffix = " \033[0m"
else:
record.colorName = "\033[0;34m "
- record.colorNameSuffix = " \033[0m"
+ record.colorNameSuffix = " \033[0m"
super().emit(record)
diff --git a/smart/request.py b/smart/request.py
index 0c0a1a6..f94ef51 100644
--- a/smart/request.py
+++ b/smart/request.py
@@ -5,8 +5,8 @@
# Date: 2020/12/21
# Desc: there is a python file description
# ------------------------------------------------------------------
-from dataclasses import dataclass, field, InitVar
-from typing import Callable
+from dataclasses import dataclass, InitVar
+from typing import Callable, Any
from smart.tool import is_valid_url
@@ -15,6 +15,7 @@
class Request:
url: InitVar[str]
callback: Callable = None
+ session: Any = None
method: str = 'get'
timeout: float = None
# if None will auto detect encoding
@@ -33,6 +34,10 @@ class Request:
_retry: int = 0
def __post_init__(self, url):
+ if url is None or url == '':
+ raise ValueError("request url can not be empty ")
+ if url and not (url.startswith("http") or url.startswith("ftp")):
+ url = "http://" + url
if is_valid_url(url):
self.url = url
else:
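For illustration only (not part of the patch): the __post_init__ change rejects empty urls and prepends http:// to scheme-less ones before validating. A quick sketch of the expected behaviour:

    from smart.request import Request

    r = Request("example.com/page")  # no scheme: becomes "http://example.com/page"
    print(r.url)

    try:
        Request("")  # empty urls now fail fast
    except ValueError as exc:
        print(exc)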
diff --git a/smart/response.py b/smart/response.py
index bd5a468..1d9ee55 100644
--- a/smart/response.py
+++ b/smart/response.py
@@ -84,12 +84,23 @@ def selector(self) -> Selector:
def content(self) -> bytes:
return self.body
+ @property
+ def content_type(self) -> Optional[str]:
+ if self.headers:
+ for key in self.headers.keys():
+ if "content_type" == key.lower():
+ return self.headers.get(key)
+ return None
+
@property
def text(self) -> Optional[str]:
if not self.body:
return None
# if request encoding is none and then auto detect encoding
self.request.encoding = self.encoding or cchardet.detect(self.body)["encoding"]
+ if self.request.encoding is None:
+ raise UnicodeDecodeError("unknown", self.body, 0, len(self.body),
+ "can not detect an encoding; the body may be binary data, or set request.encoding explicitly")
# minimum possible may be UnicodeDecodeError
return self.body.decode(self.encoding)
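For illustration only (not part of the patch): a sketch of how a spider callback might combine the new content_type property with the stricter text decoding so that binary bodies are never decoded; the callback shape mirrors the spider callbacks used elsewhere in the project.

    # method of a spider class (sketch)
    def parse(self, response):
        ctype = (response.content_type or "").lower()
        if ctype.startswith("text/") or "json" in ctype or "xml" in ctype:
            return response.text   # decoded via request.encoding or cchardet detection
        return response.content    # raw bytes for images, archives and other binary payloads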
diff --git a/smart/runer.py b/smart/runer.py
index b77c94b..b024d43 100644
--- a/smart/runer.py
+++ b/smart/runer.py
@@ -23,6 +23,14 @@
from smart.spider import Spider
from smart.tool import is_valid_url
+try:
+ # uvloop performance is better on linux..
+ import uvloop
+
+ asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
+except ImportError:
+ pass
+
class CrawStater:
__version = "0.1.0"
@@ -32,8 +40,6 @@ def __init__(self, loop=None):
# mitigates, to some extent, the "too many open files" error
loop = loop or asyncio.ProactorEventLoop()
else:
- # uvloop performance is better on linux..
- # todo use uvloop
self.loop = loop or asyncio.new_event_loop()
thread_pool_max_size = gloable_setting_dict.get(
"thread_pool_max_size", 30)
@@ -91,7 +97,6 @@ def run(self, spider_module: str, spider_names: List[str] = [], middlewire: Midd
self.spider_names.append(_spider.name)
self._run()
-
def stop(self):
self.log.info(f'warning stop be called, {",".join(self.spider_names)} will stop ')
for core in self.cores:
@@ -128,8 +133,7 @@ def _run(self):
except BaseException as e3:
self.log.error(f" in loop, occured BaseException e {e3} ", exc_info=True)
- self.log.info(f'craw succeed {",".join(self.spider_names)} ended.. it cost {round(time.time() - start,3)} s')
-
+ self.log.info(f'craw succeed {",".join(self.spider_names)} ended.. it cost {round(time.time() - start, 3)} s')
def _print_info(self):
self.log.info("good luck!")
@@ -148,9 +152,8 @@ def _print_info(self):
)
self.log.info(" \r\n smart-spider-framework"
f"\r\n os: {sys.platform}"
- " \r\n author: liangbaikai"
- " \r\n emial:1144388620@qq.com"
- " \r\n version: 0.1.0"
+ " \r\n author: liangbaikai<1144388620@qq.com>"
+ f" \r\n version: {self.__version}"
" \r\n proverbs: whatever is worth doing is worth doing well."
)
diff --git a/smart/setting.py b/smart/setting.py
index 15f0af3..7eb2920 100644
--- a/smart/setting.py
+++ b/smart/setting.py
@@ -17,6 +17,10 @@
"req_max_retry": 3,
# 默认请求头
"default_headers": {
+ "Accept": "*/*;",
+ "Accept-Encoding": "gzip, deflate",
+ "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
+ # Baidu search-engine crawler UA
"user-agent": "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)"
},
# 请求url 去重处理器
@@ -32,11 +36,11 @@
"thread_pool_max_size": 50,
# 根据响应的状态码 忽略以下响应
"ignore_response_codes": [401, 403, 404, 405, 500, 502, 504],
- # URL used to check that the network is reachable
+ # URL used to check, at startup, that the network is reachable
"net_healthy_check_url": "https://www.baidu.com",
# log level
"log_level": "info",
"log_name": "smart-spider",
- "log_path": "D://test//smart.log",
+ "log_path": ".logs/smart.log",
"is_write_to_file": False,
}
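For illustration only (not part of the patch): gloable_setting_dict (the project's spelling) is a plain module-level dict, so these defaults can be adjusted before the crawler starts; the key names below are the ones shown in the hunk above.

    from smart.setting import gloable_setting_dict

    gloable_setting_dict["log_level"] = "debug"
    gloable_setting_dict["is_write_to_file"] = True
    gloable_setting_dict["default_headers"]["user-agent"] = "my-crawler/0.1"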
diff --git a/smart/tool.py b/smart/tool.py
index b56dc90..3002fae 100644
--- a/smart/tool.py
+++ b/smart/tool.py
@@ -1,573 +1,20 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2018-09-06 14:21
----------
-@summary: 工具
----------
-@author: Boris
-@email: boris@bzkj.tech
-"""
-import calendar
-import codecs
-import configparser # 读配置文件的
-import datetime
-import functools
-import hashlib
-import html
-import json
-import os
-import pickle
-import random
import re
import socket
-import ssl
-import string
-import sys
-import time
-import traceback
import urllib
-import urllib.parse
-import uuid
-from hashlib import md5
-from pprint import pformat
-from pprint import pprint
-from urllib import request
-from urllib.parse import urljoin
-# import execjs # pip install PyExecJS
-# import redis
-import requests
-import six
-from requests.cookies import RequestsCookieJar
-from w3lib.url import canonicalize_url as sort_url
+RE_COMPILE = re.compile(r"(^https?:/{2}\w.+$)|(ftp://)")
-# import spider.setting as setting
-from smart.log import log
-os.environ["EXECJS_RUNTIME"] = "Node" # 设置使用node执行js
-
-# 全局取消ssl证书验证
-ssl._create_default_https_context = ssl._create_unverified_context
-
-TIME_OUT = 30
-TIMER_TIME = 5
-
-redisdb = None
-
-CAMELCASE_INVALID_CHARS = re.compile(r'[^a-zA-Z\d]')
-
-
-# def get_redisdb():
-# global redisdb
-# if not redisdb:
-# ip, port = setting.REDISDB_IP_PORTS.split(":")
-# redisdb = redis.Redis(
-# host=ip,
-# port=port,
-# db=setting.REDISDB_DB,
-# password=setting.REDISDB_USER_PASS,
-# decode_responses=True,
-# ) # redis默认端口是6379
-# return redisdb
-
-
-# 装饰器
-def log_function_time(func):
- try:
-
- @functools.wraps(func) # 将函数的原来属性付给新函数
- def calculate_time(*args, **kw):
- began_time = time.time()
- callfunc = func(*args, **kw)
- end_time = time.time()
- log.debug(func.__name__ + " run time = " + str(end_time - began_time))
- return callfunc
-
- return calculate_time
- except:
- log.debug("求取时间无效 因为函数参数不符")
- return func
-
-
-def run_safe_model(module_name):
- def inner_run_safe_model(func):
- try:
-
- @functools.wraps(func) # 将函数的原来属性付给新函数
- def run_func(*args, **kw):
- callfunc = None
- try:
- callfunc = func(*args, **kw)
- except Exception as e:
- log.error(module_name + ": " + func.__name__ + " - " + str(e))
- traceback.print_exc()
- return callfunc
-
- return run_func
- except Exception as e:
- log.error(module_name + ": " + func.__name__ + " - " + str(e))
- traceback.print_exc()
- return func
-
- return inner_run_safe_model
-
-
-########################【网页解析相关】###############################
-
-
-# @log_function_time
-def get_html_by_requests(
- url, headers=None, code="utf-8", data=None, proxies={}, with_response=False
-):
- html = ""
- r = None
- try:
- if data:
- r = requests.post(
- url, headers=headers, timeout=TIME_OUT, data=data, proxies=proxies
- )
- else:
- r = requests.get(url, headers=headers, timeout=TIME_OUT, proxies=proxies)
-
- if code:
- r.encoding = code
- html = r.text
-
- except Exception as e:
- log.error(e)
- finally:
- r and r.close()
-
- if with_response:
- return html, r
- else:
- return html
-
-
-def get_json_by_requests(
- url,
- params=None,
- headers=None,
- data=None,
- proxies={},
- with_response=False,
- cookies=None,
-):
- json = {}
- response = None
- try:
- # response = requests.get(url, params = params)
- if data:
- response = requests.post(
- url,
- headers=headers,
- data=data,
- params=params,
- timeout=TIME_OUT,
- proxies=proxies,
- cookies=cookies,
- )
- else:
- response = requests.get(
- url,
- headers=headers,
- params=params,
- timeout=TIME_OUT,
- proxies=proxies,
- cookies=cookies,
- )
- response.encoding = "utf-8"
- json = response.json()
- except Exception as e:
- log.error(e)
- finally:
- response and response.close()
-
- if with_response:
- return json, response
- else:
- return json
-
-
-def get_cookies(response):
- cookies = requests.utils.dict_from_cookiejar(response.cookies)
- return cookies
-
-
-def get_cookies_jar(cookies):
- """
- @summary: 适用于selenium生成的cookies转requests的cookies
- requests.get(xxx, cookies=jar)
- 参考:https://www.cnblogs.com/small-bud/p/9064674.html
-
- ---------
- @param cookies: [{},{}]
- ---------
- @result: cookie jar
- """
-
- cookie_jar = RequestsCookieJar()
- for cookie in cookies:
- cookie_jar.set(cookie["name"], cookie["value"])
-
- return cookie_jar
-
-
-def get_cookies_from_selenium_cookie(cookies):
- """
- @summary: 适用于selenium生成的cookies转requests的cookies
- requests.get(xxx, cookies=jar)
- 参考:https://www.cnblogs.com/small-bud/p/9064674.html
-
- ---------
- @param cookies: [{},{}]
- ---------
- @result: cookie jar
- """
-
- cookie_dict = {}
- for cookie in cookies:
- if cookie.get("name"):
- cookie_dict[cookie["name"]] = cookie["value"]
-
- return cookie_dict
-
-
-def cookiesjar2str(cookies):
- str_cookie = ""
- for k, v in requests.utils.dict_from_cookiejar(cookies).items():
- str_cookie += k
- str_cookie += "="
- str_cookie += v
- str_cookie += "; "
- return str_cookie
-
-
-def cookies2str(cookies):
- str_cookie = ""
- for k, v in cookies.items():
- str_cookie += k
- str_cookie += "="
- str_cookie += v
- str_cookie += "; "
- return str_cookie
-
-
-def get_urls(
- html,
- stop_urls=(
- "javascript",
- "+",
- ".css",
- ".js",
- ".rar",
- ".xls",
- ".exe",
- ".apk",
- ".doc",
- ".jpg",
- ".png",
- ".flv",
- ".mp4",
- ),
-):
- # 不匹配javascript、 +、 # 这样的url
- regex = r'...'  # (the pattern and the lines from here down to string_camelcase were garbled in extraction)
-
-def string_camelcase(string):
- """
- >>> string_camelcase('lost-pound')
- 'LostPound'
-
- >>> string_camelcase('missing_images')
- 'MissingImages'
-
- """
- return CAMELCASE_INVALID_CHARS.sub('', string.title())
-
-
-def get_full_url(root_url, sub_url):
- """
- @summary: 得到完整的ur
- ---------
- @param root_url: 根url (网页的url)
- @param sub_url: 子url (带有相对路径的 可以拼接成完整的)
- ---------
- @result: 返回完整的url
- """
-
- return urljoin(root_url, sub_url)
-
-
-def joint_url(url, params):
- # param_str = "?"
- # for key, value in params.items():
- # value = isinstance(value, str) and value or str(value)
- # param_str += key + "=" + value + "&"
- #
- # return url + param_str[:-1]
-
- if not params:
- return url
-
- params = urlencode(params)
- separator = "?" if "?" not in url else "&"
- return url + separator + params
-
-
-def canonicalize_url(url):
- """
- url 归一化 会参数排序 及去掉锚点
- """
- return sort_url(url)
-
-
-def get_url_md5(url):
- url = canonicalize_url(url)
- url = re.sub("^http://", "https://", url)
- return get_md5(url)
-
-
-def fit_url(urls, identis):
- identis = isinstance(identis, str) and [identis] or identis
- fit_urls = []
- for link in urls:
- for identi in identis:
- if identi in link:
- fit_urls.append(link)
- return list(set(fit_urls))
-
-
-def get_param(url, key):
- params = url.split("?")[-1].split("&")
- for param in params:
- key_value = param.split("=", 1)
- if key == key_value[0]:
- return key_value[1]
- return None
-
-
-def urlencode(params):
- """
- 字典类型的参数转为字符串
- @param params:
- {
- 'a': 1,
- 'b': 2
- }
- @return: a=1&b=2
- """
- return urllib.parse.urlencode(params)
-
-
-def urldecode(url):
- """
- 将字符串类型的参数转为json
- @param url: xxx?a=1&b=2
- @return:
- {
- 'a': 1,
- 'b': 2
- }
- """
- params_json = {}
- params = url.split("?")[-1].split("&")
- for param in params:
- key, value = param.split("=")
- params_json[key] = unquote_url(value)
-
- return params_json
-
-
-def unquote_url(url, encoding="utf-8"):
- """
- @summary: 将url解码
- ---------
- @param url:
- ---------
- @result:
- """
-
- return urllib.parse.unquote(url, encoding=encoding)
-
-
-def quote_url(url, encoding="utf-8"):
- """
- @summary: 将url编码 编码意思http://www.w3school.com.cn/tags/html_ref_urlencode.html
- ---------
- @param url:
- ---------
- @result:
- """
-
- return urllib.parse.quote(url, safe="%;/?:@&=+$,", encoding=encoding)
-
-
-def quote_chinese_word(text, encoding="utf-8"):
- def quote_chinese_word_func(text):
- chinese_word = text.group(0)
- return urllib.parse.quote(chinese_word, encoding=encoding)
-
- return re.sub("([\u4e00-\u9fa5]+)", quote_chinese_word_func, text, flags=re.S)
-
-
-def unescape(str):
- """
- 反转译
- """
- return html.unescape(str)
-
-
-def excape(str):
- """
- 转译
- """
- return html.escape(str)
-
-
-_regexs = {}
-
-
-# @log_function_time
-def get_info(html, regexs, allow_repeat=True, fetch_one=False, split=None):
- regexs = isinstance(regexs, str) and [regexs] or regexs
-
- infos = []
- for regex in regexs:
- if regex == "":
- continue
-
- if regex not in _regexs.keys():
- _regexs[regex] = re.compile(regex, re.S)
-
- if fetch_one:
- infos = _regexs[regex].search(html)
- if infos:
- infos = infos.groups()
- else:
- continue
- else:
- infos = _regexs[regex].findall(str(html))
-
- if len(infos) > 0:
- # print(regex)
- break
-
- if fetch_one:
- infos = infos if infos else ("",)
- return infos if len(infos) > 1 else infos[0]
- else:
- infos = allow_repeat and infos or sorted(set(infos), key=infos.index)
- infos = split.join(infos) if split else infos
- return infos
-
-
-def table_json(table, save_one_blank=True):
- """
- 将表格转为json 适应于 key:value 在一行类的表格
- @param table: 使用selector封装后的具有xpath的selector
- @param save_one_blank: 保留一个空白符
- @return:
- """
- data = {}
-
- trs = table.xpath(".//tr")
- for tr in trs:
- tds = tr.xpath("./td|./th")
-
- for i in range(0, len(tds), 2):
- if i + 1 > len(tds) - 1:
- break
-
- key = tds[i].xpath("string(.)").extract_first(default="").strip()
- value = tds[i + 1].xpath("string(.)").extract_first(default="").strip()
- value = replace_str(value, "[\f\n\r\t\v]", "")
- value = replace_str(value, " +", " " if save_one_blank else "")
-
- if key:
- data[key] = value
-
- return data
-
-
-def get_table_row_data(table):
- """
- 获取表格里每一行数据
- @param table: 使用selector封装后的具有xpath的selector
- @return: [[],[]..]
- """
-
- datas = []
- rows = table.xpath(".//tr")
- for row in rows:
- cols = row.xpath("./td|./th")
- row_datas = []
- for col in cols:
- data = col.xpath("string(.)").extract_first(default="").strip()
- row_datas.append(data)
- datas.append(row_datas)
-
- return datas
-
-
-def rows2json(rows, keys=None):
- """
- 将行数据转为json
- @param rows: 每一行的数据
- @param keys: json的key,空时将rows的第一行作为key
- @return:
- """
- data_start_pos = 0 if keys else 1
- datas = []
- keys = keys or rows[0]
- for values in rows[data_start_pos:]:
- datas.append(dict(zip(keys, values)))
-
- return datas
-
-
-def get_form_data(form):
+def is_valid_url(url):
"""
- 提取form中提交的数据
- :param form: 使用selector封装后的具有xpath的selector
+ Check whether the url is valid
+ :param url:
:return:
"""
- data = {}
- inputs = form.xpath(".//input")
- for input in inputs:
- name = input.xpath("./@name").extract_first()
- value = input.xpath("./@value").extract_first()
- if name:
- data[name] = value
-
- return data
-
-
-# mac上不好使
-# def get_domain(url):
-# domain = ''
-# try:
-# domain = get_tld(url)
-# except Exception as e:
-# log.debug(e)
-# return domain
+ if RE_COMPILE.match(url):
+ return True
+ else:
+ return False
def get_domain(url):
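For illustration only (not part of the patch): quick checks of the slimmed-down validator, with results following from the RE_COMPILE pattern above.

    from smart.tool import is_valid_url

    print(is_valid_url("http://example.com"))   # True
    print(is_valid_url("ftp://host/file.txt"))  # True  (the second alternative only needs the ftp:// prefix)
    print(is_valid_url("example.com"))          # False (no scheme; Request.__post_init__ now prepends http://)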
@@ -601,1595 +48,3 @@ def get_localhost_ip():
s.close()
return ip
-
-
-def ip_to_num(ip):
- import struct
-
- ip_num = socket.ntohl(struct.unpack("I", socket.inet_aton(str(ip)))[0])
- return ip_num
-
-
-def is_valid_proxy(proxy, check_url=None):
- """
- 检验代理是否有效
- @param proxy: xxx.xxx.xxx:xxx
- @param check_url: 利用目标网站检查,目标网站url。默认为None, 使用代理服务器的socket检查, 但不能排除Connection closed by foreign host
- @return: True / False
- """
- is_valid = False
-
- if check_url:
- proxies = {"http": f"http://{proxy}", "https": f"https://{proxy}"}
- headers = {
- "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36"
- }
- response = None
- try:
- response = requests.get(
- check_url, headers=headers, proxies=proxies, stream=True, timeout=20
- )
- is_valid = True
-
- except Exception as e:
- log.error("check proxy failed: {} {}".format(e, proxy))
-
- finally:
- if response:
- response.close()
-
- else:
- ip, port = proxy.split(":")
- with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sk:
- sk.settimeout(7)
- try:
- sk.connect((ip, int(port))) # 检查代理服务器是否开着
- is_valid = True
-
- except Exception as e:
- log.error("check proxy failed: {} {}:{}".format(e, ip, port))
-
- return is_valid
-
-
-def is_valid_url(url):
- """
- 验证url是否合法
- :param url:
- :return:
- """
- if re.match(r"(^https?:/{2}\w.+$)|(ftp://)", url):
- return True
- else:
- return False
-
-
-def get_text(soup, *args):
- try:
- return soup.get_text()
- except Exception as e:
- log.error(e)
- return ""
-
-
-def del_html_tag(content, except_line_break=False, save_img=False, white_replaced=""):
- """
- 删除html标签
- @param content: html内容
- @param except_line_break: 保留p标签
- @param save_img: 保留图片
- @param white_replaced: 空白符替换
- @return:
- """
- content = replace_str(content, "(?i)") # (?)忽略大小写
- content = replace_str(content, "(?i)