修炼一途,乃窃阴阳,夺造化,转涅盘,握生死,掌轮回。 + 武之极,破苍穹,动乾坤! + 新书求收藏,求推荐,谢大家o(n_n)o~ +
+From 4e3fdca97327a3ccc10c5864624da4932f8eb975 Mon Sep 17 00:00:00 2001 From: liqiang <1144388620@qq.com> Date: Mon, 11 Jan 2021 11:17:07 +0800 Subject: [PATCH] init --- .gitignore | 133 +++ LICENSE | 201 ++++ Pipfile | 33 + Pipfile.lock | 278 +++++ launcher.py | 65 ++ readme.md | 7 + smart/__init__.py | 7 + smart/core.py | 229 ++++ smart/downloader.py | 189 ++++ smart/field.py | 2330 +++++++++++++++++++++++++++++++++++++++ smart/item.py | 196 ++++ smart/log.py | 194 ++++ smart/middlewire.py | 77 ++ smart/pipline.py | 44 + smart/request.py | 51 + smart/response.py | 110 ++ smart/runer.py | 170 +++ smart/scheduler.py | 99 ++ smart/setting.py | 42 + smart/spider.py | 69 ++ smart/tool.py | 2195 ++++++++++++++++++++++++++++++++++++ spiders/db/__init__.py | 8 + spiders/db/sanicdb.py | 176 +++ spiders/govs.py | 47 + spiders/ipspider.py | 21 + spiders/ipspider2.py | 131 +++ spiders/js/__init__.py | 8 + spiders/js/js_spider.py | 56 + spiders/json_spider.py | 36 + test/__init__.py | 38 + test/db_test.py | 73 ++ test/ruia_test.py | 73 ++ test/test.html | 304 +++++ test/uni_test.py | 59 + test/web.py | 55 + 35 files changed, 7804 insertions(+) create mode 100644 .gitignore create mode 100644 LICENSE create mode 100644 Pipfile create mode 100644 Pipfile.lock create mode 100644 launcher.py create mode 100644 readme.md create mode 100644 smart/__init__.py create mode 100644 smart/core.py create mode 100644 smart/downloader.py create mode 100644 smart/field.py create mode 100644 smart/item.py create mode 100644 smart/log.py create mode 100644 smart/middlewire.py create mode 100644 smart/pipline.py create mode 100644 smart/request.py create mode 100644 smart/response.py create mode 100644 smart/runer.py create mode 100644 smart/scheduler.py create mode 100644 smart/setting.py create mode 100644 smart/spider.py create mode 100644 smart/tool.py create mode 100644 spiders/db/__init__.py create mode 100644 spiders/db/sanicdb.py create mode 100644 spiders/govs.py create mode 100644 spiders/ipspider.py create mode 100644 spiders/ipspider2.py create mode 100644 spiders/js/__init__.py create mode 100644 spiders/js/js_spider.py create mode 100644 spiders/json_spider.py create mode 100644 test/__init__.py create mode 100644 test/db_test.py create mode 100644 test/ruia_test.py create mode 100644 test/test.html create mode 100644 test/uni_test.py create mode 100644 test/web.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..2de55fd --- /dev/null +++ b/.gitignore @@ -0,0 +1,133 @@ +# Byte-compiled / optimized / DLL files +.idea/ +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +pip-wheel-metadata/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +.python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +.vscode +mydoc \ No newline at end of file diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..261eeb9 --- /dev/null +++ b/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/Pipfile b/Pipfile new file mode 100644 index 0000000..d815e4b --- /dev/null +++ b/Pipfile @@ -0,0 +1,33 @@ +[[source]] +name = "pypi" +url = "https://mirrors.163.com/pypi/simple/" +verify_ssl = true + +[dev-packages] +#mypy = "*" +#fastapi = "*" +#uvicorn = "*" +#jinja2 = "*" +pytest = "*" + +[packages] +aiohttp = "*" +lxml = "*" +#bitarray = "*" +requests = "*" +fastapi = "*" +uvicorn = {extras = ["standard"],version = "*"} +python-multipart = "*" +ruia = "*" +ruia-ua = "*" +jsonpath = "*" +parsel = "*" +pytest = "*" +pyppeteer = "*" +pymysql = "*" +aiomysql = "*" +mkdocs = "*" +cchardet = "*" + +[requires] +python_version = "3.7" diff --git a/Pipfile.lock b/Pipfile.lock new file mode 100644 index 0000000..6efad0e --- /dev/null +++ b/Pipfile.lock @@ -0,0 +1,278 @@ +{ + "_meta": { + "hash": { + "sha256": "5154b1655506994ecfe02d649e521625a8e07831b60a5bfcba75e82c54d25212" + }, + "pipfile-spec": 6, + "requires": { + "python_version": "3.7" + }, + "sources": [ + { + "name": "pypi", + "url": "https://mirrors.163.com/pypi/simple/", + "verify_ssl": true + } + ] + }, + "default": { + "aiohttp": { + "hashes": [ + "sha256:1e984191d1ec186881ffaed4581092ba04f7c61582a177b187d3a2f07ed9719e", + "sha256:259ab809ff0727d0e834ac5e8a283dc5e3e0ecc30c4d80b3cd17a4139ce1f326", + "sha256:2f4d1a4fdce595c947162333353d4a44952a724fba9ca3205a3df99a33d1307a", + "sha256:32e5f3b7e511aa850829fbe5aa32eb455e5534eaa4b1ce93231d00e2f76e5654", + "sha256:344c780466b73095a72c616fac5ea9c4665add7fc129f285fbdbca3cccf4612a", + "sha256:460bd4237d2dbecc3b5ed57e122992f60188afe46e7319116da5eb8a9dfedba4", + "sha256:4c6efd824d44ae697814a2a85604d8e992b875462c6655da161ff18fd4f29f17", + "sha256:50aaad128e6ac62e7bf7bd1f0c0a24bc968a0c0590a726d5a955af193544bcec", + "sha256:6206a135d072f88da3e71cc501c59d5abffa9d0bb43269a6dcd28d66bfafdbdd", + "sha256:65f31b622af739a802ca6fd1a3076fd0ae523f8485c52924a89561ba10c49b48", + "sha256:ae55bac364c405caa23a4f2d6cfecc6a0daada500274ffca4a9230e7129eac59", + "sha256:b778ce0c909a2653741cb4b1ac7015b5c130ab9c897611df43ae6a58523cb965" + ], + "index": "pypi", + "version": "==3.6.2" + }, + "async-timeout": { + "hashes": [ + "sha256:0c3c816a028d47f659d6ff5c745cb2acf1f966da1fe5c19c77a70282b25f4c5f", + "sha256:4291ca197d287d274d0b6cb5d6f8f8f82d434ed288f962539ff18cc9012f9ea3" + ], + "version": "==3.0.1" + }, + "attrs": { + "hashes": [ + "sha256:08a96c641c3a74e44eb59afb61a24f2cb9f4d7188748e76ba4bb5edfa3cb7d1c", + "sha256:f7b7ce16570fe9965acd6d30101a28f62fb4a7f9e926b3bbc9b61f8b04247e72" + ], + "version": "==19.3.0" + }, + "certifi": { + "hashes": [ + "sha256:1a4995114262bffbc2413b159f2a1a480c969de6e6eb13ee966d470af86af59c", + "sha256:719a74fb9e33b9bd44cc7f3a8d94bc35e4049deebe19ba7d8e108280cfd59830" + ], + "version": "==2020.12.5" + }, + "chardet": { + "hashes": [ + "sha256:84ab92ed1c4d4f16916e05906b6b75a6c0fb5db821cc65e70cbd64a3e2a5eaae", + "sha256:fc323ffcaeaed0e0a02bf4d117757b98aed530d9ed4531e3e15460124c106691" + ], + "version": "==3.0.4" + }, + "idna": { + "hashes": [ + "sha256:7588d1c14ae4c77d74036e8c22ff447b26d0fde8f007354fd48a7814db15b7cb", + "sha256:a068a21ceac8a4d63dbfd964670474107f541babbd2250d61922f029858365fa" + ], + "version": "==2.9" + }, + "lxml": { + "hashes": [ + "sha256:05a444b207901a68a6526948c7cc8f9fe6d6f24c70781488e32fd74ff5996e3f", + "sha256:08fc93257dcfe9542c0a6883a25ba4971d78297f63d7a5a26ffa34861ca78730", + "sha256:107781b213cf7201ec3806555657ccda67b1fccc4261fb889ef7fc56976db81f", + "sha256:121b665b04083a1e85ff1f5243d4a93aa1aaba281bc12ea334d5a187278ceaf1", + "sha256:1fa21263c3aba2b76fd7c45713d4428dbcc7644d73dcf0650e9d344e433741b3", + "sha256:2b30aa2bcff8e958cd85d907d5109820b01ac511eae5b460803430a7404e34d7", + "sha256:4b4a111bcf4b9c948e020fd207f915c24a6de3f1adc7682a2d92660eb4e84f1a", + "sha256:5591c4164755778e29e69b86e425880f852464a21c7bb53c7ea453bbe2633bbe", + "sha256:59daa84aef650b11bccd18f99f64bfe44b9f14a08a28259959d33676554065a1", + "sha256:5a9c8d11aa2c8f8b6043d845927a51eb9102eb558e3f936df494e96393f5fd3e", + "sha256:5dd20538a60c4cc9a077d3b715bb42307239fcd25ef1ca7286775f95e9e9a46d", + "sha256:74f48ec98430e06c1fa8949b49ebdd8d27ceb9df8d3d1c92e1fdc2773f003f20", + "sha256:786aad2aa20de3dbff21aab86b2fb6a7be68064cbbc0219bde414d3a30aa47ae", + "sha256:7ad7906e098ccd30d8f7068030a0b16668ab8aa5cda6fcd5146d8d20cbaa71b5", + "sha256:80a38b188d20c0524fe8959c8ce770a8fdf0e617c6912d23fc97c68301bb9aba", + "sha256:8f0ec6b9b3832e0bd1d57af41f9238ea7709bbd7271f639024f2fc9d3bb01293", + "sha256:92282c83547a9add85ad658143c76a64a8d339028926d7dc1998ca029c88ea6a", + "sha256:94150231f1e90c9595ccc80d7d2006c61f90a5995db82bccbca7944fd457f0f6", + "sha256:9dc9006dcc47e00a8a6a029eb035c8f696ad38e40a27d073a003d7d1443f5d88", + "sha256:a76979f728dd845655026ab991df25d26379a1a8fc1e9e68e25c7eda43004bed", + "sha256:aa8eba3db3d8761db161003e2d0586608092e217151d7458206e243be5a43843", + "sha256:bea760a63ce9bba566c23f726d72b3c0250e2fa2569909e2d83cda1534c79443", + "sha256:c3f511a3c58676147c277eff0224c061dd5a6a8e1373572ac817ac6324f1b1e0", + "sha256:c9d317efde4bafbc1561509bfa8a23c5cab66c44d49ab5b63ff690f5159b2304", + "sha256:cc411ad324a4486b142c41d9b2b6a722c534096963688d879ea6fa8a35028258", + "sha256:cdc13a1682b2a6241080745b1953719e7fe0850b40a5c71ca574f090a1391df6", + "sha256:cfd7c5dd3c35c19cec59c63df9571c67c6d6e5c92e0fe63517920e97f61106d1", + "sha256:e1cacf4796b20865789083252186ce9dc6cc59eca0c2e79cca332bdff24ac481", + "sha256:e70d4e467e243455492f5de463b72151cc400710ac03a0678206a5f27e79ddef", + "sha256:ecc930ae559ea8a43377e8b60ca6f8d61ac532fc57efb915d899de4a67928efd", + "sha256:f161af26f596131b63b236372e4ce40f3167c1b5b5d459b29d2514bd8c9dc9ee" + ], + "index": "pypi", + "version": "==4.5.2" + }, + "multidict": { + "hashes": [ + "sha256:1ece5a3369835c20ed57adadc663400b5525904e53bae59ec854a5d36b39b21a", + "sha256:275ca32383bc5d1894b6975bb4ca6a7ff16ab76fa622967625baeebcf8079000", + "sha256:3750f2205b800aac4bb03b5ae48025a64e474d2c6cc79547988ba1d4122a09e2", + "sha256:4538273208e7294b2659b1602490f4ed3ab1c8cf9dbdd817e0e9db8e64be2507", + "sha256:5141c13374e6b25fe6bf092052ab55c0c03d21bd66c94a0e3ae371d3e4d865a5", + "sha256:51a4d210404ac61d32dada00a50ea7ba412e6ea945bbe992e4d7a595276d2ec7", + "sha256:5cf311a0f5ef80fe73e4f4c0f0998ec08f954a6ec72b746f3c179e37de1d210d", + "sha256:6513728873f4326999429a8b00fc7ceddb2509b01d5fd3f3be7881a257b8d463", + "sha256:7388d2ef3c55a8ba80da62ecfafa06a1c097c18032a501ffd4cabbc52d7f2b19", + "sha256:9456e90649005ad40558f4cf51dbb842e32807df75146c6d940b6f5abb4a78f3", + "sha256:c026fe9a05130e44157b98fea3ab12969e5b60691a276150db9eda71710cd10b", + "sha256:d14842362ed4cf63751648e7672f7174c9818459d169231d03c56e84daf90b7c", + "sha256:e0d072ae0f2a179c375f67e3da300b47e1a83293c554450b29c900e50afaae87", + "sha256:f07acae137b71af3bb548bd8da720956a3bc9f9a0b87733e0899226a2317aeb7", + "sha256:fbb77a75e529021e7c4a8d4e823d88ef4d23674a202be4f5addffc72cbb91430", + "sha256:fcfbb44c59af3f8ea984de67ec7c306f618a3ec771c2843804069917a8f2e255", + "sha256:feed85993dbdb1dbc29102f50bca65bdc68f2c0c8d352468c25b54874f23c39d" + ], + "version": "==4.7.6" + }, + "requests": { + "hashes": [ + "sha256:43999036bfa82904b6af1d99e4882b560e5e2c68e5c4b0aa03b655f3d7d73fee", + "sha256:b3f43d496c6daba4493e7c431722aeb7dbc6288f52a6e04e7b6023b0247817e6" + ], + "index": "pypi", + "version": "==2.23.0" + }, + "urllib3": { + "hashes": [ + "sha256:3018294ebefce6572a474f0604c2021e33b3fd8006ecd11d62107a5d2a963527", + "sha256:88206b0eb87e6d677d424843ac5209e3fb9d0190d0ee169599165ec25e9d9115" + ], + "version": "==1.25.9" + }, + "yarl": { + "hashes": [ + "sha256:0c2ab325d33f1b824734b3ef51d4d54a54e0e7a23d13b86974507602334c2cce", + "sha256:0ca2f395591bbd85ddd50a82eb1fde9c1066fafe888c5c7cc1d810cf03fd3cc6", + "sha256:2098a4b4b9d75ee352807a95cdf5f10180db903bc5b7270715c6bbe2551f64ce", + "sha256:25e66e5e2007c7a39541ca13b559cd8ebc2ad8fe00ea94a2aad28a9b1e44e5ae", + "sha256:26d7c90cb04dee1665282a5d1a998defc1a9e012fdca0f33396f81508f49696d", + "sha256:308b98b0c8cd1dfef1a0311dc5e38ae8f9b58349226aa0533f15a16717ad702f", + "sha256:3ce3d4f7c6b69c4e4f0704b32eca8123b9c58ae91af740481aa57d7857b5e41b", + "sha256:58cd9c469eced558cd81aa3f484b2924e8897049e06889e8ff2510435b7ef74b", + "sha256:5b10eb0e7f044cf0b035112446b26a3a2946bca9d7d7edb5e54a2ad2f6652abb", + "sha256:6faa19d3824c21bcbfdfce5171e193c8b4ddafdf0ac3f129ccf0cdfcb083e462", + "sha256:944494be42fa630134bf907714d40207e646fd5a94423c90d5b514f7b0713fea", + "sha256:a161de7e50224e8e3de6e184707476b5a989037dcb24292b391a3d66ff158e70", + "sha256:a4844ebb2be14768f7994f2017f70aca39d658a96c786211be5ddbe1c68794c1", + "sha256:c2b509ac3d4b988ae8769901c66345425e361d518aecbe4acbfc2567e416626a", + "sha256:c9959d49a77b0e07559e579f38b2f3711c2b8716b8410b320bf9713013215a1b", + "sha256:d8cdee92bc930d8b09d8bd2043cedd544d9c8bd7436a77678dd602467a993080", + "sha256:e15199cdb423316e15f108f51249e44eb156ae5dba232cb73be555324a1d49c2" + ], + "version": "==1.4.2" + } + }, + "develop": { + "jinja2": { + "hashes": [ + "sha256:89aab215427ef59c34ad58735269eb58b1a5808103067f7bb9d5836c651b3bb0", + "sha256:f0a4641d3cf955324a89c04f3d94663aa4d638abe8f733ecd3582848e1c37035" + ], + "index": "pypi", + "version": "==2.11.2" + }, + "markupsafe": { + "hashes": [ + "sha256:00bc623926325b26bb9605ae9eae8a215691f33cae5df11ca5424f06f2d1f473", + "sha256:09027a7803a62ca78792ad89403b1b7a73a01c8cb65909cd876f7fcebd79b161", + "sha256:09c4b7f37d6c648cb13f9230d847adf22f8171b1ccc4d5682398e77f40309235", + "sha256:1027c282dad077d0bae18be6794e6b6b8c91d58ed8a8d89a89d59693b9131db5", + "sha256:13d3144e1e340870b25e7b10b98d779608c02016d5184cfb9927a9f10c689f42", + "sha256:24982cc2533820871eba85ba648cd53d8623687ff11cbb805be4ff7b4c971aff", + "sha256:29872e92839765e546828bb7754a68c418d927cd064fd4708fab9fe9c8bb116b", + "sha256:43a55c2930bbc139570ac2452adf3d70cdbb3cfe5912c71cdce1c2c6bbd9c5d1", + "sha256:46c99d2de99945ec5cb54f23c8cd5689f6d7177305ebff350a58ce5f8de1669e", + "sha256:500d4957e52ddc3351cabf489e79c91c17f6e0899158447047588650b5e69183", + "sha256:535f6fc4d397c1563d08b88e485c3496cf5784e927af890fb3c3aac7f933ec66", + "sha256:596510de112c685489095da617b5bcbbac7dd6384aeebeda4df6025d0256a81b", + "sha256:62fe6c95e3ec8a7fad637b7f3d372c15ec1caa01ab47926cfdf7a75b40e0eac1", + "sha256:6788b695d50a51edb699cb55e35487e430fa21f1ed838122d722e0ff0ac5ba15", + "sha256:6dd73240d2af64df90aa7c4e7481e23825ea70af4b4922f8ede5b9e35f78a3b1", + "sha256:717ba8fe3ae9cc0006d7c451f0bb265ee07739daf76355d06366154ee68d221e", + "sha256:79855e1c5b8da654cf486b830bd42c06e8780cea587384cf6545b7d9ac013a0b", + "sha256:7c1699dfe0cf8ff607dbdcc1e9b9af1755371f92a68f706051cc8c37d447c905", + "sha256:88e5fcfb52ee7b911e8bb6d6aa2fd21fbecc674eadd44118a9cc3863f938e735", + "sha256:8defac2f2ccd6805ebf65f5eeb132adcf2ab57aa11fdf4c0dd5169a004710e7d", + "sha256:98c7086708b163d425c67c7a91bad6e466bb99d797aa64f965e9d25c12111a5e", + "sha256:9add70b36c5666a2ed02b43b335fe19002ee5235efd4b8a89bfcf9005bebac0d", + "sha256:9bf40443012702a1d2070043cb6291650a0841ece432556f784f004937f0f32c", + "sha256:ade5e387d2ad0d7ebf59146cc00c8044acbd863725f887353a10df825fc8ae21", + "sha256:b00c1de48212e4cc9603895652c5c410df699856a2853135b3967591e4beebc2", + "sha256:b1282f8c00509d99fef04d8ba936b156d419be841854fe901d8ae224c59f0be5", + "sha256:b2051432115498d3562c084a49bba65d97cf251f5a331c64a12ee7e04dacc51b", + "sha256:ba59edeaa2fc6114428f1637ffff42da1e311e29382d81b339c1817d37ec93c6", + "sha256:c8716a48d94b06bb3b2524c2b77e055fb313aeb4ea620c8dd03a105574ba704f", + "sha256:cd5df75523866410809ca100dc9681e301e3c27567cf498077e8551b6d20e42f", + "sha256:cdb132fc825c38e1aeec2c8aa9338310d29d337bebbd7baa06889d09a60a1fa2", + "sha256:e249096428b3ae81b08327a63a485ad0878de3fb939049038579ac0ef61e17e7", + "sha256:e8313f01ba26fbbe36c7be1966a7b7424942f670f38e666995b88d012765b9be" + ], + "version": "==1.1.1" + }, + "mypy": { + "hashes": [ + "sha256:0a0d102247c16ce93c97066443d11e2d36e6cc2a32d8ccc1f705268970479324", + "sha256:0d34d6b122597d48a36d6c59e35341f410d4abfa771d96d04ae2c468dd201abc", + "sha256:2170492030f6faa537647d29945786d297e4862765f0b4ac5930ff62e300d802", + "sha256:2842d4fbd1b12ab422346376aad03ff5d0805b706102e475e962370f874a5122", + "sha256:2b21ba45ad9ef2e2eb88ce4aeadd0112d0f5026418324176fd494a6824b74975", + "sha256:72060bf64f290fb629bd4a67c707a66fd88ca26e413a91384b18db3876e57ed7", + "sha256:af4e9ff1834e565f1baa74ccf7ae2564ae38c8df2a85b057af1dbbc958eb6666", + "sha256:bd03b3cf666bff8d710d633d1c56ab7facbdc204d567715cb3b9f85c6e94f669", + "sha256:c614194e01c85bb2e551c421397e49afb2872c88b5830e3554f0519f9fb1c178", + "sha256:cf4e7bf7f1214826cf7333627cb2547c0db7e3078723227820d0a2490f117a01", + "sha256:da56dedcd7cd502ccd3c5dddc656cb36113dd793ad466e894574125945653cea", + "sha256:e86bdace26c5fe9cf8cb735e7cedfe7850ad92b327ac5d797c656717d2ca66de", + "sha256:e97e9c13d67fbe524be17e4d8025d51a7dca38f90de2e462243ab8ed8a9178d1", + "sha256:eea260feb1830a627fb526d22fbb426b750d9f5a47b624e8d5e7e004359b219c" + ], + "index": "pypi", + "version": "==0.790" + }, + "mypy-extensions": { + "hashes": [ + "sha256:090fedd75945a69ae91ce1303b5824f428daf5a028d2f6ab8a299250a846f15d", + "sha256:2d82818f5bb3e369420cb3c4060a7970edba416647068eb4c5343488a6c604a8" + ], + "version": "==0.4.3" + }, + "typed-ast": { + "hashes": [ + "sha256:0666aa36131496aed8f7be0410ff974562ab7eeac11ef351def9ea6fa28f6355", + "sha256:0c2c07682d61a629b68433afb159376e24e5b2fd4641d35424e462169c0a7919", + "sha256:249862707802d40f7f29f6e1aad8d84b5aa9e44552d2cc17384b209f091276aa", + "sha256:24995c843eb0ad11a4527b026b4dde3da70e1f2d8806c99b7b4a7cf491612652", + "sha256:269151951236b0f9a6f04015a9004084a5ab0d5f19b57de779f908621e7d8b75", + "sha256:4083861b0aa07990b619bd7ddc365eb7fa4b817e99cf5f8d9cf21a42780f6e01", + "sha256:498b0f36cc7054c1fead3d7fc59d2150f4d5c6c56ba7fb150c013fbc683a8d2d", + "sha256:4e3e5da80ccbebfff202a67bf900d081906c358ccc3d5e3c8aea42fdfdfd51c1", + "sha256:6daac9731f172c2a22ade6ed0c00197ee7cc1221aa84cfdf9c31defeb059a907", + "sha256:715ff2f2df46121071622063fc7543d9b1fd19ebfc4f5c8895af64a77a8c852c", + "sha256:73d785a950fc82dd2a25897d525d003f6378d1cb23ab305578394694202a58c3", + "sha256:8c8aaad94455178e3187ab22c8b01a3837f8ee50e09cf31f1ba129eb293ec30b", + "sha256:8ce678dbaf790dbdb3eba24056d5364fb45944f33553dd5869b7580cdbb83614", + "sha256:aaee9905aee35ba5905cfb3c62f3e83b3bec7b39413f0a7f19be4e547ea01ebb", + "sha256:bcd3b13b56ea479b3650b82cabd6b5343a625b0ced5429e4ccad28a8973f301b", + "sha256:c9e348e02e4d2b4a8b2eedb48210430658df6951fa484e59de33ff773fbd4b41", + "sha256:d205b1b46085271b4e15f670058ce182bd1199e56b317bf2ec004b6a44f911f6", + "sha256:d43943ef777f9a1c42bf4e552ba23ac77a6351de620aa9acf64ad54933ad4d34", + "sha256:d5d33e9e7af3b34a40dc05f498939f0ebf187f07c385fd58d591c533ad8562fe", + "sha256:fc0fea399acb12edbf8a628ba8d2312f583bdbdb3335635db062fa98cf71fca4", + "sha256:fe460b922ec15dd205595c9b5b99e2f056fd98ae8f9f56b888e7a17dc2b757e7" + ], + "version": "==1.4.1" + }, + "typing-extensions": { + "hashes": [ + "sha256:6e95524d8a547a91e08f404ae485bbb71962de46967e1b71a0cb89af24e761c5", + "sha256:79ee589a3caca649a9bfd2a8de4709837400dfa00b6cc81962a1e6a1815969ae", + "sha256:f8d2bd89d25bc39dabe7d23df520442fa1d8969b82544370e03d88b5a591c392" + ], + "version": "==3.7.4.2" + } + } +} diff --git a/launcher.py b/launcher.py new file mode 100644 index 0000000..db134b3 --- /dev/null +++ b/launcher.py @@ -0,0 +1,65 @@ +import asyncio +import atexit +import threading +import time +from multiprocessing.pool import Pool + +from smart.log import log +from smart.pipline import Piplines +from smart.runer import CrawStater +from spiders.db.sanicdb import SanicDB +from spiders.govs import GovsSpider, ArticelItem +from spiders.ipspider2 import IpSpider3, GovSpider, IpSpider, ApiSpider +from spiders.js.js_spider import JsSpider, Broswer +from spiders.json_spider import JsonSpider +from test import middleware2 + +piplinestest = Piplines() + + +@piplinestest.pipline(1) +async def do_pip(spider_ins, item): + return item + + +@piplinestest.pipline(2) +def do_pip2(spider_ins, item): + print(f"我是item2 {item.results}") + return item + + +db = SanicDB('localhost', 'testdb', 'root', 'root', + minsize=5, maxsize=55, + connect_timeout=10 + ) + + +@atexit.register +def when_end(): + global db + if db: + db.close() + + +@piplinestest.pipline(3) +async def to_mysql_db(spider_ins, item): + if item and isinstance(item, ArticelItem): + print(f"我是item3 入库 {item.results}") + global db + last_id = await db.table_insert("art", item.results) + print(f"last_id {last_id}") + + return item + + +def start1(): + starter = CrawStater() + starter.run_single(IpSpider(), middlewire=middleware2, pipline=piplinestest) + + +if __name__ == '__main__': + starter = CrawStater() + spider1 = GovsSpider() + spider2 = JsonSpider() + js_spider = JsSpider() + starter.run_many([IpSpider()], middlewire=middleware2, pipline=piplinestest) diff --git a/readme.md b/readme.md new file mode 100644 index 0000000..7232ca0 --- /dev/null +++ b/readme.md @@ -0,0 +1,7 @@ +## Smart-spider +Smart-spider + + + + + diff --git a/smart/__init__.py b/smart/__init__.py new file mode 100644 index 0000000..eae9ee8 --- /dev/null +++ b/smart/__init__.py @@ -0,0 +1,7 @@ +# -*- coding utf-8 -*-# +# ------------------------------------------------------------------ +# Name: __init__.py +# Author: liangbaikai +# Date: 2020/12/21 +# Desc: there is smart-framework core package +# ------------------------------------------------------------------ \ No newline at end of file diff --git a/smart/core.py b/smart/core.py new file mode 100644 index 0000000..1cf5b73 --- /dev/null +++ b/smart/core.py @@ -0,0 +1,229 @@ +# -*- coding utf-8 -*-# +# ------------------------------------------------------------------ +# Name: core +# Author: liangbaikai +# Date: 2020/12/22 +# Desc: there is a python file description +# ------------------------------------------------------------------ +import asyncio +import importlib +import inspect +import time +import traceback +import uuid +from asyncio import Lock +from collections import deque +from contextlib import suppress +from typing import Dict + +from smart.log import log +from smart.downloader import Downloader +from smart.item import Item +from smart.pipline import Piplines +from smart.request import Request +from smart.scheduler import Scheduler +from smart.setting import gloable_setting_dict + + +class Engine: + def __init__(self, spider, middlewire=None, pipline: Piplines = None): + self.lock = None + self.task_dict: Dict[str, asyncio.Task] = {} + self.pip_task_dict: Dict[str, asyncio.Task] = {} + self.spider = spider + self.middlewire = middlewire + self.piplines = pipline + duplicate_filter_class = self._get_dynamic_class_setting("duplicate_filter_class") + scheduler_container_class = self._get_dynamic_class_setting("scheduler_container_class") + net_download_class = self._get_dynamic_class_setting("net_download_class") + self.scheduler = Scheduler(duplicate_filter_class(), scheduler_container_class()) + req_per_concurrent = self.spider.cutome_setting_dict.get("req_per_concurrent") or gloable_setting_dict.get( + "req_per_concurrent") + self.downloader = Downloader(self.scheduler, self.middlewire, seq=req_per_concurrent, + downer=net_download_class()) + self.request_generator_queue = deque() + self.stop = False + self.log = log + + def _get_dynamic_class_setting(self, key): + class_str = self.spider.cutome_setting_dict.get( + key) or gloable_setting_dict.get( + key) + _module = importlib.import_module(".".join(class_str.split(".")[:-1])) + _class = getattr(_module, class_str.split(".")[-1]) + return _class + + def iter_request(self): + while True: + if not self.request_generator_queue: + yield None + continue + request_generator = self.request_generator_queue[0] + spider, real_request_generator = request_generator[0], request_generator[1] + try: + # execute and get a request from cutomer code + # request=real_request_generator.send(None) + request_or_item = next(real_request_generator) + if isinstance(request_or_item, Request): + request_or_item.__spider__ = spider + except StopIteration: + self.request_generator_queue.popleft() + continue + except Exception as e: + # 可以处理异常 + self.request_generator_queue.popleft() + self._handle_exception(spider, e) + continue + yield request_or_item + + def _check_complete_pip(self, task): + if task.cancelled(): + self.log.debug(f" a task canceld ") + return + if task and task.done() and task._key: + if task.exception(): + self.log.error(f"a task occurer error in pipline {task.exception()} ") + else: + self.log.debug(f"a task done ") + result = task.result() + if result and isinstance(result, Item): + if hasattr(task, '_index'): + self._hand_piplines(task._spider, result, task._index + 1) + self.pip_task_dict.pop(task._key) + + def _check_complete_callback(self, task): + if task.cancelled(): + self.log.debug(f" a task canceld ") + return + if task and task.done() and task._key: + self.log.debug(f"a task done ") + self.task_dict.pop(task._key) + + async def start(self): + self.spider.on_start() + # self.spider + self.request_generator_queue.append((self.spider, iter(self.spider))) + # self.request_generator_queue.append( iter(self.spider)) + # core implenment + while not self.stop: + # paused + if self.lock and self.lock.locked(): + await asyncio.sleep(1) + continue + + request_or_item = next(self.iter_request()) + if isinstance(request_or_item, Request): + self.scheduler.schedlue(request_or_item) + + if isinstance(request_or_item, Item): + self._hand_piplines(self.spider, request_or_item) + + request = self.scheduler.get() + can_stop = self._check_can_stop(request) + # if request is None and not self.task_dict: + if can_stop: + # there is no request and the task has been completed.so ended + self.log.debug( + f" here is no request and the task has been completed.so engine will stop ..") + self.stop = True + break + if isinstance(request, Request): + self._ensure_future(request) + + resp = self.downloader.get() + + if resp is None: + # let the_downloader can be scheduled, test 0.001-0.0006 is better + await asyncio.sleep(0.0005) + continue + + custome_callback = resp.request.callback + if custome_callback: + request_generator = custome_callback(resp) + if request_generator: + self.request_generator_queue.append((custome_callback.__self__, request_generator)) + # self.request_generator_queue.append( request_generator) + if self.spider.state != "runing": + self.spider.state = "runing" + + self.spider.state = "closed" + self.spider.on_close() + self.log.debug(f" engine stoped..") + await asyncio.sleep(0.15) + + def pause(self): + self.log.info(f" out called pause.. so engine will pause.. ") + asyncio.create_task(self._lock()) + self.spider.state = "pause" + + def recover(self): + if self.lock and self.lock.locked(): + self.log.info(f" out called recover.. so engine will recover.. ") + self.lock.release() + + def close(self): + # can make external active end engine + self.stop = True + tasks = asyncio.all_tasks() + for it in tasks: + it.cancel() + asyncio.gather(*tasks, return_exceptions=True) + self.log.debug(f" out called stop.. so engine close.. ") + + async def _lock(self): + if self.lock is None: + self.lock = Lock() + await self.lock.acquire() + + def _ensure_future(self, request: Request): + # compatible py_3.6 + task = asyncio.ensure_future(self.downloader.download(request)) + key = str(uuid.uuid4()) + task._key = key + self.task_dict[key] = task + task.add_done_callback(self._check_complete_callback) + + def _handle_exception(self, spider, e): + if spider: + try: + self.log.error(f" occured exceptyion e {e} ", exc_info=True) + spider.on_exception_occured(e) + except BaseException: + pass + + def _check_can_stop(self, request): + if request: + return False + if len(self.task_dict) > 0: + return False + if len(self.request_generator_queue) > 0: + return False + if self.downloader.response_queue.qsize() > 0: + return False + if len(self.pip_task_dict) > 0: + return False + return True + + def _hand_piplines(self, spider_ins, item, index=0): + if self.piplines is None or len(self.piplines.piplines) <= 0: + self.log.info("get a item but can not find a piplinse to handle it so ignore it ") + return + + if len(self.piplines.piplines) < index + 1: + return + + pip = self.piplines.piplines[index][1] + + if not callable(pip): + return + + if not inspect.iscoroutinefunction(pip): + task = asyncio.get_running_loop().run_in_executor(None, pip, spider_ins, item) + else: + task = asyncio.ensure_future(pip(spider_ins, item)) + key = str(uuid.uuid4()) + task._key = key + task._index = index + task._spider = spider_ins + self.pip_task_dict[key] = task + task.add_done_callback(self._check_complete_pip) diff --git a/smart/downloader.py b/smart/downloader.py new file mode 100644 index 0000000..4be863f --- /dev/null +++ b/smart/downloader.py @@ -0,0 +1,189 @@ +# -*- coding utf-8 -*-# +# ------------------------------------------------------------------ +# Name: downloader +# Author: liangbaikai +# Date: 2020/12/21 +# Desc: there is a python file description +# ------------------------------------------------------------------ +import asyncio +import inspect +from abc import ABC, abstractmethod +from asyncio import Queue, QueueEmpty +from contextlib import suppress +from typing import Optional +import aiohttp +from concurrent.futures import TimeoutError + +from smart.log import log +from smart.middlewire import Middleware +from smart.response import Response +from smart.scheduler import Scheduler +from smart.setting import gloable_setting_dict +from .request import Request + + +class BaseDown(ABC): + + @abstractmethod + def fetch(self, request: Request) -> Response: + pass + + +# class RequestsDown(BaseDown): +# def fetch(self, request: Request) -> Response: +# import requests +# res = requests.get(request.url, +# timeout=request.timeout or 3, +# ) +# response = Response(body=res.content, request=request, +# headers=res.headers, +# cookies=res.cookies, +# status=res.status_code) +# return response + + +class AioHttpDown(BaseDown): + + async def fetch(self, request: Request) -> Response: + async with aiohttp.ClientSession() as clicnt: + resp = await clicnt.request(request.method, + request.url, + timeout=request.timeout or 10, + headers=request.header or {}, + cookies=request.cookies or {}, + data=request.data or {}, + **request.extras or {} + ) + byte_content = await resp.read() + headers = {} + if resp.headers: + headers = {k: v for k, v in resp.headers.items()} + response = Response(body=byte_content, + status=resp.status, + headers=headers, + cookies=resp.cookies + ) + return response + + +class Downloader: + + def __init__(self, scheduler: Scheduler, middwire: Middleware = None, seq=100, downer: BaseDown = AioHttpDown()): + self.log = log + self.scheduler = scheduler + self.middwire = middwire + self.response_queue: asyncio.Queue = Queue() + # the file handle opens too_much to report an error + self.semaphore = asyncio.Semaphore(seq) + # the real to fetch resource from internet + self.downer = downer + self.log.info(f" downer loaded {self.downer.__class__.__name__}") + async def download(self, request: Request): + spider = request.__spider__ + max_retry = spider.cutome_setting_dict.get("req_max_retry") or gloable_setting_dict.get( + "req_max_retry") + if max_retry <= 0: + raise ValueError("req_max_retry must >0") + header_dict = spider.cutome_setting_dict.get("default_headers") or gloable_setting_dict.get( + "default_headers") + req_timeout = request.timeout or spider.cutome_setting_dict.get("req_timeout") or gloable_setting_dict.get( + "req_timeout") + request.timeout = req_timeout + header = request.header or {} + request.header = header.update(header_dict) + request.header = header + ignore_response_codes = spider.cutome_setting_dict.get("ignore_response_codes") or gloable_setting_dict.get( + "ignore_response_codes") + req_delay = spider.cutome_setting_dict.get("req_delay") or gloable_setting_dict.get("req_delay") + + if request and request.retry >= max_retry: + # reached max retry times + self.log.error(f'reached max retry times... {request}') + return + request.retry = request.retry + 1 + # when canceled + loop = asyncio.get_running_loop() + if loop.is_closed() or not loop.is_running(): + self.log.warning(f'loop is closed in download') + return + with suppress(asyncio.CancelledError): + async with self.semaphore: + await self._before_fetch(request) + + fetch = self.downer.fetch + iscoroutinefunction = inspect.iscoroutinefunction(fetch) + # support sync or async request + try: + # req_delay + if req_delay > 0: + await asyncio.sleep(req_delay) + self.log.debug( + f"send a request: \r\n【 \r\n url: {request.url} \r\n method: {request.method} \r\n header: {request.header} \r\n 】") + # + if iscoroutinefunction: + response = await fetch(request) + else: + self.log.debug(f'fetch may be an snyc func so it will run in executor ') + response = await asyncio.get_event_loop() \ + .run_in_executor(None, fetch, request) + except TimeoutError as e: + # delay retry + self.scheduler.schedlue(request) + self.log.debug( + f'req to fetch is timeout now so this req will dely to sechdule for retry {request.url}') + return + except asyncio.CancelledError as e: + self.log.debug(f' task is cancel..') + return + except BaseException as e: + self.log.error(f'occured some exception in downloader e:{e}') + return + if response is None or not isinstance(response, Response): + self.log.error( + f'the downer {self.downer.__class__.__name__} fetch function must return a response,' + 'that is a no-null response, and response must be a ' + 'smart.Response instance or sub Response instance. ') + return + + if response.status not in ignore_response_codes: + await self._after_fetch(request, response) + + if response.status not in ignore_response_codes: + response.request = request + response.__spider__ = spider + await self.response_queue.put(response) + + def get(self) -> Optional[Response]: + with suppress(QueueEmpty): + return self.response_queue.get_nowait() + + async def _before_fetch(self, request): + if self.middwire and len(self.middwire.request_middleware) > 0: + for item_tuple in self.middwire.request_middleware: + user_func = item_tuple[1] + if callable(user_func): + try: + # res not used + if inspect.iscoroutinefunction(user_func): + res = await user_func(request.__spider__, request) + else: + res = await asyncio.get_event_loop() \ + .run_in_executor(None, user_func, request.__spider__, request) + except Exception as e: + self.log.error(f"in middwire,before do send a request occured an error: {e}", exc_info=True) + return + + async def _after_fetch(self, request, response): + if response and self.middwire and len(self.middwire.response_middleware) > 0: + for item_tuple in self.middwire.response_middleware: + if callable(item_tuple[1]): + try: + # res not used + if inspect.iscoroutinefunction(item_tuple[1]): + res = await item_tuple[1](request.__spider__, request, response) + else: + res = await asyncio.get_event_loop() \ + .run_in_executor(None, item_tuple[1], request.__spider__, request, response) + except Exception as e: + self.log.error(f"in middwire,after a request sended, occured an error: {e}", exc_info=True) + return diff --git a/smart/field.py b/smart/field.py new file mode 100644 index 0000000..14e77a7 --- /dev/null +++ b/smart/field.py @@ -0,0 +1,2330 @@ +# -*- coding utf-8 -*-# +# ------------------------------------------------------------------ +# Name: item +# Author: liangbaikai +# Date: 2020/12/31 +# Desc: there is a python file description +# ------------------------------------------------------------------ +import json +import re +from abc import abstractmethod, ABC +from typing import Union, Iterable, Callable, Any + +import jsonpath +from lxml import etree +from lxml.etree import _ElementUnicodeResult + + +class BaseField: + + def __init__(self, default=None, many: bool = False): + self.default = default + self.many = many + + def extract(self, *args, **kwargs): + ... + + +class _LxmlElementField(BaseField): + def __init__( + self, + css_select: str = None, + xpath_select: str = None, + default='', + many: bool = False, + ): + """ + :param css_select: css select http://lxml.de/cssselect.html + :param xpath_select: http://www.w3school.com.cn/xpath/index.asp + :param default: inherit + :param many: inherit + """ + super(_LxmlElementField, self).__init__(default=default, many=many) + self.css_select = css_select + self.xpath_select = xpath_select + + def _get_elements(self, *, html_etree: etree._Element): + if self.css_select: + elements = html_etree.cssselect(self.css_select) + elif self.xpath_select: + elements = html_etree.xpath(self.xpath_select) + else: + raise ValueError( + f"{self.__class__.__name__} field: css_select or xpath_select is expected." + ) + if not self.many: + elements = elements[:1] + return elements + + def _parse_element(self, element): + raise NotImplementedError + + def extract(self, html: Union[etree._Element, str]): + if html is None: + raise ValueError("html_etree can not be null..") + + if html and not isinstance(html, etree._Element): + html = etree.HTML(html) + + elements = self._get_elements(html_etree=html) + + # if is_source: + # return elements if self.many else elements[0] + + if elements: + results = [self._parse_element(element) for element in elements] + elif self.default is None: + raise ValueError( + f"Extract `{self.css_select or self.xpath_select}` error, " + "please check selector or set parameter named `default`" + ) + else: + results = self.default if type(self.default) == list else [self.default] + + return results if self.many else results[0] + + +class AttrField(_LxmlElementField): + """ + This field is used to get attribute. + """ + + def __init__( + self, + attr, + css_select: str = None, + xpath_select: str = None, + default="", + many: bool = False, + ): + super(AttrField, self).__init__( + css_select=css_select, xpath_select=xpath_select, default=default, many=many + ) + self.attr = attr + + def _parse_element(self, element): + return element.get(self.attr, self.default) + + +class ElementField(_LxmlElementField): + """ + This field is used to get LXML element(s). + """ + + def _parse_element(self, element): + return element + + +class HtmlField(_LxmlElementField): + """ + This field is used to get raw html data. + """ + + def _parse_element(self, element): + if element is None: + return None + if isinstance(element, _ElementUnicodeResult): + res = element.encode("utf-8").decode(encoding="utf-8") + else: + res = etree.tostring(element, encoding="utf-8").decode(encoding="utf-8") + if res: + res = res.strip() + return res + + +class TextField(_LxmlElementField): + """ + This field is used to get text. + """ + + def _parse_element(self, element): + # Extract text appropriately based on it's type + if isinstance(element, etree._ElementUnicodeResult): + strings = [node for node in element] + else: + strings = [node for node in element.itertext()] + + string = "".join(strings) + return string if string else self.default + + +class JsonPathField(BaseField): + def __init__(self, json_path: str, default="", many: bool = False): + super(JsonPathField, self).__init__(default=default, many=many) + self._json_path = json_path + + def extract(self, html: Union[str, dict, etree._Element]): + if isinstance(html, etree._Element): + html = etree.tostring(html).decode(encoding="utf-8") + if isinstance(html, str) or isinstance(html, etree._Element): + html = json.loads(html) + json_loads = html + res = jsonpath.jsonpath(json_loads, self._json_path) + if isinstance(res, bool) and not res: + return self.default + if self.many: + if isinstance(res, Iterable): + return res + else: + return [res] + else: + if isinstance(res, Iterable) and not isinstance(res, str): + return res[0] + else: + return res + + +class RegexField(BaseField): + """ + This field is used to get raw html code by regular expression. + RegexField uses standard library `re` inner, that is to say it has a better performance than _LxmlElementField. + """ + + def __init__(self, re_select: str, re_flags=0, default="", many: bool = False): + super(RegexField, self).__init__(default=default, many=many) + self._re_select = re_select + self._re_object = re.compile(self._re_select, flags=re_flags) + + def _parse_match(self, match): + if not match: + if self.default is not None: + return self.default + else: + raise ValueError( + f"Extract `{self._re_select}` error, can not founded " + f"please check selector or set parameter named `default`" + ) + else: + string = match.group() + groups = match.groups() + group_dict = match.groupdict() + if group_dict: + return group_dict + if groups: + return groups[0] if len(groups) == 1 else groups + return string + + def extract(self, html: Union[str, dict, etree._Element]): + if isinstance(html, etree._Element): + html = etree.tostring(html).decode(encoding="utf-8") + if isinstance(html, dict): + html = json.dumps(html, ensure_ascii=False) + if self.many: + matches = self._re_object.finditer(html) + return [self._parse_match(match) for match in matches] + else: + match = self._re_object.search(html) + return self._parse_match(match) + + +class FuncField(BaseField): + def __init__(self, call: Callable, name: str, default="", many: bool = False): + super(FuncField, self).__init__(default=default, many=many) + self._callable = call + if not callable(self._callable): + raise TypeError("callable param need a function or cab be called") + self._name = name + + def extract(self, html: Any): + res = self._callable(html, self._name) + if self.many: + if isinstance(res, Iterable): + return res + else: + return [res] + else: + if isinstance(res, Iterable) and not isinstance(res, str): + return res[0] + else: + return res + + +if __name__ == '__main__': + html = """ + + + +
+ + + +修炼一途,乃窃阴阳,夺造化,转涅盘,握生死,掌轮回。 + 武之极,破苍穹,动乾坤! + 新书求收藏,求推荐,谢大家o(n_n)o~ +
+