From a2df806c952ce5c7bef90599079a936ae727f7b2 Mon Sep 17 00:00:00 2001 From: liqiang <1144388620@qq.com> Date: Mon, 18 Jan 2021 11:15:04 +0800 Subject: [PATCH] fix --- Pipfile | 18 +- Pipfile.lock | 278 ---- launcher.py | 85 +- smart/core.py | 21 +- smart/downloader.py | 17 +- smart/item.py | 1 - smart/response.py | 2 +- smart/runer.py | 2 +- smart/scheduler.py | 12 +- smart/setting.py | 4 + smart/signal.py | 39 + smart/tool.py | 33 + spiders/distributed/__init__.py | 71 + spiders/govs.py | 7 +- spiders/ipspider2.py | 26 +- spiders/json_spider.py | 4 + test/__init__.py | 2 +- test/tool_full.py | 2152 +++++++++++++++++++++++++++++++ test/uni_test.py | 14 +- 19 files changed, 2482 insertions(+), 306 deletions(-) delete mode 100644 Pipfile.lock create mode 100644 smart/signal.py create mode 100644 spiders/distributed/__init__.py create mode 100644 test/tool_full.py diff --git a/Pipfile b/Pipfile index ab405cb..3e25885 100644 --- a/Pipfile +++ b/Pipfile @@ -1,9 +1,10 @@ [[source]] -name = "pypi" -url = "https://mirrors.163.com/pypi/simple/" verify_ssl = true +name = "pypi" +url = "https://mirrors.aliyun.com/pypi/simple/" [dev-packages] +aioredis = "*" pytest = "*" mkdocs = "*" pymysql = "*" @@ -12,14 +13,23 @@ pyppeteer = "*" ruia = "*" ruia-ua = "*" requests = "*" -fastapi = "*" +#fastapi = "*" + [packages] +aioredis = "==1.3.1" aiohttp = "*" lxml = "*" -uvicorn = {extras = ["standard"],version = "*"} +#uvicorn = {extras = ["standard"],version = "*"} python-multipart = "*" jsonpath = "*" parsel = "*" cchardet = "*" +aiomysql = "*" +pyppeteer = "*" +redis = "*" +pytest = "*" +blinker = "*" +ruia = "*" + [requires] python_version = "3.7" diff --git a/Pipfile.lock b/Pipfile.lock deleted file mode 100644 index 6efad0e..0000000 --- a/Pipfile.lock +++ /dev/null @@ -1,278 +0,0 @@ -{ - "_meta": { - "hash": { - "sha256": "5154b1655506994ecfe02d649e521625a8e07831b60a5bfcba75e82c54d25212" - }, - "pipfile-spec": 6, - "requires": { - "python_version": "3.7" - }, - "sources": [ - { 
- "name": "pypi", - "url": "https://mirrors.163.com/pypi/simple/", - "verify_ssl": true - } - ] - }, - "default": { - "aiohttp": { - "hashes": [ - "sha256:1e984191d1ec186881ffaed4581092ba04f7c61582a177b187d3a2f07ed9719e", - "sha256:259ab809ff0727d0e834ac5e8a283dc5e3e0ecc30c4d80b3cd17a4139ce1f326", - "sha256:2f4d1a4fdce595c947162333353d4a44952a724fba9ca3205a3df99a33d1307a", - "sha256:32e5f3b7e511aa850829fbe5aa32eb455e5534eaa4b1ce93231d00e2f76e5654", - "sha256:344c780466b73095a72c616fac5ea9c4665add7fc129f285fbdbca3cccf4612a", - "sha256:460bd4237d2dbecc3b5ed57e122992f60188afe46e7319116da5eb8a9dfedba4", - "sha256:4c6efd824d44ae697814a2a85604d8e992b875462c6655da161ff18fd4f29f17", - "sha256:50aaad128e6ac62e7bf7bd1f0c0a24bc968a0c0590a726d5a955af193544bcec", - "sha256:6206a135d072f88da3e71cc501c59d5abffa9d0bb43269a6dcd28d66bfafdbdd", - "sha256:65f31b622af739a802ca6fd1a3076fd0ae523f8485c52924a89561ba10c49b48", - "sha256:ae55bac364c405caa23a4f2d6cfecc6a0daada500274ffca4a9230e7129eac59", - "sha256:b778ce0c909a2653741cb4b1ac7015b5c130ab9c897611df43ae6a58523cb965" - ], - "index": "pypi", - "version": "==3.6.2" - }, - "async-timeout": { - "hashes": [ - "sha256:0c3c816a028d47f659d6ff5c745cb2acf1f966da1fe5c19c77a70282b25f4c5f", - "sha256:4291ca197d287d274d0b6cb5d6f8f8f82d434ed288f962539ff18cc9012f9ea3" - ], - "version": "==3.0.1" - }, - "attrs": { - "hashes": [ - "sha256:08a96c641c3a74e44eb59afb61a24f2cb9f4d7188748e76ba4bb5edfa3cb7d1c", - "sha256:f7b7ce16570fe9965acd6d30101a28f62fb4a7f9e926b3bbc9b61f8b04247e72" - ], - "version": "==19.3.0" - }, - "certifi": { - "hashes": [ - "sha256:1a4995114262bffbc2413b159f2a1a480c969de6e6eb13ee966d470af86af59c", - "sha256:719a74fb9e33b9bd44cc7f3a8d94bc35e4049deebe19ba7d8e108280cfd59830" - ], - "version": "==2020.12.5" - }, - "chardet": { - "hashes": [ - "sha256:84ab92ed1c4d4f16916e05906b6b75a6c0fb5db821cc65e70cbd64a3e2a5eaae", - "sha256:fc323ffcaeaed0e0a02bf4d117757b98aed530d9ed4531e3e15460124c106691" - ], - "version": "==3.0.4" - }, - "idna": 
{ - "hashes": [ - "sha256:7588d1c14ae4c77d74036e8c22ff447b26d0fde8f007354fd48a7814db15b7cb", - "sha256:a068a21ceac8a4d63dbfd964670474107f541babbd2250d61922f029858365fa" - ], - "version": "==2.9" - }, - "lxml": { - "hashes": [ - "sha256:05a444b207901a68a6526948c7cc8f9fe6d6f24c70781488e32fd74ff5996e3f", - "sha256:08fc93257dcfe9542c0a6883a25ba4971d78297f63d7a5a26ffa34861ca78730", - "sha256:107781b213cf7201ec3806555657ccda67b1fccc4261fb889ef7fc56976db81f", - "sha256:121b665b04083a1e85ff1f5243d4a93aa1aaba281bc12ea334d5a187278ceaf1", - "sha256:1fa21263c3aba2b76fd7c45713d4428dbcc7644d73dcf0650e9d344e433741b3", - "sha256:2b30aa2bcff8e958cd85d907d5109820b01ac511eae5b460803430a7404e34d7", - "sha256:4b4a111bcf4b9c948e020fd207f915c24a6de3f1adc7682a2d92660eb4e84f1a", - "sha256:5591c4164755778e29e69b86e425880f852464a21c7bb53c7ea453bbe2633bbe", - "sha256:59daa84aef650b11bccd18f99f64bfe44b9f14a08a28259959d33676554065a1", - "sha256:5a9c8d11aa2c8f8b6043d845927a51eb9102eb558e3f936df494e96393f5fd3e", - "sha256:5dd20538a60c4cc9a077d3b715bb42307239fcd25ef1ca7286775f95e9e9a46d", - "sha256:74f48ec98430e06c1fa8949b49ebdd8d27ceb9df8d3d1c92e1fdc2773f003f20", - "sha256:786aad2aa20de3dbff21aab86b2fb6a7be68064cbbc0219bde414d3a30aa47ae", - "sha256:7ad7906e098ccd30d8f7068030a0b16668ab8aa5cda6fcd5146d8d20cbaa71b5", - "sha256:80a38b188d20c0524fe8959c8ce770a8fdf0e617c6912d23fc97c68301bb9aba", - "sha256:8f0ec6b9b3832e0bd1d57af41f9238ea7709bbd7271f639024f2fc9d3bb01293", - "sha256:92282c83547a9add85ad658143c76a64a8d339028926d7dc1998ca029c88ea6a", - "sha256:94150231f1e90c9595ccc80d7d2006c61f90a5995db82bccbca7944fd457f0f6", - "sha256:9dc9006dcc47e00a8a6a029eb035c8f696ad38e40a27d073a003d7d1443f5d88", - "sha256:a76979f728dd845655026ab991df25d26379a1a8fc1e9e68e25c7eda43004bed", - "sha256:aa8eba3db3d8761db161003e2d0586608092e217151d7458206e243be5a43843", - "sha256:bea760a63ce9bba566c23f726d72b3c0250e2fa2569909e2d83cda1534c79443", - "sha256:c3f511a3c58676147c277eff0224c061dd5a6a8e1373572ac817ac6324f1b1e0", - 
"sha256:c9d317efde4bafbc1561509bfa8a23c5cab66c44d49ab5b63ff690f5159b2304", - "sha256:cc411ad324a4486b142c41d9b2b6a722c534096963688d879ea6fa8a35028258", - "sha256:cdc13a1682b2a6241080745b1953719e7fe0850b40a5c71ca574f090a1391df6", - "sha256:cfd7c5dd3c35c19cec59c63df9571c67c6d6e5c92e0fe63517920e97f61106d1", - "sha256:e1cacf4796b20865789083252186ce9dc6cc59eca0c2e79cca332bdff24ac481", - "sha256:e70d4e467e243455492f5de463b72151cc400710ac03a0678206a5f27e79ddef", - "sha256:ecc930ae559ea8a43377e8b60ca6f8d61ac532fc57efb915d899de4a67928efd", - "sha256:f161af26f596131b63b236372e4ce40f3167c1b5b5d459b29d2514bd8c9dc9ee" - ], - "index": "pypi", - "version": "==4.5.2" - }, - "multidict": { - "hashes": [ - "sha256:1ece5a3369835c20ed57adadc663400b5525904e53bae59ec854a5d36b39b21a", - "sha256:275ca32383bc5d1894b6975bb4ca6a7ff16ab76fa622967625baeebcf8079000", - "sha256:3750f2205b800aac4bb03b5ae48025a64e474d2c6cc79547988ba1d4122a09e2", - "sha256:4538273208e7294b2659b1602490f4ed3ab1c8cf9dbdd817e0e9db8e64be2507", - "sha256:5141c13374e6b25fe6bf092052ab55c0c03d21bd66c94a0e3ae371d3e4d865a5", - "sha256:51a4d210404ac61d32dada00a50ea7ba412e6ea945bbe992e4d7a595276d2ec7", - "sha256:5cf311a0f5ef80fe73e4f4c0f0998ec08f954a6ec72b746f3c179e37de1d210d", - "sha256:6513728873f4326999429a8b00fc7ceddb2509b01d5fd3f3be7881a257b8d463", - "sha256:7388d2ef3c55a8ba80da62ecfafa06a1c097c18032a501ffd4cabbc52d7f2b19", - "sha256:9456e90649005ad40558f4cf51dbb842e32807df75146c6d940b6f5abb4a78f3", - "sha256:c026fe9a05130e44157b98fea3ab12969e5b60691a276150db9eda71710cd10b", - "sha256:d14842362ed4cf63751648e7672f7174c9818459d169231d03c56e84daf90b7c", - "sha256:e0d072ae0f2a179c375f67e3da300b47e1a83293c554450b29c900e50afaae87", - "sha256:f07acae137b71af3bb548bd8da720956a3bc9f9a0b87733e0899226a2317aeb7", - "sha256:fbb77a75e529021e7c4a8d4e823d88ef4d23674a202be4f5addffc72cbb91430", - "sha256:fcfbb44c59af3f8ea984de67ec7c306f618a3ec771c2843804069917a8f2e255", - 
"sha256:feed85993dbdb1dbc29102f50bca65bdc68f2c0c8d352468c25b54874f23c39d" - ], - "version": "==4.7.6" - }, - "requests": { - "hashes": [ - "sha256:43999036bfa82904b6af1d99e4882b560e5e2c68e5c4b0aa03b655f3d7d73fee", - "sha256:b3f43d496c6daba4493e7c431722aeb7dbc6288f52a6e04e7b6023b0247817e6" - ], - "index": "pypi", - "version": "==2.23.0" - }, - "urllib3": { - "hashes": [ - "sha256:3018294ebefce6572a474f0604c2021e33b3fd8006ecd11d62107a5d2a963527", - "sha256:88206b0eb87e6d677d424843ac5209e3fb9d0190d0ee169599165ec25e9d9115" - ], - "version": "==1.25.9" - }, - "yarl": { - "hashes": [ - "sha256:0c2ab325d33f1b824734b3ef51d4d54a54e0e7a23d13b86974507602334c2cce", - "sha256:0ca2f395591bbd85ddd50a82eb1fde9c1066fafe888c5c7cc1d810cf03fd3cc6", - "sha256:2098a4b4b9d75ee352807a95cdf5f10180db903bc5b7270715c6bbe2551f64ce", - "sha256:25e66e5e2007c7a39541ca13b559cd8ebc2ad8fe00ea94a2aad28a9b1e44e5ae", - "sha256:26d7c90cb04dee1665282a5d1a998defc1a9e012fdca0f33396f81508f49696d", - "sha256:308b98b0c8cd1dfef1a0311dc5e38ae8f9b58349226aa0533f15a16717ad702f", - "sha256:3ce3d4f7c6b69c4e4f0704b32eca8123b9c58ae91af740481aa57d7857b5e41b", - "sha256:58cd9c469eced558cd81aa3f484b2924e8897049e06889e8ff2510435b7ef74b", - "sha256:5b10eb0e7f044cf0b035112446b26a3a2946bca9d7d7edb5e54a2ad2f6652abb", - "sha256:6faa19d3824c21bcbfdfce5171e193c8b4ddafdf0ac3f129ccf0cdfcb083e462", - "sha256:944494be42fa630134bf907714d40207e646fd5a94423c90d5b514f7b0713fea", - "sha256:a161de7e50224e8e3de6e184707476b5a989037dcb24292b391a3d66ff158e70", - "sha256:a4844ebb2be14768f7994f2017f70aca39d658a96c786211be5ddbe1c68794c1", - "sha256:c2b509ac3d4b988ae8769901c66345425e361d518aecbe4acbfc2567e416626a", - "sha256:c9959d49a77b0e07559e579f38b2f3711c2b8716b8410b320bf9713013215a1b", - "sha256:d8cdee92bc930d8b09d8bd2043cedd544d9c8bd7436a77678dd602467a993080", - "sha256:e15199cdb423316e15f108f51249e44eb156ae5dba232cb73be555324a1d49c2" - ], - "version": "==1.4.2" - } - }, - "develop": { - "jinja2": { - "hashes": [ - 
"sha256:89aab215427ef59c34ad58735269eb58b1a5808103067f7bb9d5836c651b3bb0", - "sha256:f0a4641d3cf955324a89c04f3d94663aa4d638abe8f733ecd3582848e1c37035" - ], - "index": "pypi", - "version": "==2.11.2" - }, - "markupsafe": { - "hashes": [ - "sha256:00bc623926325b26bb9605ae9eae8a215691f33cae5df11ca5424f06f2d1f473", - "sha256:09027a7803a62ca78792ad89403b1b7a73a01c8cb65909cd876f7fcebd79b161", - "sha256:09c4b7f37d6c648cb13f9230d847adf22f8171b1ccc4d5682398e77f40309235", - "sha256:1027c282dad077d0bae18be6794e6b6b8c91d58ed8a8d89a89d59693b9131db5", - "sha256:13d3144e1e340870b25e7b10b98d779608c02016d5184cfb9927a9f10c689f42", - "sha256:24982cc2533820871eba85ba648cd53d8623687ff11cbb805be4ff7b4c971aff", - "sha256:29872e92839765e546828bb7754a68c418d927cd064fd4708fab9fe9c8bb116b", - "sha256:43a55c2930bbc139570ac2452adf3d70cdbb3cfe5912c71cdce1c2c6bbd9c5d1", - "sha256:46c99d2de99945ec5cb54f23c8cd5689f6d7177305ebff350a58ce5f8de1669e", - "sha256:500d4957e52ddc3351cabf489e79c91c17f6e0899158447047588650b5e69183", - "sha256:535f6fc4d397c1563d08b88e485c3496cf5784e927af890fb3c3aac7f933ec66", - "sha256:596510de112c685489095da617b5bcbbac7dd6384aeebeda4df6025d0256a81b", - "sha256:62fe6c95e3ec8a7fad637b7f3d372c15ec1caa01ab47926cfdf7a75b40e0eac1", - "sha256:6788b695d50a51edb699cb55e35487e430fa21f1ed838122d722e0ff0ac5ba15", - "sha256:6dd73240d2af64df90aa7c4e7481e23825ea70af4b4922f8ede5b9e35f78a3b1", - "sha256:717ba8fe3ae9cc0006d7c451f0bb265ee07739daf76355d06366154ee68d221e", - "sha256:79855e1c5b8da654cf486b830bd42c06e8780cea587384cf6545b7d9ac013a0b", - "sha256:7c1699dfe0cf8ff607dbdcc1e9b9af1755371f92a68f706051cc8c37d447c905", - "sha256:88e5fcfb52ee7b911e8bb6d6aa2fd21fbecc674eadd44118a9cc3863f938e735", - "sha256:8defac2f2ccd6805ebf65f5eeb132adcf2ab57aa11fdf4c0dd5169a004710e7d", - "sha256:98c7086708b163d425c67c7a91bad6e466bb99d797aa64f965e9d25c12111a5e", - "sha256:9add70b36c5666a2ed02b43b335fe19002ee5235efd4b8a89bfcf9005bebac0d", - 
"sha256:9bf40443012702a1d2070043cb6291650a0841ece432556f784f004937f0f32c", - "sha256:ade5e387d2ad0d7ebf59146cc00c8044acbd863725f887353a10df825fc8ae21", - "sha256:b00c1de48212e4cc9603895652c5c410df699856a2853135b3967591e4beebc2", - "sha256:b1282f8c00509d99fef04d8ba936b156d419be841854fe901d8ae224c59f0be5", - "sha256:b2051432115498d3562c084a49bba65d97cf251f5a331c64a12ee7e04dacc51b", - "sha256:ba59edeaa2fc6114428f1637ffff42da1e311e29382d81b339c1817d37ec93c6", - "sha256:c8716a48d94b06bb3b2524c2b77e055fb313aeb4ea620c8dd03a105574ba704f", - "sha256:cd5df75523866410809ca100dc9681e301e3c27567cf498077e8551b6d20e42f", - "sha256:cdb132fc825c38e1aeec2c8aa9338310d29d337bebbd7baa06889d09a60a1fa2", - "sha256:e249096428b3ae81b08327a63a485ad0878de3fb939049038579ac0ef61e17e7", - "sha256:e8313f01ba26fbbe36c7be1966a7b7424942f670f38e666995b88d012765b9be" - ], - "version": "==1.1.1" - }, - "mypy": { - "hashes": [ - "sha256:0a0d102247c16ce93c97066443d11e2d36e6cc2a32d8ccc1f705268970479324", - "sha256:0d34d6b122597d48a36d6c59e35341f410d4abfa771d96d04ae2c468dd201abc", - "sha256:2170492030f6faa537647d29945786d297e4862765f0b4ac5930ff62e300d802", - "sha256:2842d4fbd1b12ab422346376aad03ff5d0805b706102e475e962370f874a5122", - "sha256:2b21ba45ad9ef2e2eb88ce4aeadd0112d0f5026418324176fd494a6824b74975", - "sha256:72060bf64f290fb629bd4a67c707a66fd88ca26e413a91384b18db3876e57ed7", - "sha256:af4e9ff1834e565f1baa74ccf7ae2564ae38c8df2a85b057af1dbbc958eb6666", - "sha256:bd03b3cf666bff8d710d633d1c56ab7facbdc204d567715cb3b9f85c6e94f669", - "sha256:c614194e01c85bb2e551c421397e49afb2872c88b5830e3554f0519f9fb1c178", - "sha256:cf4e7bf7f1214826cf7333627cb2547c0db7e3078723227820d0a2490f117a01", - "sha256:da56dedcd7cd502ccd3c5dddc656cb36113dd793ad466e894574125945653cea", - "sha256:e86bdace26c5fe9cf8cb735e7cedfe7850ad92b327ac5d797c656717d2ca66de", - "sha256:e97e9c13d67fbe524be17e4d8025d51a7dca38f90de2e462243ab8ed8a9178d1", - "sha256:eea260feb1830a627fb526d22fbb426b750d9f5a47b624e8d5e7e004359b219c" - ], - "index": 
"pypi", - "version": "==0.790" - }, - "mypy-extensions": { - "hashes": [ - "sha256:090fedd75945a69ae91ce1303b5824f428daf5a028d2f6ab8a299250a846f15d", - "sha256:2d82818f5bb3e369420cb3c4060a7970edba416647068eb4c5343488a6c604a8" - ], - "version": "==0.4.3" - }, - "typed-ast": { - "hashes": [ - "sha256:0666aa36131496aed8f7be0410ff974562ab7eeac11ef351def9ea6fa28f6355", - "sha256:0c2c07682d61a629b68433afb159376e24e5b2fd4641d35424e462169c0a7919", - "sha256:249862707802d40f7f29f6e1aad8d84b5aa9e44552d2cc17384b209f091276aa", - "sha256:24995c843eb0ad11a4527b026b4dde3da70e1f2d8806c99b7b4a7cf491612652", - "sha256:269151951236b0f9a6f04015a9004084a5ab0d5f19b57de779f908621e7d8b75", - "sha256:4083861b0aa07990b619bd7ddc365eb7fa4b817e99cf5f8d9cf21a42780f6e01", - "sha256:498b0f36cc7054c1fead3d7fc59d2150f4d5c6c56ba7fb150c013fbc683a8d2d", - "sha256:4e3e5da80ccbebfff202a67bf900d081906c358ccc3d5e3c8aea42fdfdfd51c1", - "sha256:6daac9731f172c2a22ade6ed0c00197ee7cc1221aa84cfdf9c31defeb059a907", - "sha256:715ff2f2df46121071622063fc7543d9b1fd19ebfc4f5c8895af64a77a8c852c", - "sha256:73d785a950fc82dd2a25897d525d003f6378d1cb23ab305578394694202a58c3", - "sha256:8c8aaad94455178e3187ab22c8b01a3837f8ee50e09cf31f1ba129eb293ec30b", - "sha256:8ce678dbaf790dbdb3eba24056d5364fb45944f33553dd5869b7580cdbb83614", - "sha256:aaee9905aee35ba5905cfb3c62f3e83b3bec7b39413f0a7f19be4e547ea01ebb", - "sha256:bcd3b13b56ea479b3650b82cabd6b5343a625b0ced5429e4ccad28a8973f301b", - "sha256:c9e348e02e4d2b4a8b2eedb48210430658df6951fa484e59de33ff773fbd4b41", - "sha256:d205b1b46085271b4e15f670058ce182bd1199e56b317bf2ec004b6a44f911f6", - "sha256:d43943ef777f9a1c42bf4e552ba23ac77a6351de620aa9acf64ad54933ad4d34", - "sha256:d5d33e9e7af3b34a40dc05f498939f0ebf187f07c385fd58d591c533ad8562fe", - "sha256:fc0fea399acb12edbf8a628ba8d2312f583bdbdb3335635db062fa98cf71fca4", - "sha256:fe460b922ec15dd205595c9b5b99e2f056fd98ae8f9f56b888e7a17dc2b757e7" - ], - "version": "==1.4.1" - }, - "typing-extensions": { - "hashes": [ - 
"sha256:6e95524d8a547a91e08f404ae485bbb71962de46967e1b71a0cb89af24e761c5", - "sha256:79ee589a3caca649a9bfd2a8de4709837400dfa00b6cc81962a1e6a1815969ae", - "sha256:f8d2bd89d25bc39dabe7d23df520442fa1d8969b82544370e03d88b5a591c392" - ], - "version": "==3.7.4.2" - } - } -} diff --git a/launcher.py b/launcher.py index 3442bb3..def6f28 100644 --- a/launcher.py +++ b/launcher.py @@ -7,6 +7,9 @@ from smart.log import log from smart.pipline import Piplines from smart.runer import CrawStater +from smart.setting import gloable_setting_dict +from smart.signal import reminder +from smart.spider import Spider from spiders.db.sanicdb import SanicDB from spiders.govs import GovsSpider, ArticelItem from spiders.image_spider import ImageSpider @@ -20,12 +23,19 @@ @piplinestest.pipline(1) async def do_pip(spider_ins, item): + print(f"我是item1111111 {item.results}") return item @piplinestest.pipline(2) def pip2(spider_ins, item): - print(f"我是item2 {item.results}") + print(f"我是item2222222 {item.results}") + return item + + +@piplinestest.pipline(3) +async def pip3(spider_ins, item): + print(f"我是item33333 {item.results}") return item @@ -57,10 +67,81 @@ def start1(): starter = CrawStater() starter.run_single(IpSpider(), middlewire=middleware2, pipline=piplinestest) +# +# @reminder.spider_start.connect +# def test1(sender, **kwargs): +# print("spider_start1") +# return 1222222 +# +# +# @reminder.spider_start.connect +# def test221(sender, **kwargs): +# print("spider_start2") +# return 33333333 +# +# +# @reminder.spider_execption.connect +# def test2(sender, **kwargs): +# print("spider_execption") +# +# +# @reminder.spider_close.connect +# def tes3t(sender, **kwargs): +# print("spider_close") +# +# +# @reminder.engin_start.connect +# def test4(sender, **kwargs): +# print("engin_start") +# +# +# @reminder.engin_idle.connect +# def test5(sender, **kwargs): +# print("engin_idle") +# +# +# @reminder.engin_close.connect +# def test6(sender, **kwargs): +# print("engin_close") +# +# +# 
@reminder.request_dropped.connect +# def test7(sender, **kwargs): +# print("spider_start") +# +# +# @reminder.request_scheduled.connect +# def test8(sender, **kwargs): +# print("request_scheduled") +# +# +# @reminder.response_received.connect +# def test9(sender, **kwargs): +# print("response_received") +# +# +# @reminder.response_downloaded.connect +# def test10(sender, **kwargs): +# print("response_downloaded") +# +# +# @reminder.item_dropped.connect +# def test11(sender, **kwargs): +# print("spider_start") + if __name__ == '__main__': starter = CrawStater() spider1 = GovsSpider() spider2 = JsonSpider() js_spider = JsSpider() - starter.run_many([spider1], middlewire=middleware2, pipline=piplinestest) + gloable_setting_dict.update( + duplicate_filter_class="spiders.distributed.RedisBaseDuplicateFilter", + scheduler_container_class="spiders.distributed.RedisSchuler", + pipline_is_paralleled=1 + ) + + spider = IpSpider() + # starter.run_many([spider], middlewire=middleware2, pipline=piplinestest) + starter.run_many([spider]) + diff --git a/smart/core.py b/smart/core.py index b9ea197..d066836 100644 --- a/smart/core.py +++ b/smart/core.py @@ -8,6 +8,7 @@ import asyncio import importlib import inspect +import time import uuid from asyncio import Lock from collections import deque @@ -105,7 +106,6 @@ async def start(self): if self.lock and self.lock.locked(): await asyncio.sleep(1) continue - request_or_item = next(self.iter_request()) if isinstance(request_or_item, Request): self.scheduler.schedlue(request_or_item) @@ -191,12 +191,27 @@ def _check_can_stop(self, request): return False if len(self.task_dict) > 0: return False - if len(self.request_generator_queue) > 0: + if len(self.pip_task_dict) > 0: + return False + if len(self.request_generator_queue) > 0 and self.scheduler.scheduler_container.size() > 0: return False if self.downloader.response_queue.qsize() > 0: return False - if len(self.pip_task_dict) > 0: + if self.scheduler.scheduler_container.size() > 0: 
return False + start = time.time() + while 1: + end = time.time() + if (end - start) > 1.0: + print("空转 超过10s 停止") + break + if self.scheduler.scheduler_container.size() <= 0: + time.sleep(0.05) + else: + return False + + pass + return True def _hand_piplines(self, spider_ins, item, index=0): diff --git a/smart/downloader.py b/smart/downloader.py index 640dd1e..ccfd1e7 100644 --- a/smart/downloader.py +++ b/smart/downloader.py @@ -19,6 +19,7 @@ from smart.response import Response from smart.scheduler import Scheduler from smart.setting import gloable_setting_dict +from smart.signal import Reminder from .request import Request @@ -32,6 +33,7 @@ def fetch(self, request: Request) -> Response: class AioHttpDown(BaseDown): async def fetch(self, request: Request) -> Response: + print('run') session = None resp = None try: @@ -64,8 +66,10 @@ async def fetch(self, request: Request) -> Response: class Downloader: - def __init__(self, scheduler: Scheduler, middwire: Middleware = None, seq=100, downer: BaseDown = AioHttpDown()): + def __init__(self, scheduler: Scheduler, middwire: Middleware = None, reminder=None, seq=100, + downer: BaseDown = AioHttpDown()): self.log = log + self.reminder = reminder self.scheduler = scheduler self.middwire = middwire self.response_queue: asyncio.Queue = Queue() @@ -92,9 +96,9 @@ async def download(self, request: Request): ignore_response_codes = spider.cutome_setting_dict.get("ignore_response_codes") or gloable_setting_dict.get( "ignore_response_codes") req_delay = spider.cutome_setting_dict.get("req_delay") or gloable_setting_dict.get("req_delay") - if request and request.retry >= max_retry: # reached max retry times + self.reminder.go(Reminder.request_dropped, request, scheduler=self.scheduler) self.log.error(f'reached max retry times... 
{request}') return request.retry = request.retry + 1 @@ -141,7 +145,7 @@ async def download(self, request: Request): 'that is a no-null response, and response must be a ' 'smart.Response instance or sub Response instance. ') return - + # self.reminder.go(Reminder.response_downloaded, response) if response.status not in ignore_response_codes: await self._after_fetch(request, response) @@ -149,10 +153,15 @@ async def download(self, request: Request): response.request = request response.__spider__ = spider await self.response_queue.put(response) + return response def get(self) -> Optional[Response]: with suppress(QueueEmpty): - return self.response_queue.get_nowait() + response = self.response_queue.get_nowait() + if response: + # self.reminder.go(Reminder.response_received, response) + pass + return response async def _before_fetch(self, request): if self.middwire and len(self.middwire.request_middleware) > 0: diff --git a/smart/item.py b/smart/item.py index 1bce3b3..ddbb9e2 100644 --- a/smart/item.py +++ b/smart/item.py @@ -12,7 +12,6 @@ from typing import Any, Union from lxml import etree -from ruia.exceptions import InvalidFuncType from smart.field import BaseField, RegexField, FuncField diff --git a/smart/response.py b/smart/response.py index 1d9ee55..3a7de68 100644 --- a/smart/response.py +++ b/smart/response.py @@ -88,7 +88,7 @@ def content(self) -> bytes: def content_type(self) -> Optional[str]: if self.headers: for key in self.headers.keys(): - if "content_type" == key.lower(): + if "content_type" == key.lower() or "content-type" == key.lower(): return self.headers.get(key) return None diff --git a/smart/runer.py b/smart/runer.py index b024d43..4b81ce9 100644 --- a/smart/runer.py +++ b/smart/runer.py @@ -16,7 +16,7 @@ from urllib.request import urlopen from smart.log import log -from smart.core import Engine +from smart.core2 import Engine from smart.middlewire import Middleware from smart.pipline import Piplines from smart.setting import gloable_setting_dict 
diff --git a/smart/scheduler.py b/smart/scheduler.py index a52353a..e86be80 100644 --- a/smart/scheduler.py +++ b/smart/scheduler.py @@ -24,6 +24,10 @@ def push(self, request: Request): def pop(self) -> Optional[Request]: pass + @abstractmethod + def size(self) -> int: + pass + class BaseDuplicateFilter(ABC): @@ -47,12 +51,12 @@ def __init__(self): def add(self, url): if url: - self.set_container.add(hash(url)) + self.set_container.add(url) def contains(self, url): if not url: return False - if hash(url) in self.set_container: + if url in self.set_container: return True return False @@ -61,6 +65,7 @@ def length(self): class DequeSchedulerContainer(BaseSchedulerContainer): + def __init__(self): self.url_queue = deque() @@ -72,6 +77,9 @@ def pop(self) -> Optional[Request]: return self.url_queue.popleft() return None + def size(self) -> int: + return len(self.url_queue) + class Scheduler: def __init__(self, duplicate_filter: BaseDuplicateFilter = None, diff --git a/smart/setting.py b/smart/setting.py index 7eb2920..37b06c2 100644 --- a/smart/setting.py +++ b/smart/setting.py @@ -36,6 +36,10 @@ "thread_pool_max_size": 50, # 根据响应的状态码 忽略以下响应 "ignore_response_codes": [401, 403, 404, 405, 500, 502, 504], + # 是否是分布式爬虫 + "is_single": 1, + # pipline之间 处理item 是否并行处理 默认 0 串行 1 并行 + "pipline_is_paralleled": 0, # 启动时网络是否畅通检查地址 "net_healthy_check_url": "https://www.baidu.com", # log level diff --git a/smart/signal.py b/smart/signal.py new file mode 100644 index 0000000..9ad9a4b --- /dev/null +++ b/smart/signal.py @@ -0,0 +1,39 @@ +# -*- coding utf-8 -*-# +# ------------------------------------------------------------------ +# Name: signal +# Author: liangbaikai +# Date: 2021/1/15 +# Desc: there is a python file description +# ------------------------------------------------------------------ +from blinker import Signal + + +class _Reminder: + spider_start = Signal("spider_start") + spider_execption = Signal("spider_execption") + spider_close = Signal("spider_close") + engin_start 
= Signal("engin_start") + engin_idle = Signal("engin_idle") + engin_close = Signal("engin_close") + request_dropped = Signal("request_dropped") + request_scheduled = Signal("request_scheduled") + response_received = Signal("response_received") + response_downloaded = Signal("response_downloaded") + item_dropped = Signal("item_dropped") + + def __init__(self, *args, **kwargs): + pass + + def go(self, signal: Signal, *args, **kwargs): + if signal is None: + raise ValueError("signal can not be null") + has_receivers = bool(signal.receivers) + if has_receivers: + try: + signal.send(*args, **kwargs) + except Exception as e: + pass + + +Reminder = _Reminder +reminder = Reminder() diff --git a/smart/tool.py b/smart/tool.py index 3002fae..d37e6ba 100644 --- a/smart/tool.py +++ b/smart/tool.py @@ -1,3 +1,4 @@ +import hashlib import re import socket import urllib @@ -48,3 +49,35 @@ def get_localhost_ip(): s.close() return ip + + +def get_md5(*args): + """ + @summary: 获取唯一的32位md5 + --------- + @param *args: 参与联合去重的值 + --------- + @result: 7c8684bcbdfcea6697650aa53d7b1405 + """ + + m = hashlib.md5() + for arg in args: + m.update(str(arg).encode()) + + return m.hexdigest() + + +# mutations +def mutations_bkdr_hash(value: str): + if value is None: + value = '' + if not isinstance(value, str): + value = str(value) + if len(value) >= 10000: + value = get_md5(value) + + seed = 131 + h = 0 + for v in value: + h = seed * h + ord(v) + return h & 0x7FFFFFFF diff --git a/spiders/distributed/__init__.py b/spiders/distributed/__init__.py new file mode 100644 index 0000000..6475c81 --- /dev/null +++ b/spiders/distributed/__init__.py @@ -0,0 +1,71 @@ +# -*- coding utf-8 -*-# +# ------------------------------------------------------------------ +# Name: __init__.py +# Author: liangbaikai +# Date: 2021/1/14 +# Desc: there is a python file description +# ------------------------------------------------------------------ +import base64 +import hashlib +import json +import pickle +import random 
+import threading +import time +from collections import deque +from typing import Optional +from smart.request import Request +from smart.scheduler import BaseDuplicateFilter, BaseSchedulerContainer +import redis # 导入redis 模块 + + +class RedisSchuler(BaseSchedulerContainer): + pool = redis.ConnectionPool(host='localhost', port=6379, decode_responses=True) + + def __init__(self): + self.redis = redis.Redis(connection_pool=self.pool) + self.task_queue_name = "smart_spider_redis_task_queue" + # 需要保持session 的放在本地 或者序列化报错的request 的容器 + self.faults = deque() + self.ecodeing = "latin1" + + def push(self, request: Request): + try: + req_byte = pickle.dumps(request) + self.redis.rpush(self.task_queue_name, req_byte.decode(self.ecodeing)) + except Exception: + self.faults.append(request) + + def pop(self) -> Optional[Request]: + if len(self.faults) > 0: + req = self.faults.popleft() + if req: + return req + else: + code = self.redis.lpop(self.task_queue_name) + if code: + req_byte = code.encode(self.ecodeing) + req = pickle.loads(req_byte) + return req + return None + + def size(self) -> int: + return self.redis.llen(self.task_queue_name) + + +class RedisBaseDuplicateFilter(BaseDuplicateFilter): + pool = redis.ConnectionPool(host='localhost', port=6379, decode_responses=True) + + def __init__(self): + self.redis = redis.Redis(connection_pool=self.pool) + self.filterset_name = "smart_spider_redis_repeat_set" + + def add(self, url): + if url: + self.redis.sadd(self.filterset_name, url) + + def contains(self, url): + return self.redis.sismember(self.filterset_name, url) + + def length(self): + return self.redis.scard(self.filterset_name) diff --git a/spiders/govs.py b/spiders/govs.py index 06c06c5..ece66c4 100644 --- a/spiders/govs.py +++ b/spiders/govs.py @@ -29,7 +29,12 @@ class GovsSpider(Spider): start_urls = [ "http://www.nea.gov.cn/policy/jd.htm" ] - + cutome_setting_dict = {**Spider.cutome_setting_dict, + # **{ + # "req_delay": 3, + # "req_per_concurrent": 3, + # } + } 
def parse(self, response: Response): selects_detail_urls = response.xpath( diff --git a/spiders/ipspider2.py b/spiders/ipspider2.py index 0aaddee..09aee62 100644 --- a/spiders/ipspider2.py +++ b/spiders/ipspider2.py @@ -1,9 +1,13 @@ +import asyncio import json import threading +from aiohttp import ClientSession + from smart.item import Item from smart.response import Response from smart.request import Request +from smart.signal import reminder from smart.spider import Spider @@ -15,18 +19,26 @@ class TestItem(Item): class IpSpider(Spider): name = 'ipspider2' start_urls = [] - cutome_setting_dict = {**Spider.cutome_setting_dict, **{"req_per_concurrent": 100}} + cutome_setting_dict = {**Spider.cutome_setting_dict, + # **{ + # "duplicate_filter_class": "spiders.distributed.RedisBaseDuplicateFilter", + # "scheduler_container_class": "spiders.distributed.RedisSchuler", + # "is_single": 0, + # } + } def start_requests(self): - for page in range(100): + for page in range(1010): url = f'http://exercise.kingname.info/exercise_middleware_ip/{page}' - # url = f'http://exercise.kingname.info/exercise_middleware_ip/{page}' - # url = 'http://fzggw.zj.gov.cn/art/2020/8/26/art_1621004_55344873.html' - url = 'https://s.bdstatic.com/common/openjs/amd/eslx.js' - yield Request(url, callback=self.parse, dont_filter=True, timeout=3) + yield Request(url, callback=self.parse, dont_filter=False, timeout=9) def parse(self, response: Response): - pass + print(response.status) + # item = TestItem.get_item("") + # yield item + + # for i in range(1000): + # yield Request(url="https://www.baidu.com?q=" + str(i), callback=self.parse2) # yield TestItem(response.text) # for page in range(10): # print(page) diff --git a/spiders/json_spider.py b/spiders/json_spider.py index caf43d1..76ad8a7 100644 --- a/spiders/json_spider.py +++ b/spiders/json_spider.py @@ -10,6 +10,7 @@ from smart.field import JsonPathField, RegexField from smart.item import Item +from smart.request import Request from 
smart.response import Response from smart.spider import Spider @@ -33,4 +34,7 @@ class JsonSpider(Spider): ] def parse(self, response: Response): + for i in range(300): + yield Request(url=response.url,dont_filter=True) yield from BidItem.get_items(response.text) + diff --git a/test/__init__.py b/test/__init__.py index 225220e..d86b0d6 100644 --- a/test/__init__.py +++ b/test/__init__.py @@ -15,7 +15,7 @@ class ReqInte: @staticmethod @middleware2.request(-1) def print_on_request1(spider_ins, request): - print(f"ReqInteReqInteReqInteReqInt{spider_ins.name} e#{request}######################") + print(f"###############{spider_ins.name} e#{request}######################") @middleware2.request(1) diff --git a/test/tool_full.py b/test/tool_full.py new file mode 100644 index 0000000..71b3274 --- /dev/null +++ b/test/tool_full.py @@ -0,0 +1,2152 @@ +# -*- coding: utf-8 -*- +""" +Created on 2018-09-06 14:21 +--------- +@summary: 工具 +--------- +@author: Boris +@email: boris@bzkj.tech +""" +import calendar +import codecs +import configparser # 读配置文件的 +import datetime +import functools +import hashlib +import html +import json +import os +import pickle +import random +import re +import socket +import ssl +import string +import sys +import time +import traceback +import urllib +import urllib.parse +import uuid +from hashlib import md5 +from pprint import pformat +from pprint import pprint +from urllib import request +from urllib.parse import urljoin + +# import execjs # pip install PyExecJS +# import redis +import requests +import six +from requests.cookies import RequestsCookieJar +from w3lib.url import canonicalize_url as sort_url + +# import spider.setting as setting +from smart.log import log + +os.environ["EXECJS_RUNTIME"] = "Node" # 设置使用node执行js + +# 全局取消ssl证书验证 +ssl._create_default_https_context = ssl._create_unverified_context + +TIME_OUT = 30 +TIMER_TIME = 5 + +redisdb = None + +CAMELCASE_INVALID_CHARS = re.compile(r'[^a-zA-Z\d]') + + +# def get_redisdb(): +# global 
redisdb +# if not redisdb: +# ip, port = setting.REDISDB_IP_PORTS.split(":") +# redisdb = redis.Redis( +# host=ip, +# port=port, +# db=setting.REDISDB_DB, +# password=setting.REDISDB_USER_PASS, +# decode_responses=True, +# ) # redis默认端口是6379 +# return redisdb + + +# 装饰器 +def log_function_time(func): + try: + + @functools.wraps(func) # 将函数的原来属性付给新函数 + def calculate_time(*args, **kw): + began_time = time.time() + callfunc = func(*args, **kw) + end_time = time.time() + log.debug(func.__name__ + " run time = " + str(end_time - began_time)) + return callfunc + + return calculate_time + except: + log.debug("求取时间无效 因为函数参数不符") + return func + + +def run_safe_model(module_name): + def inner_run_safe_model(func): + try: + + @functools.wraps(func) # 将函数的原来属性付给新函数 + def run_func(*args, **kw): + callfunc = None + try: + callfunc = func(*args, **kw) + except Exception as e: + log.error(module_name + ": " + func.__name__ + " - " + str(e)) + traceback.print_exc() + return callfunc + + return run_func + except Exception as e: + log.error(module_name + ": " + func.__name__ + " - " + str(e)) + traceback.print_exc() + return func + + return inner_run_safe_model + + +########################【网页解析相关】############################### + + +# @log_function_time +def get_html_by_requests( + url, headers=None, code="utf-8", data=None, proxies={}, with_response=False +): + html = "" + r = None + try: + if data: + r = requests.post( + url, headers=headers, timeout=TIME_OUT, data=data, proxies=proxies + ) + else: + r = requests.get(url, headers=headers, timeout=TIME_OUT, proxies=proxies) + + if code: + r.encoding = code + html = r.text + + except Exception as e: + log.error(e) + finally: + r and r.close() + + if with_response: + return html, r + else: + return html + + +def get_json_by_requests( + url, + params=None, + headers=None, + data=None, + proxies={}, + with_response=False, + cookies=None, +): + json = {} + response = None + try: + # response = requests.get(url, params = params) + if 
data: + response = requests.post( + url, + headers=headers, + data=data, + params=params, + timeout=TIME_OUT, + proxies=proxies, + cookies=cookies, + ) + else: + response = requests.get( + url, + headers=headers, + params=params, + timeout=TIME_OUT, + proxies=proxies, + cookies=cookies, + ) + response.encoding = "utf-8" + json = response.json() + except Exception as e: + log.error(e) + finally: + response and response.close() + + if with_response: + return json, response + else: + return json + + +def get_cookies(response): + cookies = requests.utils.dict_from_cookiejar(response.cookies) + return cookies + + +def get_cookies_jar(cookies): + """ + @summary: 适用于selenium生成的cookies转requests的cookies + requests.get(xxx, cookies=jar) + 参考:https://www.cnblogs.com/small-bud/p/9064674.html + + --------- + @param cookies: [{},{}] + --------- + @result: cookie jar + """ + + cookie_jar = RequestsCookieJar() + for cookie in cookies: + cookie_jar.set(cookie["name"], cookie["value"]) + + return cookie_jar + + +def get_cookies_from_selenium_cookie(cookies): + """ + @summary: 适用于selenium生成的cookies转requests的cookies + requests.get(xxx, cookies=jar) + 参考:https://www.cnblogs.com/small-bud/p/9064674.html + + --------- + @param cookies: [{},{}] + --------- + @result: cookie jar + """ + + cookie_dict = {} + for cookie in cookies: + if cookie.get("name"): + cookie_dict[cookie["name"]] = cookie["value"] + + return cookie_dict + + +def cookiesjar2str(cookies): + str_cookie = "" + for k, v in requests.utils.dict_from_cookiejar(cookies).items(): + str_cookie += k + str_cookie += "=" + str_cookie += v + str_cookie += "; " + return str_cookie + + +def cookies2str(cookies): + str_cookie = "" + for k, v in cookies.items(): + str_cookie += k + str_cookie += "=" + str_cookie += v + str_cookie += "; " + return str_cookie + + +def get_urls( + html, + stop_urls=( + "javascript", + "+", + ".css", + ".js", + ".rar", + ".xls", + ".exe", + ".apk", + ".doc", + ".jpg", + ".png", + ".flv", + ".mp4", + ), +): + 
# 不匹配javascript、 +、 # 这样的url + regex = r'>> string_camelcase('lost-pound') + 'LostPound' + + >>> string_camelcase('missing_images') + 'MissingImages' + + """ + return CAMELCASE_INVALID_CHARS.sub('', string.title()) + + +def get_full_url(root_url, sub_url): + """ + @summary: 得到完整的ur + --------- + @param root_url: 根url (网页的url) + @param sub_url: 子url (带有相对路径的 可以拼接成完整的) + --------- + @result: 返回完整的url + """ + + return urljoin(root_url, sub_url) + + +def joint_url(url, params): + # param_str = "?" + # for key, value in params.items(): + # value = isinstance(value, str) and value or str(value) + # param_str += key + "=" + value + "&" + # + # return url + param_str[:-1] + + if not params: + return url + + params = urlencode(params) + separator = "?" if "?" not in url else "&" + return url + separator + params + + +def canonicalize_url(url): + """ + url 归一化 会参数排序 及去掉锚点 + """ + return sort_url(url) + + +def get_url_md5(url): + url = canonicalize_url(url) + url = re.sub("^http://", "https://", url) + return get_md5(url) + + +def fit_url(urls, identis): + identis = isinstance(identis, str) and [identis] or identis + fit_urls = [] + for link in urls: + for identi in identis: + if identi in link: + fit_urls.append(link) + return list(set(fit_urls)) + + +def get_param(url, key): + params = url.split("?")[-1].split("&") + for param in params: + key_value = param.split("=", 1) + if key == key_value[0]: + return key_value[1] + return None + + +def urlencode(params): + """ + 字典类型的参数转为字符串 + @param params: + { + 'a': 1, + 'b': 2 + } + @return: a=1&b=2 + """ + return urllib.parse.urlencode(params) + + +def urldecode(url): + """ + 将字符串类型的参数转为json + @param url: xxx?a=1&b=2 + @return: + { + 'a': 1, + 'b': 2 + } + """ + params_json = {} + params = url.split("?")[-1].split("&") + for param in params: + key, value = param.split("=") + params_json[key] = unquote_url(value) + + return params_json + + +def unquote_url(url, encoding="utf-8"): + """ + @summary: 将url解码 + --------- + @param url: + 
--------- + @result: + """ + + return urllib.parse.unquote(url, encoding=encoding) + + +def quote_url(url, encoding="utf-8"): + """ + @summary: 将url编码 编码意思http://www.w3school.com.cn/tags/html_ref_urlencode.html + --------- + @param url: + --------- + @result: + """ + + return urllib.parse.quote(url, safe="%;/?:@&=+$,", encoding=encoding) + + +def quote_chinese_word(text, encoding="utf-8"): + def quote_chinese_word_func(text): + chinese_word = text.group(0) + return urllib.parse.quote(chinese_word, encoding=encoding) + + return re.sub("([\u4e00-\u9fa5]+)", quote_chinese_word_func, text, flags=re.S) + + +def unescape(str): + """ + 反转译 + """ + return html.unescape(str) + + +def excape(str): + """ + 转译 + """ + return html.escape(str) + + +_regexs = {} + + +# @log_function_time +def get_info(html, regexs, allow_repeat=True, fetch_one=False, split=None): + regexs = isinstance(regexs, str) and [regexs] or regexs + + infos = [] + for regex in regexs: + if regex == "": + continue + + if regex not in _regexs.keys(): + _regexs[regex] = re.compile(regex, re.S) + + if fetch_one: + infos = _regexs[regex].search(html) + if infos: + infos = infos.groups() + else: + continue + else: + infos = _regexs[regex].findall(str(html)) + + if len(infos) > 0: + # print(regex) + break + + if fetch_one: + infos = infos if infos else ("",) + return infos if len(infos) > 1 else infos[0] + else: + infos = allow_repeat and infos or sorted(set(infos), key=infos.index) + infos = split.join(infos) if split else infos + return infos + + +def table_json(table, save_one_blank=True): + """ + 将表格转为json 适应于 key:value 在一行类的表格 + @param table: 使用selector封装后的具有xpath的selector + @param save_one_blank: 保留一个空白符 + @return: + """ + data = {} + + trs = table.xpath(".//tr") + for tr in trs: + tds = tr.xpath("./td|./th") + + for i in range(0, len(tds), 2): + if i + 1 > len(tds) - 1: + break + + key = tds[i].xpath("string(.)").extract_first(default="").strip() + value = tds[i + 
1].xpath("string(.)").extract_first(default="").strip() + value = replace_str(value, "[\f\n\r\t\v]", "") + value = replace_str(value, " +", " " if save_one_blank else "") + + if key: + data[key] = value + + return data + + +def get_table_row_data(table): + """ + 获取表格里每一行数据 + @param table: 使用selector封装后的具有xpath的selector + @return: [[],[]..] + """ + + datas = [] + rows = table.xpath(".//tr") + for row in rows: + cols = row.xpath("./td|./th") + row_datas = [] + for col in cols: + data = col.xpath("string(.)").extract_first(default="").strip() + row_datas.append(data) + datas.append(row_datas) + + return datas + + +def rows2json(rows, keys=None): + """ + 将行数据转为json + @param rows: 每一行的数据 + @param keys: json的key,空时将rows的第一行作为key + @return: + """ + data_start_pos = 0 if keys else 1 + datas = [] + keys = keys or rows[0] + for values in rows[data_start_pos:]: + datas.append(dict(zip(keys, values))) + + return datas + + +def get_form_data(form): + """ + 提取form中提交的数据 + :param form: 使用selector封装后的具有xpath的selector + :return: + """ + data = {} + inputs = form.xpath(".//input") + for input in inputs: + name = input.xpath("./@name").extract_first() + value = input.xpath("./@value").extract_first() + if name: + data[name] = value + + return data + + +# mac上不好使 +# def get_domain(url): +# domain = '' +# try: +# domain = get_tld(url) +# except Exception as e: +# log.debug(e) +# return domain + + +def get_domain(url): + proto, rest = urllib.parse.splittype(url) + domain, rest = urllib.parse.splithost(rest) + return domain + + +def get_index_url(url): + return "/".join(url.split("/")[:3]) + + +def get_ip(domain): + ip = socket.getaddrinfo(domain, "http")[0][4][0] + return ip + + +def get_localhost_ip(): + """ + 利用 UDP 协议来实现的,生成一个UDP包,把自己的 IP 放如到 UDP 协议头中,然后从UDP包中获取本机的IP。 + 这个方法并不会真实的向外部发包,所以用抓包工具是看不到的 + :return: + """ + s = None + try: + s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) + s.connect(("8.8.8.8", 80)) + ip = s.getsockname()[0] + finally: + if s: + s.close() + + return 
ip + + +def ip_to_num(ip): + import struct + + ip_num = socket.ntohl(struct.unpack("I", socket.inet_aton(str(ip)))[0]) + return ip_num + + +def is_valid_proxy(proxy, check_url=None): + """ + 检验代理是否有效 + @param proxy: xxx.xxx.xxx:xxx + @param check_url: 利用目标网站检查,目标网站url。默认为None, 使用代理服务器的socket检查, 但不能排除Connection closed by foreign host + @return: True / False + """ + is_valid = False + + if check_url: + proxies = {"http": f"http://{proxy}", "https": f"https://{proxy}"} + headers = { + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36" + } + response = None + try: + response = requests.get( + check_url, headers=headers, proxies=proxies, stream=True, timeout=20 + ) + is_valid = True + + except Exception as e: + log.error("check proxy failed: {} {}".format(e, proxy)) + + finally: + if response: + response.close() + + else: + ip, port = proxy.split(":") + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sk: + sk.settimeout(7) + try: + sk.connect((ip, int(port))) # 检查代理服务器是否开着 + is_valid = True + + except Exception as e: + log.error("check proxy failed: {} {}:{}".format(e, ip, port)) + + return is_valid + + +def is_valid_url(url): + """ + 验证url是否合法 + :param url: + :return: + """ + if re.match(r"(^https?:/{2}\w.+$)|(ftp://)", url): + return True + else: + return False + + +def get_text(soup, *args): + try: + return soup.get_text() + except Exception as e: + log.error(e) + return "" + + +def del_html_tag(content, except_line_break=False, save_img=False, white_replaced=""): + """ + 删除html标签 + @param content: html内容 + @param except_line_break: 保留p标签 + @param save_img: 保留图片 + @param white_replaced: 空白符替换 + @return: + """ + content = replace_str(content, "(?i)") # (?)忽略大小写 + content = replace_str(content, "(?i)") + content = replace_str(content, "") + content = replace_str( + content, "(?!&[a-z]+=)&[a-z]+;?" 
+ ) # 干掉 等无用的字符 但&xxx= 这种表示参数的除外 + if except_line_break: + content = content.replace("
</p>
", "/p") + content = replace_str(content, "<[^p].*?>") + content = content.replace("/p", "
</p>
") + content = replace_str(content, "[ \f\r\t\v]") + + elif save_img: + content = replace_str(content, "(?!)<.+?>") # 替换掉除图片外的其他标签 + content = replace_str(content, "(?! +)\s+", "\n") # 保留空格 + content = content.strip() + + else: + content = replace_str(content, "<(.|\n)*?>") + content = replace_str(content, "\s", white_replaced) + content = content.strip() + + return content + + +def del_html_js_css(content): + content = replace_str(content, "(?i)") # (?)忽略大小写 + content = replace_str(content, "(?i)") + content = replace_str(content, "") + + return content + + +def is_have_chinese(content): + regex = "[\u4e00-\u9fa5]+" + chinese_word = get_info(content, regex) + return chinese_word and True or False + + +def is_have_english(content): + regex = "[a-zA-Z]+" + english_words = get_info(content, regex) + return english_words and True or False + + +def get_chinese_word(content): + regex = "[\u4e00-\u9fa5]+" + chinese_word = get_info(content, regex) + return chinese_word + + +def get_english_words(content): + regex = "[a-zA-Z]+" + english_words = get_info(content, regex) + return english_words or "" + + +################################################## +def get_json(json_str): + """ + @summary: 取json对象 + --------- + @param json_str: json格式的字符串 + --------- + @result: 返回json对象 + """ + + try: + return json.loads(json_str) if json_str else {} + except Exception as e1: + try: + json_str = json_str.strip() + json_str = json_str.replace("'", '"') + keys = get_info(json_str, "(\w+):") + for key in keys: + json_str = json_str.replace(key, '"%s"' % key) + + return json.loads(json_str) if json_str else {} + + except Exception as e2: + log.error( + """ + e1: %s + format json_str: %s + e2: %s + """ + % (e1, json_str, e2) + ) + + return {} + + +def jsonp2json(jsonp): + """ + 将jsonp转为json + @param jsonp: jQuery172013600082560040794_1553230569815({}) + @return: + """ + try: + return json.loads(re.match(".*?({.*}).*", jsonp, re.S).group(1)) + except: + raise ValueError("Invalid Input") + 
+ +def dumps_json(json_, indent=4, sort_keys=False): + """ + @summary: 格式化json 用于打印 + --------- + @param json_: json格式的字符串或json对象 + --------- + @result: 格式化后的字符串 + """ + try: + if isinstance(json_, str): + json_ = get_json(json_) + + json_ = json.dumps( + json_, ensure_ascii=False, indent=indent, skipkeys=True, sort_keys=sort_keys + ) + + except Exception as e: + log.error(e) + json_ = pformat(json_) + + return json_ + + +def get_json_value(json_object, key): + """ + @summary: + --------- + @param json_object: json对象或json格式的字符串 + @param key: 建值 如果在多个层级目录下 可写 key1.key2 如{'key1':{'key2':3}} + --------- + @result: 返回对应的值,如果没有,返回'' + """ + current_key = "" + value = "" + try: + json_object = ( + isinstance(json_object, str) and get_json(json_object) or json_object + ) + + current_key = key.split(".")[0] + value = json_object[current_key] + + key = key[key.find(".") + 1:] + except Exception as e: + return value + + if key == current_key: + return value + else: + return get_json_value(value, key) + + +def get_all_keys(datas, depth=None, current_depth=0): + """ + @summary: 获取json李所有的key + --------- + @param datas: dict / list + @param depth: 字典key的层级 默认不限制层级 层级从1开始 + @param current_depth: 字典key的当前层级 不用传参 + --------- + @result: 返回json所有的key + """ + + keys = [] + if depth and current_depth >= depth: + return keys + + if isinstance(datas, list): + for data in datas: + keys.extend(get_all_keys(data, depth, current_depth=current_depth + 1)) + elif isinstance(datas, dict): + for key, value in datas.items(): + keys.append(key) + if isinstance(value, dict): + keys.extend(get_all_keys(value, depth, current_depth=current_depth + 1)) + + return keys + + +def to_chinese(unicode_str): + format_str = json.loads('{"chinese":"%s"}' % unicode_str) + return format_str["chinese"] + + +################################################## +def replace_str(source_str, regex, replace_str=""): + """ + @summary: 替换字符串 + --------- + @param source_str: 原字符串 + @param regex: 正则 + @param replace_str: 
用什么来替换 默认为'' + --------- + @result: 返回替换后的字符串 + """ + str_info = re.compile(regex) + return str_info.sub(replace_str, source_str) + + +def del_redundant_blank_character(text): + """ + 删除冗余的空白符, 只保留一个 + :param text: + :return: + """ + return re.sub("\s+", " ", text) + + +################################################## +def get_conf_value(config_file, section, key): + cp = configparser.ConfigParser(allow_no_value=True) + with codecs.open(config_file, "r", encoding="utf-8") as f: + cp.read_file(f) + return cp.get(section, key) + + +def mkdir(path): + try: + os.makedirs(path) + except OSError as exc: # Python >2.5 + pass + + +def write_file(filename, content, mode="w", encoding="utf-8"): + """ + @summary: 写文件 + --------- + @param filename: 文件名(有路径) + @param content: 内容 + @param mode: 模式 w/w+ (覆盖/追加) + --------- + @result: + """ + + directory = os.path.dirname(filename) + mkdir(directory) + with open(filename, mode, encoding=encoding) as file: + file.writelines(content) + + +def read_file(filename, readlines=False, encoding="utf-8"): + """ + @summary: 读文件 + --------- + @param filename: 文件名(有路径) + @param readlines: 按行读取 (默认False) + --------- + @result: 按行读取返回List,否则返回字符串 + """ + + content = None + try: + with open(filename, "r", encoding=encoding) as file: + content = file.readlines() if readlines else file.read() + except Exception as e: + log.error(e) + + return content + + +def get_oss_file_list(oss_handler, prefix, date_range_min, date_range_max=None): + """ + 获取文件列表 + @param prefix: 路径前缀 如 data/car_service_line/yiche/yiche_serial_zongshu_info + @param date_range_min: 时间范围 最小值 日期分隔符为/ 如 2019/03/01 或 2019/03/01/00/00/00 + @param date_range_max: 时间范围 最大值 日期分隔符为/ 如 2019/03/01 或 2019/03/01/00/00/00 + @return: 每个文件路径 如 html/e_commerce_service_line/alibaba/alibaba_shop_info/2019/03/22/15/53/15/8ca8b9e4-4c77-11e9-9dee-acde48001122.json.snappy + """ + + # 计算时间范围 + date_range_max = date_range_max or date_range_min + date_format = "/".join( + ["%Y", "%m", "%d", "%H", "%M", 
"%S"][: date_range_min.count("/") + 1] + ) + time_interval = [ + {"days": 365}, + {"days": 31}, + {"days": 1}, + {"hours": 1}, + {"minutes": 1}, + {"seconds": 1}, + ][date_range_min.count("/")] + date_range = get_between_date( + date_range_min, date_range_max, date_format=date_format, **time_interval + ) + + for date in date_range: + file_folder_path = os.path.join(prefix, date) + objs = oss_handler.list(prefix=file_folder_path) + for obj in objs: + filename = obj.key + yield filename + + +def is_html(url): + if not url: + return False + + try: + content_type = request.urlopen(url).info().get("Content-Type", "") + + if "text/html" in content_type: + return True + else: + return False + except Exception as e: + log.error(e) + return False + + +def is_exist(file_path): + """ + @summary: 文件是否存在 + --------- + @param file_path: + --------- + @result: + """ + + return os.path.exists(file_path) + + +def download_file(url, base_path, filename="", call_func="", proxies=None, data=None): + file_path = base_path + filename + directory = os.path.dirname(file_path) + mkdir(directory) + + # 进度条 + def progress_callfunc(blocknum, blocksize, totalsize): + """回调函数 + @blocknum : 已经下载的数据块 + @blocksize : 数据块的大小 + @totalsize: 远程文件的大小 + """ + percent = 100.0 * blocknum * blocksize / totalsize + if percent > 100: + percent = 100 + # print ('进度条 %.2f%%' % percent, end = '\r') + sys.stdout.write("进度条 %.2f%%" % percent + "\r") + sys.stdout.flush() + + if url: + try: + log.debug( + """ + 正在下载 %s + 存储路径 %s + """ + % (url, file_path) + ) + if proxies: + # create the object, assign it to a variable + proxy = request.ProxyHandler(proxies) + # construct a new opener using your proxy settings + opener = request.build_opener(proxy) + # install the openen on the module-level + request.install_opener(opener) + + request.urlretrieve(url, file_path, progress_callfunc, data) + + log.debug( + """ + 下载完毕 %s + 文件路径 %s + """ + % (url, file_path) + ) + + call_func and call_func() + return 1 + except Exception 
as e: + log.error(e) + return 0 + else: + return 0 + + +def get_file_list(path, ignore=[]): + templist = path.split("*") + path = templist[0] + file_type = templist[1] if len(templist) >= 2 else "" + + # 递归遍历文件 + def get_file_list_(path, file_type, ignore, all_file=[]): + file_list = os.listdir(path) + + for file_name in file_list: + if file_name in ignore: + continue + + file_path = os.path.join(path, file_name) + if os.path.isdir(file_path): + get_file_list_(file_path, file_type, ignore, all_file) + else: + if not file_type or file_name.endswith(file_type): + all_file.append(file_path) + + return all_file + + return get_file_list_(path, file_type, ignore) if os.path.isdir(path) else [path] + + +def rename_file(old_name, new_name): + os.rename(old_name, new_name) + + +def del_file(path, ignore=()): + files = get_file_list(path, ignore) + for file in files: + try: + os.remove(file) + except Exception as e: + log.error( + """ + 删除出错: %s + Exception : %s + """ + % (file, str(e)) + ) + finally: + pass + + +def get_file_type(file_name): + """ + @summary: 取文件后缀名 + --------- + @param file_name: + --------- + @result: + """ + try: + return os.path.splitext(file_name)[1] + except Exception as e: + log.exception(e) + + +def get_file_path(file_path): + """ + @summary: 取文件路径 + --------- + @param file_path: /root/a.py + --------- + @result: /root + """ + try: + return os.path.split(file_path)[0] + except Exception as e: + log.exception(e) + + +############################################# + +# +# def exec_js(js_code): +# """ +# @summary: 执行js代码 +# --------- +# @param js_code: js代码 +# --------- +# @result: 返回执行结果 +# """ +# +# return execjs.eval(js_code) + + +# def compile_js(js_func): +# """ +# @summary: 编译js函数 +# --------- +# @param js_func:js函数 +# --------- +# @result: 返回函数对象 调用 fun('js_funName', param1,param2) +# """ +# +# ctx = execjs.compile(js_func) +# return ctx.call + + +############################################### + +############################################# + + 
+def date_to_timestamp(date, time_format="%Y-%m-%d %H:%M:%S"): + """ + @summary: + --------- + @param date:将"2011-09-28 10:00:00"时间格式转化为时间戳 + @param format:时间格式 + --------- + @result: 返回时间戳 + """ + + timestamp = time.mktime(time.strptime(date, time_format)) + return int(timestamp) + + +def timestamp_to_date(timestamp, time_format="%Y-%m-%d %H:%M:%S"): + """ + @summary: + --------- + @param timestamp: 将时间戳转化为日期 + @param format: 日期格式 + --------- + @result: 返回日期 + """ + if timestamp is None: + raise ValueError("timestamp is null") + + date = time.localtime(timestamp) + return time.strftime(time_format, date) + + +def get_current_timestamp(): + return int(time.time()) + + +def get_current_date(date_format="%Y-%m-%d %H:%M:%S"): + return datetime.datetime.now().strftime(date_format) + # return time.strftime(date_format, time.localtime(time.time())) + + +def get_date_number(year=None, month=None, day=None): + """ + @summary: 获取指定日期对应的日期数 + 默认当前周 + --------- + @param year: 2010 + @param month: 6 + @param day: 16 + --------- + @result: (年号,第几周,第几天) 如 (2010, 24, 3) + """ + if year and month and day: + return datetime.date(year, month, day).isocalendar() + elif not any([year, month, day]): + return datetime.datetime.now().isocalendar() + else: + assert year, "year 不能为空" + assert month, "month 不能为空" + assert day, "day 不能为空" + + +def get_between_date( + begin_date, end_date=None, date_format="%Y-%m-%d", **time_interval +): + """ + @summary: 获取一段时间间隔内的日期,默认为每一天 + --------- + @param begin_date: 开始日期 str 如 2018-10-01 + @param end_date: 默认为今日 + @param date_format: 日期格式,应与begin_date的日期格式相对应 + @param time_interval: 时间间隔 默认一天 支持 days、seconds、microseconds、milliseconds、minutes、hours、weeks + --------- + @result: list 值为字符串 + """ + + date_list = [] + + begin_date = datetime.datetime.strptime(begin_date, date_format) + end_date = ( + datetime.datetime.strptime(end_date, date_format) + if end_date + else datetime.datetime.strptime( + time.strftime(date_format, time.localtime(time.time())), 
date_format + ) + ) + time_interval = time_interval or dict(days=1) + + while begin_date <= end_date: + date_str = begin_date.strftime(date_format) + date_list.append(date_str) + + begin_date += datetime.timedelta(**time_interval) + + if end_date.strftime(date_format) not in date_list: + date_list.append(end_date.strftime(date_format)) + + return date_list + + +def get_between_months(begin_date, end_date=None): + """ + @summary: 获取一段时间间隔内的月份 + 需要满一整月 + --------- + @param begin_date: 开始时间 如 2018-01-01 + @param end_date: 默认当前时间 + --------- + @result: 列表 如 ['2018-01', '2018-02'] + """ + + def add_months(dt, months): + month = dt.month - 1 + months + year = dt.year + month // 12 + month = month % 12 + 1 + day = min(dt.day, calendar.monthrange(year, month)[1]) + return dt.replace(year=year, month=month, day=day) + + date_list = [] + begin_date = datetime.datetime.strptime(begin_date, "%Y-%m-%d") + end_date = ( + datetime.datetime.strptime(end_date, "%Y-%m-%d") + if end_date + else datetime.datetime.strptime( + time.strftime("%Y-%m-%d", time.localtime(time.time())), "%Y-%m-%d" + ) + ) + while begin_date <= end_date: + date_str = begin_date.strftime("%Y-%m") + date_list.append(date_str) + begin_date = add_months(begin_date, 1) + return date_list + + +def get_today_of_day(day_offset=0): + return str(datetime.date.today() + datetime.timedelta(days=day_offset)) + + +def get_days_of_month(year, month): + """ + 返回天数 + """ + + return calendar.monthrange(year, month)[1] + + +def get_firstday_of_month(date): + """'' + date format = "YYYY-MM-DD" + """ + + year, month, day = date.split("-") + year, month, day = int(year), int(month), int(day) + + days = "01" + if int(month) < 10: + month = "0" + str(int(month)) + arr = (year, month, days) + return "-".join("%s" % i for i in arr) + + +def get_lastday_of_month(date): + """'' + get the last day of month + date format = "YYYY-MM-DD" + """ + year, month, day = date.split("-") + year, month, day = int(year), int(month), int(day) + + days 
= calendar.monthrange(year, month)[1] + month = add_zero(month) + arr = (year, month, days) + return "-".join("%s" % i for i in arr) + + +def get_firstday_month(month_offset=0): + """'' + get the first day of month from today + month_offset is how many months + """ + (y, m, d) = get_year_month_and_days(month_offset) + d = "01" + arr = (y, m, d) + return "-".join("%s" % i for i in arr) + + +def get_lastday_month(month_offset=0): + """'' + get the last day of month from today + month_offset is how many months + """ + return "-".join("%s" % i for i in get_year_month_and_days(month_offset)) + + +def get_last_month(month_offset=0): + """'' + get the last day of month from today + month_offset is how many months + """ + return "-".join("%s" % i for i in get_year_month_and_days(month_offset)[:2]) + + +def get_year_month_and_days(month_offset=0): + """ + @summary: + --------- + @param month_offset: 月份偏移量 + --------- + @result: ('2019', '04', '30') + """ + + today = datetime.datetime.now() + year, month = today.year, today.month + + this_year = int(year) + this_month = int(month) + total_month = this_month + month_offset + if month_offset >= 0: + if total_month <= 12: + days = str(get_days_of_month(this_year, total_month)) + total_month = add_zero(total_month) + return (year, total_month, days) + else: + i = total_month // 12 + j = total_month % 12 + if j == 0: + i -= 1 + j = 12 + this_year += i + days = str(get_days_of_month(this_year, j)) + j = add_zero(j) + return (str(this_year), str(j), days) + else: + if (total_month > 0) and (total_month < 12): + days = str(get_days_of_month(this_year, total_month)) + total_month = add_zero(total_month) + return (year, total_month, days) + else: + i = total_month // 12 + j = total_month % 12 + if j == 0: + i -= 1 + j = 12 + this_year += i + days = str(get_days_of_month(this_year, j)) + j = add_zero(j) + return (str(this_year), str(j), days) + + +def add_zero(n): + return "%02d" % n + + +def get_month(month_offset=0): + """'' + 
@run_safe_model("format_date")
def format_date(date, old_format="", new_format="%Y-%m-%d %H:%M:%S"):
    """
    Normalize a date string into *new_format*.

    When *old_format* is empty it is inferred: every run of digits in
    *date* is replaced, in order, with %Y %m %d %H %M %S (a two-digit
    leading year gets %y instead).

    @param date: date string, e.g. "2017年4月17日 3时27分12秒"
    @param old_format: source format such as "%Y年%m月%d日 %H时%M分%S秒";
        inferred from *date* when empty
        %y two-digit year (00-99)      %Y four-digit year
        %m month (01-12)               %d day of month (0-31)
        %H hour, 24h (0-23)            %I hour, 12h (01-12)
        %M minute (00-59)              %S second (00-59)
    @param new_format: output format
    @result: formatted date string; "" when *date* is falsy, and the input
        returned unchanged when parsing fails
    """
    if not date:
        return ""

    if not old_format:
        regex = "(\d+)"
        # get_info is a project helper defined earlier in this module;
        # it extracts all digit runs from *date*.
        numbers = get_info(date, regex, allow_repeat=True)
        formats = ["%Y", "%m", "%d", "%H", "%M", "%S"]
        old_format = date
        for i, number in enumerate(numbers[:6]):
            if i == 0 and len(number) == 2:  # year may be two digits -> use %y
                old_format = old_format.replace(
                    number, formats[i].lower(), 1
                )  # replace once: in '2017年11月30日 11:49' this stops the month "11" from also matching the hour "11"
            else:
                old_format = old_format.replace(number, formats[i], 1)  # replace once

    try:
        date_obj = datetime.datetime.strptime(date, old_format)
        if "T" in date and "Z" in date:
            # ISO-8601 "Z"-suffixed input: shift by +8 hours
            # (presumably UTC -> China Standard Time — confirm with callers).
            date_obj += datetime.timedelta(hours=8)
            date_str = date_obj.strftime("%Y-%m-%d %H:%M:%S")
        else:
            date_str = datetime.datetime.strftime(date_obj, new_format)

    except Exception as e:
        # Parsing failed: log (project-level logger) and fall back to the input.
        log.error("日期格式化出错,old_format = %s 不符合 %s 格式" % (old_format, date))
        date_str = date

    return date_str
years_ago.strftime("%Y-%m-%d %H:%M:%S") + + elif "月前" in release_time: + months = re.compile("(\d+)月前").findall(release_time) + months_ago = datetime.datetime.now() - datetime.timedelta( + days=int(months[0]) * 30 + ) + release_time = months_ago.strftime("%Y-%m-%d %H:%M:%S") + + elif "周前" in release_time: + weeks = re.compile("(\d+)周前").findall(release_time) + weeks_ago = datetime.datetime.now() - datetime.timedelta(days=int(weeks[0]) * 7) + release_time = weeks_ago.strftime("%Y-%m-%d %H:%M:%S") + + elif "天前" in release_time: + ndays = re.compile("(\d+)天前").findall(release_time) + days_ago = datetime.datetime.now() - datetime.timedelta(days=int(ndays[0])) + release_time = days_ago.strftime("%Y-%m-%d %H:%M:%S") + + elif "小时前" in release_time: + nhours = re.compile("(\d+)小时前").findall(release_time) + hours_ago = datetime.datetime.now() - datetime.timedelta(hours=int(nhours[0])) + release_time = hours_ago.strftime("%Y-%m-%d %H:%M:%S") + + elif "分钟前" in release_time: + nminutes = re.compile("(\d+)分钟前").findall(release_time) + minutes_ago = datetime.datetime.now() - datetime.timedelta( + minutes=int(nminutes[0]) + ) + release_time = minutes_ago.strftime("%Y-%m-%d %H:%M:%S") + + elif "昨天" in release_time or "昨日" in release_time: + today = datetime.date.today() + yesterday = today - datetime.timedelta(days=1) + release_time = release_time.replace("昨天", str(yesterday)) + + elif "今天" in release_time: + release_time = release_time.replace("今天", get_current_date("%Y-%m-%d")) + + elif "刚刚" in release_time: + release_time = get_current_date() + + elif re.search("^\d\d:\d\d", release_time): + release_time = get_current_date("%Y-%m-%d") + " " + release_time + + elif not re.compile("\d{4}").findall(release_time): + month = re.compile("\d{1,2}").findall(release_time) + if month and int(month[0]) <= int(get_current_date("%m")): + release_time = get_current_date("%Y") + "-" + release_time + else: + release_time = str(int(get_current_date("%Y")) - 1) + "-" + release_time + + 
def to_date(date_str, date_format="%Y-%m-%d %H:%M:%S"):
    """Parse *date_str* with *date_format* into a datetime object."""
    return datetime.datetime.strptime(date_str, date_format)


def get_before_date(
    current_date,
    days,
    current_date_format="%Y-%m-%d %H:%M:%S",
    return_date_format="%Y-%m-%d %H:%M:%S",
):
    """
    @summary: shift a date string by a number of days
    ---------
    @param current_date: date string
    @param days: day offset; -1 means the previous day, 1 the next day
    @param current_date_format: format of *current_date*
    @param return_date_format: format of the returned string
    ---------
    @result: shifted date string
    """
    shifted = to_date(current_date, current_date_format) + datetime.timedelta(days=days)
    return datetime.datetime.strftime(shifted, return_date_format)


def delay_time(sleep_time=160):
    """Block the calling thread for *sleep_time* seconds (default 160)."""
    time.sleep(sleep_time)


def format_seconds(seconds):
    """
    @summary: render a second count as a day/hour/minute/second string
    ---------
    @param seconds: duration in seconds; floats are rounded half-up
    ---------
    @result: e.g. "2天3小时2分49秒"; zero-valued units are omitted,
        so 0 seconds yields ""
    """
    total = int(seconds + 0.5)  # round half-up to whole seconds

    minutes, secs = divmod(total, 60)
    hours, minutes = divmod(minutes, 60)
    day_count, hours = divmod(hours, 24)

    pieces = []
    if day_count:
        pieces.append("{}天".format(day_count))
    if hours:
        pieces.append("{}小时".format(hours))
    if minutes:
        pieces.append("{}分".format(minutes))
    if secs:
        pieces.append("{}秒".format(secs))

    return "".join(pieces)


################################################
def get_md5(*args):
    """
    @summary: joint MD5 over all *args* (each stringified), for dedup keys
    ---------
    @param *args: values combined into one digest
    ---------
    @result: 32-char hex string, e.g. 7c8684bcbdfcea6697650aa53d7b1405
    """
    digest = hashlib.md5()
    for item in args:
        digest.update(str(item).encode())
    return digest.hexdigest()


def get_sha1(*args):
    """
    @summary: joint SHA-1 over all *args* (each stringified), for unique ids
    ---------
    @param *args: values combined into one digest
    ---------
    @result: 40-char hex string
    """
    digest = hashlib.sha1()
    for item in args:
        digest.update(str(item).encode())
    return digest.hexdigest()
def get_uuid(key1="", key2=""):
    """
    @summary: build a UUID string from two keys
        With both keys empty a time-based uuid1 is generated; otherwise the
        MD5 of key1+key2 is folded into a deterministic version-3 UUID, so
        the same pair always yields the same id (usable as a joint index,
        e.g. domain + headline).
    ---------
    @param key1: str
    @param key2: str
    ---------
    @result: UUID string
    """
    if key1 or key2:
        digest = md5(bytes(key1, "utf-8") + bytes(key2, "utf-8")).digest()
        return str(uuid.UUID(bytes=digest[:16], version=3))
    return str(uuid.uuid1())


def get_hash(text):
    """Return Python's built-in hash of *text*."""
    return hash(text)


##################################################


def cut_string(text, length):
    """
    @summary: split *text* into fixed-size chunks
    ---------
    @param text: source string
    @param length: chunk size; the final chunk may be shorter
    ---------
    @result: list of chunks
    """
    chunks = re.findall(".{%d}" % length, text, re.S)
    tail = text[len(chunks) * length:]
    if tail:
        chunks.append(tail)
    return chunks


def get_random_string(length=1):
    """Return *length* distinct random characters drawn from letters+digits."""
    return "".join(random.sample(string.ascii_letters + string.digits, length))
def get_random_email(length=None, email_types: list = None, special_characters=""):
    """
    Generate a random email address.
    :param length: local-part length; a random 4-12 when omitted
    :param email_types: candidate domains; defaults to common providers
    :param special_characters: extra characters allowed in the local part
    :return: email string
    """
    if not length:
        length = random.randint(4, 12)
    if not email_types:
        email_types = [
            "qq.com",
            "163.com",
            "gmail.com",
            "yahoo.com",
            "hotmail.com",
            "yeah.net",
            "126.com",
            "139.com",
            "sohu.com",
        ]

    # get_random_password (defined earlier in this module) supplies the
    # mixed-case local part.
    local_part = get_random_password(length, special_characters)
    return local_part + "@" + random.choice(email_types)


#################################


def dumps_obj(obj):
    """Pickle *obj* to bytes."""
    return pickle.dumps(obj)


def loads_obj(obj_str):
    """Restore an object serialized by dumps_obj.

    NOTE(review): pickle.loads on untrusted bytes can execute arbitrary
    code — only feed this data produced by dumps_obj.
    """
    return pickle.loads(obj_str)


def get_method(obj, name):
    """Look up attribute *name* on *obj*; log (project logger) and return None when missing."""
    attr_name = str(name)
    try:
        return getattr(obj, attr_name)
    except AttributeError:
        log.error("Method %r not found in: %s" % (attr_name, obj))
        return None


def witch_workspace(project_path):
    """
    @summary: switch the process working directory
    ---------
    @param project_path: target directory
    ---------
    @result:
    """
    os.chdir(project_path)  # change the working directory


############### database helpers #######################
def format_sql_value(value):
    """Coerce a Python value for embedding in SQL: strings are stripped,
    lists/dicts serialized via dumps_json (project helper), date/time
    objects stringified, and bools converted to 0/1."""
    if isinstance(value, str):
        return value.strip()
    if isinstance(value, (list, dict)):
        return dumps_json(value, indent=None)
    if isinstance(value, (datetime.date, datetime.time)):
        return str(value)
    if isinstance(value, bool):
        return int(value)
    return value


def list2str(datas):
    """
    Render a list as a SQL tuple literal.
    :param datas: [1, 2]
    :return: "(1, 2)"; the single-element trailing comma is stripped: "(1)"
    """
    rendered = str(tuple(datas))
    return re.sub(",\)$", ")", rendered)
def make_update_sql(table, data, condition):
    """
    @summary: build an UPDATE statement for mysql (oracle dates would still
        need to_date handling - TODO)
    ---------
    @param table: table name
    @param data: column -> value mapping (json-like dict)
    @param condition: raw WHERE clause text
    ---------
    @result: "update <table> set ... where <condition>"

    NOTE(review): values and the condition are interpolated directly into
    the SQL text without escaping — do not feed untrusted input.
    """
    assignments = []
    for column, raw_value in data.items():
        cleaned = format_sql_value(raw_value)
        if isinstance(cleaned, str):
            fragment = "`{}`='{}'".format(column, cleaned)
        elif cleaned is None:
            fragment = "`{}`={}".format(column, "null")
        else:
            fragment = "`{}`={}".format(column, cleaned)
        assignments.append(fragment)

    return "update {table} set {key_values} where {condition}".format(
        table=table, key_values=", ".join(assignments), condition=condition
    )
key冲突时更新指定的列 + @param update_columns_value: 需要更新的列的值 默认为datas里边对应的值, 注意 如果值为字符串类型 需要主动加单引号, 如 update_columns_value=("'test'",) + --------- + @result: + """ + if not datas: + return + + keys = list(datas[0].keys()) + values_placeholder = ["%s"] * len(keys) + + values = [] + for data in datas: + value = [] + for key in keys: + current_data = data.get(key) + current_data = format_sql_value(current_data) + + value.append(current_data) + + values.append(value) + + keys = ["`{}`".format(key) for key in keys] + keys = list2str(keys).replace("'", "") + + values_placeholder = list2str(values_placeholder).replace("'", "") + + if update_columns: + if not isinstance(update_columns, (tuple, list)): + update_columns = [update_columns] + if update_columns_value: + update_columns_ = ", ".join( + [ + "`{key}`={value}".format(key=key, value=value) + for key, value in zip(update_columns, update_columns_value) + ] + ) + else: + update_columns_ = ", ".join( + ["`{key}`=values(`{key}`)".format(key=key) for key in update_columns] + ) + sql = "insert into {table} {keys} values {values_placeholder} on duplicate key update {update_columns}".format( + table=table, + keys=keys, + values_placeholder=values_placeholder, + update_columns=update_columns_, + ) + elif auto_update: + sql = "replace into {table} {keys} values {values_placeholder}".format( + table=table, keys=keys, values_placeholder=values_placeholder + ) + else: + sql = "insert ignore into {table} {keys} values {values_placeholder}".format( + table=table, keys=keys, values_placeholder=values_placeholder + ) + + return sql, values + + +############### json相关 ####################### + + +def key2underline(key): + regex = "[A-Z]*" + capitals = re.findall(regex, key) + + if capitals: + for pos, capital in enumerate(capitals): + if not capital: + continue + if pos == 0: + if len(capital) > 1: + key = key.replace(capital, capital.lower() + "_", 1) + else: + key = key.replace(capital, capital.lower(), 1) + else: + if len(capital) > 1: + 
def key2hump(key):
    """Convert an underscored key to CamelCase, e.g. "user_name" -> "UserName"."""
    return key.title().replace("_", "")


def format_json_key(json_data):
    """Return a copy of *json_data* whose keys are converted to snake_case
    via key2underline (defined earlier in this module)."""
    return {key2underline(key): value for key, value in json_data.items()}


def quick_to_json(text):
    """
    @summary: quickly convert browser-copied header text into a dict
        (one "Name: value" pair per line)
    ---------
    @param text: raw header block
    ---------
    @result: dict of header name -> parsed value

    NOTE(review): values are passed through eval() when possible — never
    run this on untrusted text.
    """
    headers = {}
    for line in text.split("\n"):
        if line == "\n":
            continue

        line = line.strip()
        regex = ["(:?.*?):(.*)", "(.*?):? +(.*)", "([^:]*)"]

        # get_info is a project helper defined earlier in this module.
        result = get_info(line, regex)
        result = result[0] if isinstance(result[0], tuple) else result
        try:
            headers[result[0]] = eval(result[1].strip())
        except:
            headers[result[0]] = result[1].strip()

    return headers


##############################


def print_pretty(object):
    """Pretty-print any object via pprint."""
    pprint(object)


def print_params2json(url):
    """Parse a URL's query string and print it as JSON (via dumps_json)."""
    result = {}
    for pair in url.split("?")[-1].split("&"):
        name_value = pair.split("=", 1)
        result[name_value[0]] = name_value[1]

    print(dumps_json(result))


def print_cookie2json(cookie_str_or_list):
    """Print cookies as JSON; accepts either a "k=v; k2=v2" string or a
    selenium cookie list (converted via get_cookies_from_selenium_cookie)."""
    if isinstance(cookie_str_or_list, str):
        cookie_json = {}
        for chunk in cookie_str_or_list.split("; "):
            name, value = chunk.split("=")
            cookie_json[name] = value
    else:
        cookie_json = get_cookies_from_selenium_cookie(cookie_str_or_list)

    print(dumps_json(cookie_json))
+ Examples: + >>> [1, 2, [3,4], (5,6)] + [1, 2, [3, 4], (5, 6)] + >>> flatten([[[1,2,3], (42,None)], [4,5], [6], 7, (8,9,10)]) + [1, 2, 3, 42, None, 4, 5, 6, 7, 8, 9, 10] + >>> flatten(["foo", "bar"]) + ['foo', 'bar'] + >>> flatten(["foo", ["baz", 42], "bar"]) + ['foo', 'baz', 42, 'bar'] + """ + return list(iflatten(x)) + + +def iflatten(x): + """iflatten(sequence) -> iterator + Similar to ``.flatten()``, but returns iterator instead""" + for el in x: + if _is_listlike(el): + for el_ in flatten(el): + yield el_ + else: + yield el + + +def _is_listlike(x): + """ + >>> _is_listlike("foo") + False + >>> _is_listlike(5) + False + >>> _is_listlike(b"foo") + False + >>> _is_listlike([b"foo"]) + True + >>> _is_listlike((b"foo",)) + True + >>> _is_listlike({}) + True + >>> _is_listlike(set()) + True + >>> _is_listlike((x for x in range(3))) + True + >>> _is_listlike(six.moves.xrange(5)) + True + """ + return hasattr(x, "__iter__") and not isinstance(x, (six.text_type, bytes)) + + +################### + + +def re_def_supper_class(obj, supper_class): + """ + 重新定义父类 + @param obj: 类 如 class A: 则obj为A 或者 A的实例 a.__class__ + @param supper_class: 父类 + @return: + """ + obj.__bases__ = (supper_class,) + + diff --git a/test/uni_test.py b/test/uni_test.py index 6e57b02..b960c26 100644 --- a/test/uni_test.py +++ b/test/uni_test.py @@ -13,7 +13,7 @@ from smart.item import Item from smart.request import Request from smart.response import Response -from smart.tool import is_valid_url +from smart.tool import is_valid_url, mutations_bkdr_hash class TestItem(Item): @@ -25,6 +25,9 @@ def clean_age(self, value): return value + + + class TestClassOne(object): def test_1(self): request = Request("http://www.ocpe.com.cn/nengyuanjingji/zf/2020-08-29/3785.html") @@ -43,3 +46,12 @@ def test_2(self): def test3(self): print(is_valid_url("http://www.baidu.com")) + + def test4(self, *args): + for i in range(100): + print(hash("http://exercise.kingname.info/exercise_middleware_ip/1:0")) + + def 
test5(self): + print("\r\n") + x = "22李思32sss;'*&^%%$##@#@!是S2.。 s s" * 100000 + print(mutations_bkdr_hash(x))