-
Notifications
You must be signed in to change notification settings - Fork 1.3k
/
Copy pathstockInfo.py
189 lines (147 loc) · 6.19 KB
/
stockInfo.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
# working v1.0
__author__ = 'Rocky'
'''
http://30daydo.com
Contact: [email protected]
'''
import json
import datetime
import time
import codecs
import os, sys
import requests
import re
from scrapy.selector import Selector
from elasticsearch import Elasticsearch
from configure.settings import llogger
# Module-wide logger; llogger comes from the project's configure.settings
# module and is given the log file path 'log/stockinfo.log'.
logger = llogger('log/stockinfo.log')
# Pool of browser User-Agent strings.
# NOTE(review): nothing in the visible part of this file reads this list;
# getinfo() builds its own hard-coded User-Agent header. Presumably intended
# for random rotation -- confirm before removing.
my_useragent = [
'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
# NOTE(review): the next entry has an unbalanced '(' -- looks like a
# truncated copy of the IE9 User-Agent; verify the intended value.
'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0',
'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
'Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11',
'Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)'
]
# Elasticsearch client object, constructed at import time against a
# hard-coded host:port; getinfo() indexes crawled articles through it.
es = Elasticsearch('10.18.6.102:9200')
def create_tb(conn):
    """Create the ``tb_cnstock`` table on *conn* if it does not exist yet.

    Args:
        conn: a DB-API style connection exposing ``cursor()``, ``commit()``
            and ``rollback()``.

    Returns:
        True when the statement was executed and committed; False when any
        exception was raised (the exception is logged and the transaction is
        rolled back).
    """
    # NOTE(review): getinfo() inserts into columns Content and keyword, which
    # this schema does not declare -- confirm the table is extended elsewhere.
    ddl = '''CREATE TABLE IF NOT EXISTS tb_cnstock(Date DATETIME ,Title VARCHAR (800),URL VARCHAR (100),PRIMARY KEY (URL)) charset=utf8;'''
    cursor = conn.cursor()
    try:
        cursor.execute(ddl)
        conn.commit()
    except Exception as err:
        logger.info(err)
        conn.rollback()
        return False
    else:
        return True
def _fetch_ok(url, headers, attempts, pause=0, timeout=30):
    """Return the first HTTP 200 response for *url*, or None if all attempts fail.

    Args:
        url: address to GET.
        headers: request headers.
        attempts: maximum number of tries.
        pause: seconds to sleep after an attempt that raised.
        timeout: per-request timeout in seconds, so a stalled server cannot
            hang the crawl forever.
    """
    for _ in range(attempts):
        try:
            response = requests.get(url=url, headers=headers, timeout=timeout)
        except Exception as e:
            # Deliberately broad: the crawl is best-effort and any transport
            # failure should be logged and retried, not abort the run.
            logger.error(e)
            if pause:
                time.sleep(pause)
            continue
        if response.status_code == 200:
            return response
        logger.error('unexpected status {} for {}'.format(response.status_code, url))
    return None


def _parse_waterfall_items(text):
    """Unwrap the JSONP listing body and return its ``data.item`` value.

    Raises:
        ValueError: the JSONP wrapper is missing or the payload is not JSON.
    """
    # Same decoding steps as before: unicode_escape turns escaped line breaks
    # into real CR/LF, which are then stripped so the payload parses as JSON.
    # NOTE(review): unicode_escape would also unescape \" inside titles and
    # break json.loads -- presumably the API never emits those; confirm.
    text = text.encode('utf8').decode('unicode_escape')
    match = re.search(r'jQuery19107348148582372209_1557710326005\((.*?)\)$', text, re.S)
    if match is None:
        raise ValueError('JSONP wrapper not found in response')
    payload = match.group(1).replace('\r\n', '')
    js_data = json.loads(payload)
    # 'data' may be null in the payload; treat that like a missing key.
    return (js_data.get('data') or {}).get('item')


def _fetch_detail_text(link, headers):
    """Download an article page and return its stripped body text, or None."""
    response = _fetch_ok(link, headers, attempts=2)
    if response is None:
        return None
    nodes = Selector(text=response.text).xpath('//div[@id="qmt_content_div"]')
    if not nodes:
        # Page layout differs or the article is gone; keep the headline only.
        return None
    detail = nodes[0].xpath('string(.)').extract_first()
    return detail.strip() if detail else detail


def _crawl_pages(last_day, conn, f_open):
    """Walk the listing pages and store every article; see getinfo()."""
    url = "http://app.cnstock.com/api/waterfall?callback=jQuery19107348148582372209_1557710326005&colunm=qmt-tjd_ggkx&page={}&num=20&showstock=0"
    headers = {'Referer': 'http://ggjd.cnstock.com/company/scp_ggjd/tjd_ggkx',
               'User-Agent': 'Mozilla/5.0 (Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36', }
    insert_sql = 'insert into tb_cnstock (Date,Title,URL,Content,keyword) values (%s,%s,%s,%s,%s)'
    cur = conn.cursor()
    page = 1
    run_flag = True
    while run_flag:
        response = _fetch_ok(url.format(page), headers, attempts=3, pause=5)
        if response is None:
            logger.error('giving up: listing page {} could not be fetched'.format(page))
            return
        response.encoding = 'utf8'
        try:
            content = _parse_waterfall_items(response.text)
        except (AttributeError, TypeError, ValueError) as e:
            logger.error(e)
            return
        if not content:
            # No items means the listing is exhausted. Stopping here avoids
            # re-requesting the same page forever.
            break
        for item in content:
            title = item.get('title')
            if title and ('晚间重要公告集锦' in title or '停复牌汇总' in title):
                continue
            link = item.get('link')
            pubdate_t = item.get('time')
            if not link or not pubdate_t:
                logger.error('skipping item without link/time: {}'.format(title))
                continue
            link = link.replace('\\', '')
            try:
                pubdate_dtype = datetime.datetime.strptime(pubdate_t, '%Y-%m-%d %H:%M:%S')
            except ValueError as e:
                logger.error(e)
                continue
            if pubdate_dtype < last_day:
                # Reached the cut-off: finish this page, then stop paging.
                run_flag = False
            keyword = item.get('keyword')
            if keyword:
                keyword = ' '.join(keyword)
            detail_content = _fetch_detail_text(link, headers)
            # es
            try:
                # NOTE(review): 'ULR' looks like a typo for 'URL'; kept as-is
                # because documents already indexed would use this field name.
                body = {'Title': title, 'ULR': link, 'keyword': keyword, 'content': detail_content,
                        'Date': pubdate_dtype.strftime('%Y-%m-%dT%H:%M:%S')}
                es.index(index='cnstock', doc_type='doc', body=body)
            except Exception as e:
                logger.error(e)
            # mysql
            try:
                cur.execute(insert_sql, (pubdate_dtype, title, link, detail_content, keyword))
                conn.commit()
            except Exception as e:
                logger.error(e)
                conn.rollback()
            f_open.write('{} ---- {}\n{}\n\n'.format(pubdate_t, title, link))
        page += 1


def getinfo(days=-30):
    """Crawl cnstock company announcements back to a cut-off date and store them.

    Each article is indexed into Elasticsearch (index ``cnstock``), inserted
    into the MySQL table ``tb_cnstock`` and appended as a headline entry to a
    ``StockNews-[date]-[time].log`` file in the current working directory.
    Paging stops after the first page that contains an article older than
    the cut-off, when a page has no items, or when a listing page cannot be
    fetched or parsed.

    Args:
        days: offset in days from now that defines the cut-off; pass a
            negative number to look back (default -30).

    Returns:
        None.
    """
    last_day = datetime.datetime.now() + datetime.timedelta(days=days)
    temp_time = time.strftime("[%Y-%m-%d]-[%H-%M]", time.localtime())
    store_filename = "StockNews-%s.log" % temp_time
    db_name = 'db_stock'
    # The context manager closes the file on the early-exit paths as well.
    with codecs.open(store_filename, 'w', 'utf-8') as f_open:
        # NOTE(review): get_mysql_conn is not defined or imported in the
        # visible part of this file -- confirm where it comes from.
        conn = get_mysql_conn(db_name, local='local')
        try:
            _crawl_pages(last_day, conn, f_open)
        finally:
            # assumes a DB-API connection exposing close() -- TODO confirm
            conn.close()
    return None
if __name__ == "__main__":
    # Script entry point: run the crawl from inside the data directory so the
    # StockNews-*.log file written by getinfo() ends up there.
    # NOTE(review): DATA_PATH is not defined or imported in the visible part
    # of this file -- confirm where it comes from.
    sub_folder = DATA_PATH
    # makedirs(exist_ok=True) replaces the exists()/mkdir() pair: no race
    # between check and create, and nested paths work.
    os.makedirs(sub_folder, exist_ok=True)
    os.chdir(sub_folder)

    # Optional first argument: a negative day offset such as "-7".
    # fullmatch rejects values like "-3abc" that would make int() raise;
    # anything else falls back to the default of -3.
    day = -3
    if len(sys.argv) > 1 and re.fullmatch(r'-\d+', sys.argv[1]):
        day = int(sys.argv[1])
    getinfo(days=day)
# fetch_detail()