-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathmain.py
85 lines (73 loc) · 2.83 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
import os
import time
import openpyxl
from util import util
from crawler.scraping import scraping
# Startup banner printed once before the crawler initialises. It is assembled
# from adjacent string literals so every banner row sits on its own source
# line; the resulting text (including the leading and trailing newline) is
# the same as a single triple-quoted literal.
welcome = (
    '\n'
    '//////////////////////////////////////\n'
    '// 知网论文爬取工具 //\n'
    '// 2017-5-1 //\n'
    '// @author firejq //\n'
    '//////////////////////////////////////\n'
    '程序正在初始化……\n'
)
print(welcome)
# Title of the scratch worksheet that records per-author progress between runs.
PROGRESS_SHEET = 'tmp'
# Affiliation passed to the scraper for every author on the list.
AUTHOR_COMPANY = '深圳大学'
# CNKI search page that is reloaded before each author is scraped.
url = 'http://kns.cnki.net/kns/brief/result.aspx?dbprefix=scdb'


def _scrape_author(web_driver, author_name, begin, end):
    """Reload the search page and run the scraper for a single author.

    Args:
        web_driver: Browser driver returned by ``util.getDriver``.
        author_name: Name of the author to search for.
        begin: Start of the publication-date range, as returned by
            ``util.get_time_input``.
        end: End of the publication-date range, as returned by
            ``util.get_time_input``.
    """
    web_driver.get(url=url)
    # One-second pause after navigation, kept from the original flow
    # (presumably lets the page settle before scraping -- confirm).
    time.sleep(1)
    scraping(driver=web_driver, author_name=author_name, author_company=AUTHOR_COMPANY,
             begin_time=begin, end_time=end)


def _start_fresh_run(workbook, names, save_path, web_driver, begin, end):
    """Seed a new progress sheet with every author, save it, then scrape them all.

    Every author is written as ``[name, 'unsolved']`` and the workbook is
    saved *before* scraping starts, so the progress sheet already exists on
    disk if the run is interrupted.

    Args:
        workbook: openpyxl workbook that receives the progress sheet.
        names: Iterable of author names; it is traversed twice.
        save_path: File path the workbook is saved to.
        web_driver: Browser driver used for scraping.
        begin: Start of the publication-date range.
        end: End of the publication-date range.
    """
    sheet = workbook.create_sheet(index=1, title=PROGRESS_SHEET)
    for name in names:
        sheet.append([name, 'unsolved'])
    workbook.save(save_path)
    for name in names:
        _scrape_author(web_driver, name, begin, end)


def _resume_run(sheet, web_driver, begin, end):
    """Continue an interrupted run from the statuses stored in ``sheet``.

    Column A holds the author name and column B the status. Rows marked
    'solved' are reported and skipped, rows marked 'unsolved' are scraped,
    and rows with any other status are ignored.

    Args:
        sheet: The progress worksheet loaded from the result workbook.
        web_driver: Browser driver used for scraping.
        begin: Start of the publication-date range.
        end: End of the publication-date range.
    """
    for name_cell, status_cell in sheet.iter_rows(min_row=1, max_row=sheet.max_row, max_col=2):
        status = status_cell.value
        if status == 'solved':
            print('【' + name_cell.value + '】的信息已经抓取过,进行下一个作者的信息抓取')
        elif status == 'unsolved':
            _scrape_author(web_driver, name_cell.value, begin, end)


driver = util.getDriver('phantomjs')
try:
    print('请输入论文发表时间范围(若无时间范围限制,直接回车即可):')
    begin_time = util.get_time_input('开始时间(1979-01-01 ~ 2017-12-31):')
    end_time = util.get_time_input('截止时间(1979-01-01 ~ 2017-12-31):')
    authors = util.get_name_list()

    path = os.path.join(os.getcwd(), 'result')
    filename = 'result.xlsx'
    out_path = os.path.join(path, filename)

    if not os.path.exists(out_path):
        # The program has never been run here: create the output directory
        # and a brand-new workbook.
        os.makedirs(path, exist_ok=True)
        wb = openpyxl.Workbook()
        _start_fresh_run(wb, authors, out_path, driver, begin_time, end_time)
    else:
        # The program has been run before.
        wb = openpyxl.load_workbook(out_path)
        if PROGRESS_SHEET in wb.sheetnames:
            # The previous run exited unexpectedly half-way: pick up from the
            # statuses recorded in the progress sheet.
            _resume_run(wb[PROGRESS_SHEET], driver, begin_time, end_time)
        else:
            # The previous run finished completely before exiting.
            _start_fresh_run(wb, authors, out_path, driver, begin_time, end_time)

    # Only reached when every author was processed without an exception, so an
    # interrupted run keeps its progress sheet for the next start.
    # NOTE(review): presumably removes the 'tmp' sheet -- confirm in util.
    util.delete_sheet_tmp()
finally:
    # Always shut the browser down, even when input or scraping fails, so no
    # driver process is left running.
    driver.quit()
print('名单上所有作者的论文信息抓取完毕')
# TODO: de-duplicate the results
# TODO: improve performance -- multithreading & multiprocessing & async IO