-
Notifications
You must be signed in to change notification settings - Fork 1
/
doubanBook.py
54 lines (46 loc) · 1.27 KB
/
doubanBook.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
# coding=utf-8
import string
import re
import urllib2
class DoubanSpider(object):
    """Crawl the Douban Books Top 250 list and collect the book titles.

    Results accumulate in ``self.datas`` as ``"Top<N> <title>"`` strings.
    """

    # Entries per list page; used to turn a page index into the ``start``
    # offset of the request URL.
    PAGE_SIZE = 25
    # Number of pages walked by ``start_spider`` (10 * 25 = 250 entries).
    PAGE_COUNT = 10

    def __init__(self):
        # 1-based index of the next page to fetch.
        self.pageNum = 1
        # URL template; ``{pageNum}`` receives the entry offset computed in
        # ``get_html`` (not the page index itself).
        self.cur_url = "http://book.douban.com/top250?start={pageNum}&filter=&type="
        # Collected "Top<N> <title>" strings, in crawl order.
        self.datas = []
        # Rank number given to the next accepted title.
        self._top_num = 1
        print("豆瓣图书Top250爬虫准备爬取数据。。。")

    def get_html(self, cur_page):
        """Download one list page and return its body as decoded text.

        Args:
            cur_page: 1-based page index; requested with
                ``start=(cur_page - 1) * PAGE_SIZE``.

        Returns:
            The response body decoded as UTF-8, or ``None`` if the request
            failed with ``urllib2.URLError`` (the error is printed).
        """
        url = self.cur_url.format(pageNum=(cur_page - 1) * self.PAGE_SIZE)
        try:
            response = urllib2.urlopen(url)
            try:
                return response.read().decode('utf-8')
            finally:
                # Always release the connection, even if read/decode fails.
                response.close()
        except urllib2.URLError as e:
            if hasattr(e, "code"):
                print("Error code: %s" % e.code)
            elif hasattr(e, 'reason'):
                # One-element tuple so a tuple-valued reason is formatted as
                # a single value instead of being unpacked as arguments.
                print("Reason: %s" % (e.reason,))
            # Previously fell through to ``return html`` with ``html``
            # unbound, raising UnboundLocalError on any failed request.
            return None

    def find_title(self, html):
        """Extract titles from a page and append them to ``self.datas``.

        Each match is everything between ``title=`` and the next ``>``
        following a ``<div class="pl2">``, so the captured text keeps the
        attribute's quote characters. Accepted matches are stored as
        ``"Top<N> <match>"`` and advance ``self._top_num``.

        Args:
            html: Page text, or ``None``/empty for a page that could not be
                fetched (in which case nothing is added).
        """
        if not html:
            return
        temp_data = []
        book_items = re.findall(r'<div class="pl2">.*?title=(.*?)>', html, re.S)
        for item in book_items:
            # NOTE(review): matches containing this character are skipped.
            # The literal may originally have been an HTML entity such as
            # "&nbsp" - confirm against the upstream file.
            if item.find(" ") == -1:
                temp_data.append("Top" + str(self._top_num) + " " + item)
                self._top_num += 1
        self.datas.extend(temp_data)

    def start_spider(self):
        """Fetch pages ``self.pageNum`` through ``PAGE_COUNT`` and parse them.

        A page that cannot be downloaded is skipped instead of aborting the
        crawl; the rank counter is then moved to the first rank of the
        following page so later entries keep their list position.
        """
        while self.pageNum <= self.PAGE_COUNT:
            html = self.get_html(self.pageNum)
            if html is None:
                self._top_num = self.pageNum * self.PAGE_SIZE + 1
            else:
                self.find_title(html)
            self.pageNum += 1
def main():
    """Run the spider and print every collected entry, then ``over``."""
    print("豆瓣图书爬虫")
    crawler = DoubanSpider()
    crawler.start_spider()
    # One entry per line, in the order they were collected.
    for entry in crawler.datas:
        print(entry)
    print("over")
# Run the crawl only when executed as a script, not when imported.
if __name__ == "__main__":
    main()