-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgetDetails.py
102 lines (81 loc) · 2.9 KB
/
getDetails.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
#!/usr/bin/env python
# coding:utf8
# 根据电影id获取详细信息
import urllib2
import urllib
import json
import time
import random
from bs4 import BeautifulSoup
inputFile = 'douban_movie.txt'
outputFile = 'douban_movie_detail.txt'

# Header row of the '^'-separated output file.
OUTPUT_HEADER = 'id^title^url^cover^rate^director^composer^actor^category^district^language^showtime^length^othername^description\n'

# Indexes into the newline-split text of the page's "#info" element, listed in
# output column order: director, composer, actor, category, district,
# language, showtime, length, othername.
# NOTE(review): extraction is purely positional and index 5 is never read;
# a page with missing or extra info lines will shift the fields. TODO confirm
# against the current page layout.
INFO_LINE_INDEXES = (1, 2, 3, 4, 6, 7, 8, 9, 10)

headers = {}
# headers["Accept"] = "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"
# headers["Accept-Encoding"] = "gzip, deflate, sdch"
# headers["Accept-Language"] = "zh-CN,zh;q=0.8,en;q=0.6,zh-TW;q=0.4,ja;q=0.2"
# headers["Cache-Control"] = "max-age=0"
# headers["Connection"] = "keep-alive"
# headers["Cookie"] = 'asdasd13323efafasfa'
# headers["Host"] = "movie.douban.com"
# headers["Referer"] = "http://movie.douban.com/"
# headers["Upgrade-Insecure-Requests"] = 1
headers["User-Agent"] = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36"


def field_value(text):
    """Return the stripped text that follows the first ':' in *text*.

    Only the first colon is treated as the label separator, so values that
    themselves contain a colon are kept whole. When *text* has no colon at
    all, the whole stripped text is returned.
    """
    _label, sep, value = text.partition(':')
    if not sep:
        return text.strip()
    return value.strip()


def parse_movie_line(line):
    """Split one ';'-separated input line into its first five fields.

    Returns a tuple ``(movie_id, title, url, cover, rate)``, or ``None`` when
    the line has fewer than five fields (for example a blank line).
    """
    parts = line.rstrip('\n').split(';')
    if len(parts) < 5:
        return None
    return tuple(parts[:5])


def fetch_detail(url):
    """Download the page at *url* and return its detail fields.

    The result is a list of text values in output column order: the nine
    values selected by ``INFO_LINE_INDEXES`` followed by the synopsis.

    Raises whatever ``urllib2`` raises on network failure, and ``IndexError``
    when the page lacks the "#info" element, enough info lines, or the
    synopsis span; the caller treats any of these as a failed movie.
    """
    request = urllib2.Request(url=url, headers=headers)
    response = urllib2.urlopen(request)
    try:
        page = response.read()
    finally:
        # Release the connection even if the read fails.
        response.close()

    # Name the parser explicitly so the result does not depend on which
    # optional parser libraries happen to be installed.
    soup = BeautifulSoup(page, 'html.parser')
    info_lines = soup.select('#info')[0].get_text().split('\n')

    # Extract fields, keeping only the text after the label colon.
    detail = [field_value(info_lines[index]) for index in INFO_LINE_INDEXES]

    # Movie synopsis: trim it and fold it onto one line so that a record
    # never spans several output lines.
    summary = soup.find_all("span", attrs={"property": "v:summary"})[0].get_text()
    detail.append(summary.strip().replace('\n', '\t'))
    return detail


def main():
    """Read the movie list, fetch each movie page, and write one record each.

    The first line of ``inputFile`` is skipped as a header. Each movie id is
    processed once. A movie whose page cannot be fetched or parsed is
    reported on stdout and counted as an error; the run then continues.
    """
    seen_ids = set()
    count = 1
    error_count = 0

    # Context managers close both files even if an unexpected error escapes.
    with open(inputFile, 'r') as fr:
        with open(outputFile, 'w') as fw:
            fw.write(OUTPUT_HEADER)
            next(fr, None)  # skip the header line of the input file
            for line in fr:
                if not line.strip():
                    continue
                parsed = parse_movie_line(line)
                if parsed is None:
                    # A short line used to abort the whole run with IndexError.
                    print('%s %s' % (count, 'Error: malformed input line'))
                    error_count += 1
                    continue
                movie_id, title, url, cover, rate = parsed
                if movie_id in seen_ids:
                    continue
                seen_ids.add(movie_id)
                try:
                    detail = fetch_detail(url)
                    # Write the record. The input fields are already byte
                    # strings; the scraped values are text and are encoded.
                    columns = [movie_id, title, url, cover, rate]
                    columns.extend(value.encode('utf8') for value in detail)
                    fw.write('^'.join(columns) + '\n')
                    print('%s %s' % (count, title))
                except Exception as e:
                    # Deliberate best-effort boundary: one bad page must not
                    # stop the crawl, so report it and move on.
                    print(e)
                    print('%s %s Error' % (count, title))
                    error_count += 1
                count += 1

    # count is the next sequence number, i.e. movies attempted plus one.
    print('%s %s' % (count, error_count))


if __name__ == '__main__':
    main()