forked from panacena/mmPictures
-
Notifications
You must be signed in to change notification settings - Fork 0
/
mm.py
144 lines (115 loc) · 4.49 KB
/
mm.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
#!/usr/bin/env python
# -*-coding:utf-8-*-
import urllib2
from lxml import etree
from os import system
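"""
Step 1: starting from http://www.zngirls.com/rank/sum/, collect the link behind each model's avatar (the ranking is paginated).
Step 2: on a model page such as http://www.zngirls.com/girl/21751/, collect the link to each photo album (the list is paginated).
Step 3: on an album page such as http://www.zngirls.com/g/19671/1.html, collect the pictures themselves (the album is paginated).
"""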
# Accumulates every image URL that savePictures encounters.
pciturelist = []

# Browser-like request headers shared by the page-fetching functions.
header = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36",
    "Connection": "keep-alive",
}
"""
Starting from the index page http://www.zngirls.com/rank/sum/, get the ranking's pagination links and the URL of each ranking page.
"""
def mmRankSum():
    """Crawl the ranking index and hand every ranking page to mmRankitem.

    Fetches http://www.zngirls.com/rank/sum/ with the shared ``header``
    dict, reads the hrefs of the pagination links under
    ``div.pagesYY`` and calls ``mmRankitem`` with the index URL plus each
    href. The last two pagination links are not visited.
    """
    index_url = "http://www.zngirls.com/rank/sum/"
    response = urllib2.urlopen(urllib2.Request(index_url, headers=header))
    try:
        htmldata = response.read()
    finally:
        # Release the connection even if read() fails.
        response.close()
    tree = etree.HTML(htmldata)
    # Collect the pagination hrefs first, then process the pages one by one.
    page_hrefs = tree.xpath('//div[@class="pagesYY"]/div/a/@href')
    # The final two links are skipped -- presumably "next"/"last" style
    # navigation entries; TODO confirm against the live markup.
    for href in page_hrefs[:-2]:
        mmRankitem(index_url + href)
"""
Parameter url: the URL of one page of the paginated ranking.
Using that URL, parse the HTML with lxml and XPath and get the URL of each model's album page.
"""
def mmRankitem(url):
    """Process one ranking page and visit the album list of every model on it.

    Args:
        url: Address of a single page of the paginated ranking.

    For each href found under ``div.rankli_imgdiv`` the album-list URL is
    built, printed and passed to ``getAlbums``.
    """
    response = urllib2.urlopen(urllib2.Request(url, headers=header))
    try:
        htmldata = response.read()
    finally:
        # Release the connection even if read() fails.
        response.close()
    tree = etree.HTML(htmldata)
    for href in tree.xpath('//div[@class="rankli_imgdiv"]/a/@href'):
        # Build the URL once so the logged address is exactly the one that
        # is fetched (the original printed a differently-joined string).
        album_list_url = "http://www.zngirls.com/" + href + "/album/"
        print(album_list_url)
        getAlbums(album_list_url)
"""
Parameter girlUrl: the address of one model's album page.
Using that URL, get the address of each of the model's photo sets.
"""
def getAlbums(girlUrl):
    """Visit every photo set linked from one model's album page.

    Args:
        girlUrl: Address of the model's album-list page.

    Each href found under ``div.igalleryli_div`` is appended to the site
    root and passed to ``getPagePicturess``.
    """
    response = urllib2.urlopen(urllib2.Request(girlUrl, headers=header))
    try:
        htmldata = response.read()
    finally:
        # Release the connection even if read() fails.
        response.close()
    tree = etree.HTML(htmldata)
    for href in tree.xpath('//div[@class="igalleryli_div"]/a/@href'):
        getPagePicturess("http://www.zngirls.com/" + href)
"""
Parameter albumsurl: the address of one photo set.
Using that URL, first get the photo set's pagination links, then parse each page for the real picture addresses.
"""
def getPagePicturess(albumsurl):
    """Walk the pages of one photo set and save the pictures on each page.

    Args:
        albumsurl: Address of the photo set's first page.

    Reads the pagination hrefs under ``div#pages`` and calls
    ``savePictures`` for each one except the last two.
    """
    response = urllib2.urlopen(urllib2.Request(albumsurl, headers=header))
    try:
        htmldata = response.read()
    finally:
        # Release the connection even if read() fails.
        response.close()
    tree = etree.HTML(htmldata)
    page_hrefs = tree.xpath('//div[@id="pages"]/a/@href')
    # The final two links are skipped -- presumably navigation entries
    # rather than numbered pages; TODO confirm against the live markup.
    for href in page_hrefs[:-2]:
        savePictures("http://www.zngirls.com" + href)
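"""
Parameter itemPagesurl: the address of one page of a photo set (pagination already resolved).
Using that URL, parse the page directly, get the picture addresses, then download the pictures and save them locally.
"""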
def savePictures(itemPagesurl):
    """Download every gallery image found on one photo-set page.

    Args:
        itemPagesurl: Address of a single page of a photo set.

    Each ``<img>`` under ``div.gallery_wrapper/ul`` is downloaded and
    written to ``<alt>.jpg`` in the current working directory; its URL is
    also printed and appended to the module-level ``pciturelist``.
    Failures are reported on stdout and skipped, so one bad page or image
    never aborts the crawl.
    """
    user_agent = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36"
    page_headers = {
        "User-Agent": user_agent,
        "Connection": "keep-alive",
        # Referer has to be a URL; the original sent a mangled copy of the
        # Accept value here.
        "Referer": itemPagesurl,
        "Accept": "image/webp,image/*,*/*;q=0.8",
    }
    try:
        response = urllib2.urlopen(
            urllib2.Request(itemPagesurl, headers=page_headers))
        try:
            htmldata = response.read()
        finally:
            response.close()
        tree = etree.HTML(htmldata)
        print(itemPagesurl)
        # Select the elements (not separate @src/@alt lists) so that each
        # URL stays paired with its own alt text.
        images = tree.xpath('//div[@class="gallery_wrapper"]/ul/img')
    except Exception as exc:
        # Without a parsed page there is nothing to iterate over.
        print("skipping page %r: %r" % (itemPagesurl, exc))
        return

    for img in images:
        src = img.get("src")
        if not src:
            continue
        print(src)
        pciturelist.append(src)

        name = img.get("alt")
        if not name:
            # No alt text: fall back to the last URL segment minus its
            # extension so the image is still saved.
            name = src.rsplit("/", 1)[-1].rsplit(".", 1)[0]

        image_headers = {
            "User-Agent": user_agent,
            "Connection": "keep-alive",
            # Unchanged from the original: the image URL is its own Referer.
            "Referer": src,
        }
        try:
            response = urllib2.urlopen(
                urllib2.Request(src, headers=image_headers))
            try:
                payload = response.read()
            finally:
                response.close()
            with open('%s.jpg' % name, "wb") as binfile:
                binfile.write(payload)
        except Exception as exc:
            # Best effort: report the failure and continue with the next image.
            print("skipping image %r: %r" % (src, exc))
# Script body: crawl the whole ranking, starting from the index page.
# NOTE(review): these statements are not behind an
# `if __name__ == "__main__":` guard, so importing this module also runs
# the crawl and the shutdown command below.
mmRankSum()

# Disabled code, kept as an inert string literal: it would dump the
# collected picture URLs to list.txt.
"""
fl=open('list.txt', 'w')
for i in pciturelist:
fl.write(i)
fl.write("\n")
fl.close()
print '关机ing'
"""

print('finish')
# Ask the operating system to power off once the crawl has finished
# (presumably targets Windows, where `shutdown -s` shuts the machine down).
system('shutdown -s')