-
Notifications
You must be signed in to change notification settings - Fork 0
/
scrap.py
142 lines (123 loc) · 4.78 KB
/
scrap.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
from wordpress_xmlrpc import Client, WordPressPost
from wordpress_xmlrpc.compat import xmlrpc_client
from wordpress_xmlrpc.methods import media, posts
from wordpress_xmlrpc.methods.posts import GetPosts, NewPost
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
from selenium import webdriver
import re, urllib, os, sys, time, mimetypes, xmlrpclib, errno, magic
# Module-level setup: these statements run as soon as the file is executed.
# Start a Chrome browser session controlled through Selenium.
driver = webdriver.Chrome()
# Add the URL of the product category page you want to scrape
driver.get("https://www.test.com/example-category")
# Give element lookups an implicit wait of 10 (seconds in Selenium's API)
driver.implicitly_wait(10)
# Add your site's XML-RPC endpoint URL and login details
client = Client('http://wp-ep.test/xmlrpc.php', 'admin', 'secret')
# Titles of the products already in WordPress; filled in by getAllPosts()
existingPostTitles = []
# Main routine: walks the category page and imports every new product
def getProductTitle():
    """Scrape each product on the open category page and import the new ones.

    For every ``article`` in the product list this reads the title and link,
    skips the product when its title is already in ``existingPostTitles``,
    and otherwise opens the product page in a new tab, scrapes the
    description and image URL, uploads the image, creates the WordPress
    post, then closes the tab and returns to the listing window.

    Uses the module-level ``driver`` and ``existingPostTitles``. Returns
    nothing; progress is reported with ``print``.
    """
    # Target by xpath the box that has the product elements in it
    productlist = driver.find_element_by_xpath('/html/body/div/div[3]/div/div/section/div/section/ul')
    products = productlist.find_elements_by_tag_name('article')
    print('Found ' + str(len(products)) + ' Products')
    # Fetch all current products so existingPostTitles is filled before comparing
    getAllPosts()
    # Iterate through the products in the box
    for product in products:
        # One lookup of the heading link serves both the title and the URL
        link = product.find_element_by_xpath('div/div/header/h3/a')
        prod_title = link.text
        rurl = link.get_attribute('href')
        print('Checking if ' + prod_title + ' exists')
        # Skip products whose title is already known
        if find_id(prod_title, existingPostTitles):
            print("Sorry, we already have such a post():")
            print('\n')
            continue
        # Open the product URL to get the description and image link
        print('Found Product URL: ' + rurl)
        openProductWindow(rurl)
        description = getProductDescription()
        print('Found Product Description: ' + description)
        img_url = getImageLink()
        print('Found Image Url: ' + img_url)
        print('Downloading Image')
        print('Zzz 3 secs.. Uploading..')
        time.sleep(3)
        attachment_id = imageUpload(img_url)
        makeWpPost(prod_title, description, attachment_id)
        # Remember the title so a product listed twice on the page is not
        # imported twice within the same run
        existingPostTitles.append(prod_title)
        # Close the product tab and go back to the listing window
        driver.close()
        time.sleep(5)
        driver.switch_to.window(driver.window_handles[0])
# Gets the titles of all existing WP products
def getAllPosts():
    """Append the title of every WordPress ``product`` post to ``existingPostTitles``.

    Posts are fetched through the module-level ``client`` one page at a
    time until an empty page comes back. Titles are appended to the
    module-level list, so calling this more than once adds them again.
    Returns nothing.
    """
    offset = 0
    increment = 10
    while True:
        # Ask for exactly ``increment`` posts per call so the page size always
        # matches the offset step instead of relying on the server default.
        query = {'post_type': 'product', 'number': increment, 'offset': offset}
        page = client.call(GetPosts(query))
        if not page:
            break  # no more posts returned
        existingPostTitles.extend(post.title for post in page)
        offset += increment
# Check a scraped product title against the existing product titles
def find_id(prod_title, existingPostTitles):
    """Report whether ``prod_title`` is already among ``existingPostTitles``.

    Args:
        prod_title: Title scraped from the product page.
        existingPostTitles: Iterable of titles already known.

    Returns:
        ``True`` when an equal title is present (the title is also printed),
        ``False`` otherwise, including when the collection is empty.
    """
    if prod_title in existingPostTitles:
        print(prod_title)
        return True
    return False
# Open the product in a new tab and switch to it
def openProductWindow(rurl):
    """Open ``rurl`` in a new browser tab and make that tab the active window.

    Args:
        rurl: URL of the product page to open.
    """
    # Hand the URL to the script as an argument instead of splicing it into
    # the JavaScript source, so quotes or backslashes in a scraped href
    # cannot break or inject into the script.
    driver.execute_script('window.open(arguments[0], "_blank");', rurl)
    # The last window handle is taken to be the tab just opened
    driver.switch_to.window(driver.window_handles[-1])
# Read the product description from the open product page
def getProductDescription():
    """Return the text of the description element on the current page."""
    # Set the driver's implicit wait to 5 before looking the element up
    driver.implicitly_wait(5)
    # Absolute XPath of the description item on the single product page
    description_xpath = '/html/body/div[1]/div[3]/div/div/div/div/div/div[2]/ul/li[2]'
    return driver.find_element_by_xpath(description_xpath).text
# Read the product image link from the open product page
def getImageLink():
    """Return the ``href`` attribute of the element whose id is ``cloudZoom``."""
    zoom_link = driver.find_element_by_xpath('//*[@id="cloudZoom"]')
    return zoom_link.get_attribute("href")
# Download the image and upload it to WordPress
def imageUpload(img_url):
    """Download the image at ``img_url`` and upload it as a WordPress media file.

    The image is saved in the current working directory under its own file
    name, read back, and sent through the module-level ``client``.

    Args:
        img_url: URL of the image; any query string is ignored.

    Returns:
        The ``id`` field of the upload response.
    """
    # Drop the query string first and only then take the last path segment;
    # taking the segment from the raw URL gives the wrong file name whenever
    # the query string itself contains a '/'.
    cleanImgUrl = img_url.split('?')[0]
    cleanImgName = cleanImgUrl.split('/')[-1]
    # Cloudflare proxy may get in the way without a useragent
    urllib.URLopener.version = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11'
    urllib.urlretrieve(cleanImgUrl, cleanImgName)
    # MIME type guessed from the file extension (None when it is unknown)
    imageType = mimetypes.guess_type(cleanImgUrl)[0]
    print(imageType)
    data = {
        'name': cleanImgName,
        'type': imageType,
    }
    with open(cleanImgName, 'rb') as img:
        data['bits'] = xmlrpc_client.Binary(img.read())
    print(data)
    # Reuse the module-level client rather than building a second one with
    # its own copy of the site URL and credentials on every upload.
    response = client.call(media.UploadFile(data))
    return response['id']
# Build a post from the scraped data and send it to WordPress
def makeWpPost(prod_title, description, attachment_id):
    """Create a draft post of type ``product`` through the module-level client.

    Args:
        prod_title: Used as the post title.
        description: Used as the post content.
        attachment_id: Used as the post thumbnail.
    """
    product = WordPressPost()
    product.post_type = "product"
    product.post_status = "draft"
    product.title = prod_title
    product.content = description
    product.thumbnail = attachment_id
    client.call(posts.NewPost(product))
    print('Product added: ' + prod_title)
    print('\n')
# Start the main function, then always shut the browser session down so no
# browser or driver process is left running when the script ends or fails
try:
    getProductTitle()
finally:
    driver.quit()