-
Notifications
You must be signed in to change notification settings - Fork 0
/
ecommerce_html.py
39 lines (31 loc) · 1.21 KB
/
ecommerce_html.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
import csv
import requests
# from bs4 import BeautifulSoup
import sys, io
import urllib.request
# from urllib.request import Request, urllib
# finished - loops through a csv file (sample) that contains company website urls,
# does a google search for the terms and conditions of each company,
# extracts the html of the resulting webpage and saves it as either an html or txt file
# `search` is provided by the third-party `googlesearch` module. Every row of
# the script needs it, so stop right here with the message instead of carrying
# on and failing later with a NameError at the first search() call.
try:
    from googlesearch import search
except ImportError:
    print("No module named 'google' found")
    sys.exit(1)
# Walk the CSV of company sites, look up each site's terms-and-conditions page
# through Google, and save that page's HTML as "<site>.txt" in the working
# directory. Rows that cannot be processed are reported and skipped so one bad
# entry does not abort the whole run.
# newline='' is what the csv module asks for when it is handed a file object.
with open('./sample.csv', 'r', encoding='utf-8', newline='') as _filehandler:
    csv_reader = csv.reader(_filehandler)
    for row in csv_reader:
        # Blank lines come back as [] and would make row[0] raise IndexError.
        if not row or not row[0].strip():
            continue
        site = row[0].strip()

        # Google's site: operator only applies when nothing separates it from
        # the domain, so the value is appended without a space.
        query = "terms and conditions site:" + site

        # Only the first hit is wanted (num=1, stop=1). Defaulting to None for
        # every row keeps a site with no results from silently re-downloading
        # the URL found for the previous row.
        hits = search(query, tld="co.in", num=1, stop=1, pause=5)
        url = next(iter(hits), None)
        if url is None:
            print("No search result for " + site + "; skipping")
            continue

        # A timeout stops one unresponsive server from hanging the whole run.
        try:
            r = requests.get(url, timeout=30)
        except requests.RequestException as err:
            print("Could not fetch " + url + ": " + str(err))
            continue
        if not r.ok:
            # The body is still saved, but it is probably an error page.
            print("Warning: " + url + " returned HTTP " + str(r.status_code))

        # NOTE(review): the CSV value is used verbatim as the file name; values
        # containing path separators (e.g. full "https://..." URLs) would make
        # open() fail -- confirm the CSV holds bare domains.
        txt_file_name = site + ".txt"

        # save as txt (the with-block closes the file; no explicit close needed)
        with open(txt_file_name, 'w', encoding='utf-8') as file:
            file.write(r.text)

        # save as html
        # html_name = site + ".html"
        # with open(html_name, "w", encoding='utf-8') as file:
        #     file.write(r.text)