-
Notifications
You must be signed in to change notification settings - Fork 0
/
ecommerce_h2.py
53 lines (45 loc) · 1.81 KB
/
ecommerce_h2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
"""Convert locally saved Terms-of-Service HTML pages into plain-text files.

For every ``*.html`` file in ``directory`` the script:
  1. Googles the file's path to locate a corresponding live page.
     NOTE(review): searching Google for a local Windows filesystem path is
     unlikely to match the intended site — the query probably should be the
     page title or original URL; TODO confirm with the author.
  2. Downloads the first hit with ``requests``.
  3. Writes the text of every ``<p>`` tag on that page to ``<path>.txt``
     next to the original HTML file.
"""
import os
import requests
from bs4 import BeautifulSoup
import sys, io  # NOTE(review): io/HTMLParser appear unused; kept in case other tooling relies on them
from html.parser import HTMLParser

try:
    from googlesearch import search
except ImportError:
    # BUG FIX: the imported module is 'googlesearch' (the message previously
    # named 'google'), and the script must stop here — continuing would raise
    # NameError on `search` below.
    sys.exit("No module named 'googlesearch' found")

# Directory holding the saved HTML pages to process.
directory = r'C:\Users\lir6\Desktop\ClearTermsWebScraper\Dev'

for entry in os.scandir(directory):
    if entry.path.endswith(".html") and entry.is_file():
        # BUG FIX: the original opened entry.path for reading but never read
        # from the handle; the dead `open` has been removed.
        # num=1/stop=1 limits the Google lookup to the single top result.
        for url in search(entry.path, tld="co.in", num=1, stop=1, pause=5):
            response = requests.get(url)
            # html5lib: lenient parser that tolerates malformed markup.
            soup = BeautifulSoup(response.content, 'html5lib')
            txt_file_name = entry.path + '.txt'
            with open(txt_file_name, 'w', encoding='utf-8') as txt:
                # BUG FIX: write str, not bytes — the file is opened in text
                # mode with utf-8 encoding, so the original
                # `row.text.encode('utf-8')` raised TypeError. The redundant
                # txt.close() inside the `with` block was also dropped.
                for paragraph in soup.find_all("p"):
                    txt.write(paragraph.text)