-
Notifications
You must be signed in to change notification settings - Fork 14
/
Copy pathpyfile_web_scraping.py
124 lines (90 loc) · 4.2 KB
/
pyfile_web_scraping.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
## Imports
import selenium
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.common import exceptions
import time
import io
import pandas as pd
import numpy as np
import csv
import sys
import os # For cloud
## function definition
def _launch_headless_chrome():
    """Start a headless Chrome session configured from environment variables.

    Reads ``GOOGLE_CHROME_BIN`` (path of the Chrome binary) and
    ``CHROMEDRIVER_PATH`` (path of the chromedriver executable).

    Returns:
        The ``webdriver.Chrome`` instance; the caller is responsible for
        calling ``quit()`` on it.
    """
    option = webdriver.ChromeOptions()
    # Only override the binary when the variable is set: assigning None is
    # rejected by recent Selenium releases, and leaving it unset lets
    # Selenium locate Chrome itself (handy for local runs).
    chrome_binary = os.environ.get("GOOGLE_CHROME_BIN")  # For cloud
    if chrome_binary:
        option.binary_location = chrome_binary
    option.add_argument('--headless')
    option.add_argument('-no-sandbox')
    option.add_argument("--mute-audio")
    option.add_argument("--disable-extensions")
    option.add_argument('-disable-dev-shm-usage')
    # For testing on Windows, point Service at a local chromedriver.exe instead.
    return webdriver.Chrome(
        service=Service(executable_path=os.environ.get("CHROMEDRIVER_PATH")),  # For cloud
        options=option,
    )


def _scroll_until_height_stable(driver, settle_seconds=4):
    """Scroll to the bottom repeatedly until the page stops growing.

    After each jump to the bottom the function sleeps ``settle_seconds`` and
    re-reads ``document.documentElement.scrollHeight``; it stops once two
    consecutive readings are equal, then scrolls to the bottom one last time.
    """
    # Initial nudge before the first height reading.
    driver.execute_script("window.scrollBy(0,500)", "")
    last_height = driver.execute_script("return document.documentElement.scrollHeight")
    while True:
        # Scroll down till "next load".
        driver.execute_script("window.scrollTo(0, document.documentElement.scrollHeight);")
        # Wait to load everything thus far.
        time.sleep(settle_seconds)
        new_height = driver.execute_script("return document.documentElement.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height
    # One last scroll just in case.
    driver.execute_script("window.scrollTo(0, document.documentElement.scrollHeight);")


def scrapfyt(url):
    """Scrape the title, owner and loaded comments of the video page at ``url``.

    Side effects: writes ``comments.csv`` (UTF-16, raw scrape) and
    ``Full Comments.csv`` (empty cells replaced by ``-``) in the current
    working directory, overwriting existing files.

    Args:
        url: Address of the video page to open.

    Returns:
        A 5-tuple ``(all_comments, video_title, video_owner,
        video_comment_with_replies, video_comment_without_replies)``.
        ``all_comments`` is always ``None`` (it holds the result of
        ``DataFrame.to_csv`` called with a path); the data itself is in
        ``Full Comments.csv``. The two comment counts are strings ending in
        ``' Comments'``: the first is the counter text shown on the page, the
        second is the number of rows actually scraped.

    Raises:
        IndexError: If no owner link matches on the page.
        selenium.common.exceptions.WebDriverException: Propagated from
            Selenium (e.g. a wait timeout or a missing element).
    """
    driver = _launch_headless_chrome()
    try:
        # Reduced window size: per the original author, with a maximised
        # window the right-hand recommendations column keeps comments from
        # loading correctly while scrolling.
        driver.set_window_size(960, 800)
        time.sleep(1)
        driver.get(url)
        time.sleep(2)

        ## Pause youtube video
        # The play button is toggled twice, 0.2 s apart, as in the original
        # flow; presumably this leaves playback paused -- TODO confirm.
        pause = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.CLASS_NAME, 'ytp-play-button')))
        pause.click()
        time.sleep(0.2)
        pause.click()
        time.sleep(4)

        ## Scrolling through all the comments
        _scroll_until_height_stable(driver)

        ## Scraping details
        video_title = driver.find_element(By.NAME, 'title').get_attribute('content')
        # Only the first matching link is used, so read just that element's
        # text instead of fetching the text of every match.
        video_owner = driver.find_elements(By.XPATH, '//*[@id="text"]/a')[0].text
        # Total comments with replies, as displayed by the page counter.
        video_comment_with_replies = driver.find_element(
            By.XPATH, '//*[@id="count"]/yt-formatted-string/span[1]').text + ' Comments'

        ## Scraping all the comments
        users = driver.find_elements(By.XPATH, '//*[@id="author-text"]/span')
        comments = driver.find_elements(By.XPATH, '//*[@id="content-text"]')
        # Element text must be read while the session is alive, so the raw
        # CSV is written before the browser is shut down.
        with io.open('comments.csv', 'w', newline='', encoding="utf-16") as file:
            writer = csv.writer(file, delimiter=",", quoting=csv.QUOTE_ALL)
            writer.writerow(["Username", "Comment"])
            # zip() stops at the shorter list if the two lookups disagree.
            writer.writerows(
                [username.text, comment.text]
                for username, comment in zip(users, comments))
    finally:
        # quit() ends the whole session (browser and driver), and running it
        # in ``finally`` prevents orphaned Chrome processes when a step fails.
        driver.quit()

    # Treat only empty fields as missing so that literal texts such as "NA"
    # or "null" are kept instead of being rewritten to "-".
    commentsfile = pd.read_csv(
        "comments.csv", encoding="utf-16", keep_default_na=False, na_values=[''])
    all_comments = commentsfile.replace(np.nan, '-', regex=True)
    # NOTE: to_csv() returns None when given a path, so ``all_comments`` is
    # None from here on; kept as-is so the returned tuple is unchanged.
    all_comments = all_comments.to_csv("Full Comments.csv", index=False)

    ## total comments without replies (rows actually scraped)
    video_comment_without_replies = str(len(commentsfile.axes[0])) + ' Comments'

    ## return values
    return all_comments, video_title, video_owner, video_comment_with_replies, video_comment_without_replies