-
Notifications
You must be signed in to change notification settings - Fork 0
/
getReview.py
81 lines (76 loc) · 3.4 KB
/
getReview.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import re
from bs4 import BeautifulSoup
import selenium.common.exceptions
# Scrape course reviews from icourse163.org and append them to pos.txt.
#
# The search-result page for the query encoded in `url` is loaded with one
# headless Chrome instance; every course card found there is opened in a
# second instance, its review tab is clicked and the review pager is walked.
# Each review is written as one line: '0' + text when its rating widget has
# fewer than 3 star icons, '1' + text when it has more than 3. Reviews with
# exactly 3 star icons are not written.
#
# NOTE(review): `executable_path=` and `find_element_by_*` are the Selenium 3
# API the script was written against; they are kept unchanged here.


def _max_page_number(soup):
    """Return the largest numeric label among the pager links in *soup* (0 if none)."""
    highest = 0
    for item in soup.find_all('li', {"class": "ux-pager_itm"}):
        for link in item.find_all('a'):
            try:
                highest = max(highest, int(link.text))
            except ValueError:
                # Non-numeric pager entries cannot be page numbers; ignore them.
                continue
    return highest


def _write_labelled_reviews(soup, out):
    """Write the labelled reviews found in *soup* to *out*.

    Returns a ``(negative, positive)`` tuple with the number of lines written
    with the '0' and the '1' label respectively.
    """
    # Elements holding the text of each review item.
    comments = soup.find_all(
        'div',
        {'class': 'ux-mooc-comment-course-comment_comment-list_item_body_content'})
    stars = soup.find_all('div', {'class': 'ux-rating-star'})
    negative = 0
    positive = 0
    # stars[0] is the rating of the course itself, not of a review, so the
    # review ratings start at index 1. zip() stops at the shorter sequence,
    # which avoids an IndexError when a rating widget is missing.
    for body, star in zip(comments, stars[1:]):
        rate = len(star.find_all('i', {'class': 'ux-icon-custom-rating-favorite'}))
        spans = body.find_all('span')
        if not spans:
            continue
        text = spans[0].text.replace("\n", "")
        if rate < 3:
            out.write('0' + text + '\n')
            negative += 1
        elif rate > 3:
            out.write('1' + text + '\n')
            positive += 1
    return negative, positive


chrome_options = Options()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')
driver = webdriver.Chrome(options=chrome_options, executable_path=r"D:\mySoftware\chromedriver.exe")
driver1 = webdriver.Chrome(options=chrome_options, executable_path=r"D:\mySoftware\chromedriver.exe")
neg = 0
pos = 0  # was never initialised, so the first positive review raised NameError
try:
    url = "https://www.icourse163.org/search.htm?search=%E5%8C%97%E4%BA%AC%E9%82%AE%E7%94%B5%E5%A4%A7%E5%AD%A6#/"
    driver.get(url)
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    courseList = soup.find_all('div', {"class": 'm-course-list'})
    # No course list in the page source means there is nothing to scrape.
    courseList1 = courseList[0].find_all('div', {"class": 'ga-click'}) if courseList else []
    with open("pos.txt", "a", encoding='utf-8') as f:
        for course in courseList1:
            courseTag = course.get('data-href')
            # Skip cards without a link and the "/course/undefine..." placeholders.
            if not courseTag or courseTag.startswith("/course/undefine"):
                continue
            url1 = "https://www.icourse163.org" + courseTag
            print(url1)
            driver1.get(url1)
            try:
                # Simulate the browser click that opens the course-review tab.
                driver1.find_element_by_id("review-tag-button").click()
                nextPage = driver1.find_element_by_class_name("ux-pager_btn__next")
            except selenium.common.exceptions.NoSuchElementException:
                # No review tab or no "next page" button: nothing to page through.
                continue
            pages = _max_page_number(BeautifulSoup(driver1.page_source, 'html.parser'))
            # NOTE(review): as in the original, the page shown first and the
            # last page are never written (only pages 2..pages-1 are); confirm
            # whether that is intentional before widening the range.
            for j in range(1, pages):
                print("课程中的第%d页评论" % j)
                soup1 = BeautifulSoup(driver1.page_source, 'html.parser')
                if j > 1:
                    written_neg, written_pos = _write_labelled_reviews(soup1, f)
                    neg += written_neg
                    pos += written_pos
                nextPage.click()
            # Stop visiting further courses once enough negative reviews exist.
            if neg > 1000:
                break
finally:
    # Always shut both browsers down, even when scraping failed midway.
    try:
        driver.quit()
    finally:
        driver1.quit()
print(neg)