forked from scottylabs-labrador/BoilerGramWeb
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscraperTemplate.py
29 lines (23 loc) · 937 Bytes
/
scraperTemplate.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
# Template web scraper: fetches one page, collects the link target and
# displayed text of every <a class="cta"> element, and prints the first
# result of each.

# requests performs the HTTP GET that downloads the page.
import requests
# Beautiful Soup parses the downloaded HTML.
from bs4 import BeautifulSoup

# URL of the page to be scraped.
url = 'https://www.cmu.edu/math/research/index.html'

# Parallel lists holding the scraped data before it is (potentially) moved
# elsewhere: professor_links[i] belongs to professor_names[i].
professor_links = []
professor_names = []

# Download the page. The timeout keeps the script from hanging forever on an
# unresponsive server, and raise_for_status() stops on a 4xx/5xx response
# instead of silently parsing an error page.
page = requests.get(url, timeout=10)
page.raise_for_status()

# Parse the HTML of the downloaded page.
soup = BeautifulSoup(page.content, "html.parser")

# Iterate over every <a> tag whose class is 'cta'. Requiring 'href': True
# skips anchors without a link target, so vals['href'] cannot raise KeyError
# and the two lists stay the same length.
for vals in soup.find_all('a', attrs={'class': 'cta', 'href': True}):
    # Append the link target to the links list.
    professor_links.append(vals['href'])
    # Append the displayed text (professor's name) to the names list.
    # NOTE: .string is None when the anchor contains more than one child
    # node (e.g. nested tags), so a name entry may be None.
    professor_names.append(vals.string)

# Print the first value of each list, guarding against an empty result so a
# page without matching anchors does not end in an IndexError.
if professor_links:
    print(professor_links[0])
    print(professor_names[0])
else:
    print(f"No <a class='cta'> links found at {url}")