From c07e824398f3030bc633305a9e0f156dc65536f5 Mon Sep 17 00:00:00 2001
From: Jida Li <77774296+jidalii@users.noreply.github.com>
Date: Fri, 11 Oct 2024 18:24:55 -0400
Subject: [PATCH 1/4] fix: update ./COLLABORATORS

---
 COLLABORATORS | 1 +
 1 file changed, 1 insertion(+)

diff --git a/COLLABORATORS b/COLLABORATORS
index a8afc06..22e6b0a 100644
--- a/COLLABORATORS
+++ b/COLLABORATORS
@@ -1,3 +1,4 @@
 Saisriram2003
 lcfmarco
 739078545
+jidalii
\ No newline at end of file

From 2282246fc2c68b2b9e646df32eb7c470adac91be Mon Sep 17 00:00:00 2001
From: Jida Li <77774296+jidalii@users.noreply.github.com>
Date: Fri, 11 Oct 2024 18:25:32 -0400
Subject: [PATCH 2/4] feat: add cfa event scraping script

---
 bu_passport/scripts/cfa_event_scraper.py | 305 +++++++++++++++++++++++
 1 file changed, 305 insertions(+)
 create mode 100644 bu_passport/scripts/cfa_event_scraper.py

diff --git a/bu_passport/scripts/cfa_event_scraper.py b/bu_passport/scripts/cfa_event_scraper.py
new file mode 100644
index 0000000..93a28e2
--- /dev/null
+++ b/bu_passport/scripts/cfa_event_scraper.py
@@ -0,0 +1,305 @@
+import re
+import requests
+from datetime import datetime
+import hashlib
+
+from dataclasses import dataclass, field
+from typing import List, Optional, Tuple
+
+import firebase_admin
+from firebase_admin import credentials, firestore
+
+from bs4 import BeautifulSoup
+import pytz
+from urllib.parse import urlparse, parse_qs
+
+
+@dataclass
+class CFAEvent:
+    event_id: str = ""
+    event_id_hex: str = ""
+    title: Optional[str] = ""
+    description: Optional[str] = ""
+    categories: List[str] = field(default_factory=list)
+    location: Optional[str] = ""
+    photo: Optional[str] = ""
+    points: int = 0  # Default points to 0
+    start_time: Optional[datetime] = None
+    end_time: Optional[datetime] = None
+    event_url: Optional[str] = ""
+    detail_url: Optional[str] = ""
+
+    def to_dict(self) -> dict:
+        return {
+            "eventID": self.event_id,
+            "eventTitle": self.title,
+            "eventCategories": self.categories,
+            "eventLocation": self.location,
+            "eventStartTime": self.start_time,
+            "eventEndTime": self.end_time,
+            "eventURL": self.event_url,
+            "eventDescription": self.description,
+            "eventPhoto": self.photo,
+            "eventPoints": 30,
+            "savedUsers": [],
+        }
+
+    def write_event_id_hex(self):
+        hash_object = hashlib.sha256()
+
+        # Encode the event_id and update the hash object
+        str_combined = f"{self.event_id}{self.start_time}"
+        hash_object.update(str_combined.encode('utf-8'))
+
+        # Get the hexadecimal representation of the hash
+        self.event_id_hex = hash_object.hexdigest()
+
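+# Walk the listing page's wrapper/content containers down to the "bulp-events"
+# section and return its <article> nodes, one per event card.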
".join([word.strip() for word in raw_span.text.split(" ")]) + return parsed_title + except: + return None + + +def scrape_event_datetime( + raw_event: str, +) -> Tuple[datetime, datetime] | Tuple[None, None]: + raw_when_span = raw_event.find("span", class_="bulp-event-when") + + def parse_date(html: str) -> str | None: + try: + raw_date = html.find("span", class_="bulp-event-meta-date") + event_days_of_week = raw_date.find(class_="bulp-event-day") + event_month = raw_date.find(class_="bulp-event-month") + event_day = raw_date.find(class_="bulp-event-date") + event_date = ( + f"{event_days_of_week.text} {event_month.text} {event_day.text}" + ) + return event_date + except Exception as e: + print("parse_date:", e) + return None + + def parse_time(html: str) -> Tuple[str, str] | Tuple[None, None]: + try: + raw_time: str = ( + html.find("span", class_="bulp-event-meta-time") + .find("span", class_="bulp-event-time") + .text.strip() + ) + if raw_time.lower() == "all day": + return "12:00am", "11:59pm" + start_time, end_time = (time.strip() for time in raw_time.split("-")) + return start_time, end_time + except Exception as e: + print("parse_time:", e) + return None, None + + def parse_daytime_range( + start_daytime: str, end_daytime: str + ) -> Tuple[datetime, datetime] | Tuple[None, None]: + boston_tz = pytz.timezone("America/New_York") + cur_time = datetime.now(boston_tz) + cur_month = cur_time.month + cur_year = cur_time.year + + def parse_daytime(date_str: str) -> datetime | None: + try: + # Remove ordinal suffixes (e.g., '12th' -> '12') + cleaned_date_str = re.sub(r"(\d+)(st|nd|rd|th)", r"\1", date_str) + parsed_date = datetime.strptime(cleaned_date_str, "%A %b %d %I:%M%p") + return boston_tz.localize(parsed_date) + except Exception as e: + print("parse_daytime:", e) + return None + + try: + start = parse_daytime(start_daytime) + end = parse_daytime(end_daytime) + if start is None or end is None: + print("empty daytime") + return start, end + + # Adjust the year based on the current month + if start.month >= cur_month: + start = start.replace(year=cur_year) + else: + start = start.replace(year=cur_year + 1) + + if end.month >= cur_month: + end = end.replace(year=cur_year) + else: + end = end.replace(year=cur_year + 1) + + return start, end + except Exception as e: + print("parse_daytime_range:", e) + return None, None + + # find date + event_date = parse_date(raw_when_span) + start_time, end_time = parse_time(raw_when_span) + + start_daytime = f"{event_date} {start_time}" + end_daytime = f"{event_date} {end_time}" + return parse_daytime_range(start_daytime, end_daytime) + + +def scrape_event_location(raw_event: str) -> str | None: + span: str = raw_event.find("span", class_="bulp-event-where") + if not span: + return None + else: + return span.text.strip() + + +def scrape_event_detail_link(raw_event: str) -> Tuple[str, str] | Tuple[None, None]: + try: + span: str = raw_event.find("div", class_="bulp-event-buttons") + if not span: + return None, None + a_tag = span.find("a", class_="bulp-event-readmore") + href = a_tag["href"] + + parsed_url = urlparse(href) + query_params = parse_qs(parsed_url.query) + eid = query_params.get("eid", [None])[0] + return f"https://www.bu.edu{href}", eid + except: + return None, None + + +def scrape_event_detail_page(soup: BeautifulSoup): + return ( + soup.find("div", class_="wrapper") + .find("main", class_="content") + .find("div", class_="content-container-narrow") + .find("article") + .find("div", class_="single-event") + ) + + +def 
+def scrape_event_datetime(
+    raw_event: str,
+) -> Tuple[datetime, datetime] | Tuple[None, None]:
+    raw_when_span = raw_event.find("span", class_="bulp-event-when")
+
+    def parse_date(html: str) -> str | None:
+        try:
+            raw_date = html.find("span", class_="bulp-event-meta-date")
+            event_days_of_week = raw_date.find(class_="bulp-event-day")
+            event_month = raw_date.find(class_="bulp-event-month")
+            event_day = raw_date.find(class_="bulp-event-date")
+            event_date = (
+                f"{event_days_of_week.text} {event_month.text} {event_day.text}"
+            )
+            return event_date
+        except Exception as e:
+            print("parse_date:", e)
+            return None
+
+    def parse_time(html: str) -> Tuple[str, str] | Tuple[None, None]:
+        try:
+            raw_time: str = (
+                html.find("span", class_="bulp-event-meta-time")
+                .find("span", class_="bulp-event-time")
+                .text.strip()
+            )
+            if raw_time.lower() == "all day":
+                return "12:00am", "11:59pm"
+            start_time, end_time = (time.strip() for time in raw_time.split("-"))
+            return start_time, end_time
+        except Exception as e:
+            print("parse_time:", e)
+            return None, None
+
+    def parse_daytime_range(
+        start_daytime: str, end_daytime: str
+    ) -> Tuple[datetime, datetime] | Tuple[None, None]:
+        boston_tz = pytz.timezone("America/New_York")
+        cur_time = datetime.now(boston_tz)
+        cur_month = cur_time.month
+        cur_year = cur_time.year
+
+        def parse_daytime(date_str: str) -> datetime | None:
+            try:
+                # Remove ordinal suffixes (e.g., '12th' -> '12')
+                cleaned_date_str = re.sub(r"(\d+)(st|nd|rd|th)", r"\1", date_str)
+                parsed_date = datetime.strptime(cleaned_date_str, "%A %b %d %I:%M%p")
+                return boston_tz.localize(parsed_date)
+            except Exception as e:
+                print("parse_daytime:", e)
+                return None
+
+        try:
+            start = parse_daytime(start_daytime)
+            end = parse_daytime(end_daytime)
+            if start is None or end is None:
+                print("empty daytime")
+                return start, end
+
+            # Adjust the year based on the current month
+            if start.month >= cur_month:
+                start = start.replace(year=cur_year)
+            else:
+                start = start.replace(year=cur_year + 1)
+
+            if end.month >= cur_month:
+                end = end.replace(year=cur_year)
+            else:
+                end = end.replace(year=cur_year + 1)
+
+            return start, end
+        except Exception as e:
+            print("parse_daytime_range:", e)
+            return None, None
+
+    # find date
+    event_date = parse_date(raw_when_span)
+    start_time, end_time = parse_time(raw_when_span)
+
+    start_daytime = f"{event_date} {start_time}"
+    end_daytime = f"{event_date} {end_time}"
+    return parse_daytime_range(start_daytime, end_daytime)
+
+
+def scrape_event_location(raw_event: str) -> str | None:
+    span: str = raw_event.find("span", class_="bulp-event-where")
+    if not span:
+        return None
+    else:
+        return span.text.strip()
+
+
+def scrape_event_detail_link(raw_event: str) -> Tuple[str, str] | Tuple[None, None]:
+    try:
+        span: str = raw_event.find("div", class_="bulp-event-buttons")
+        if not span:
+            return None, None
+        a_tag = span.find("a", class_="bulp-event-readmore")
+        href = a_tag["href"]
+
+        parsed_url = urlparse(href)
+        query_params = parse_qs(parsed_url.query)
+        eid = query_params.get("eid", [None])[0]
+        return f"https://www.bu.edu{href}", eid
+    except:
+        return None, None
+
+
+def scrape_event_detail_page(soup: BeautifulSoup):
+    return (
+        soup.find("div", class_="wrapper")
+        .find("main", class_="content")
+        .find("div", class_="content-container-narrow")
+        .find("article")
+        .find("div", class_="single-event")
+    )
+
+
+def scrape_event_image(raw_detail: str) -> str | None:
+    try:
+        raw_figure = (
+            raw_detail.find("div", class_="single-event-summary")
+            .find("div", class_="single-event-thumbnail")
+            .find("img")
+        )
+        return f"https://www.bu.edu/{raw_figure['src']}"
+    except:
+        return None
+
+
+def scrape_event_description(raw_detail) -> str | None:
+    try:
+        raw_summary = raw_detail.find("div", class_="single-event-description")
+        text_content = raw_summary.get_text(separator=" ", strip=True)
+        return text_content
+    except:
+        return None
+
+
+def scrape_event_event_link(raw_detail) -> str | None:
+    try:
+        dd_tag = (
+            raw_detail.find("div", class_="single-event-additional-details")
+            .find("dl", class_="tabular")
+            .find("dd", class_="single-event-info-url")
+        )
+        if not dd_tag:
+            return None
+
+        url = dd_tag.find("a")
+        return url["href"]
+    except:
+        return None
+
+
+def main():
+    cred = credentials.Certificate("../serviceAccountKey.json")
+    firebase_admin.initialize_app(cred)
+    db = firestore.client()
+
+    url = "https://www.bu.edu/cfa/news/bu-arts-initiative/"
+    response = requests.get(url)
+    soup = BeautifulSoup(response.content, "html.parser")
+
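+    # First pass: build a CFAEvent from every event card on the listing page.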
"eventLocation": self.location, + "eventStartTime": self.start_time, + "eventEndTime": self.end_time, + "eventURL": self.event_url, + "eventDescription": self.description, + "eventPhoto": self.photo, + "eventPoints": 30, + } def write_event_id_hex(self): hash_object = hashlib.sha256() @@ -248,7 +262,7 @@ def scrape_event_event_link(raw_detail) -> str | None: return None -def main(): +def main(table_name: str): cred = credentials.Certificate("../serviceAccountKey.json") firebase_admin.initialize_app(cred) db = firestore.client() @@ -280,7 +294,7 @@ def main(): except Exception as e: print(f"Error extracting slide data: {e}") - for i, event in enumerate(cfa_events): + for _, event in enumerate(cfa_events): if not event.detail_url: continue response = requests.get(event.detail_url) @@ -293,13 +307,13 @@ def main(): for i, event in enumerate(cfa_events): - doc_ref = db.collection("test_events").document(event.event_id_hex) + doc_ref = db.collection(table_name).document(event.event_id_hex) if doc_ref.get().exists: print(f"Updating event with pk {event.event_id_hex} in db") - doc_ref.set(event.to_dict(), merge=True) + doc_ref.set(event.to_dict_exist(), merge=True) else: print(f"Adding event with pk {event.event_id_hex} in db") doc_ref.set(event.to_dict()) -main() +main("test_events") diff --git a/bu_passport/scripts/requirements.txt b/bu_passport/scripts/requirements.txt new file mode 100644 index 0000000..696f964 --- /dev/null +++ b/bu_passport/scripts/requirements.txt @@ -0,0 +1,4 @@ +requests +firebase-admin +beautifulsoup4 +pytz From 382b195cfdcb8911da32ccdb065fc88cc63fa489 Mon Sep 17 00:00:00 2001 From: Jida Li <77774296+jidalii@users.noreply.github.com> Date: Thu, 24 Oct 2024 10:52:59 -0400 Subject: [PATCH 4/4] refactor: solved issues in comments --- bu_passport/scripts/cfa_event_scraper.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/bu_passport/scripts/cfa_event_scraper.py b/bu_passport/scripts/cfa_event_scraper.py index 5533b87..9340d97 100644 --- a/bu_passport/scripts/cfa_event_scraper.py +++ b/bu_passport/scripts/cfa_event_scraper.py @@ -29,7 +29,7 @@ class CFAEvent: event_url: Optional[str] = "" detail_url: Optional[str] = "" - def to_dict(self) -> dict: + def to_dict_with_empty_users(self) -> dict: return { "eventID": self.event_id, "eventTitle": self.title, @@ -44,7 +44,7 @@ def to_dict(self) -> dict: "savedUsers": [], } - def to_dict_exist(self) -> dict: + def to_dict(self) -> dict: return { "eventID": self.event_id, "eventTitle": self.title, @@ -215,7 +215,7 @@ def scrape_event_detail_link(raw_event: str) -> Tuple[str, str] | Tuple[None, No return None, None -def scrape_event_detail_page(soup: BeautifulSoup): +def scrape_detail_page(soup: BeautifulSoup): return ( soup.find("div", class_="wrapper") .find("main", class_="content") @@ -266,6 +266,8 @@ def main(table_name: str): cred = credentials.Certificate("../serviceAccountKey.json") firebase_admin.initialize_app(cred) db = firestore.client() + + print("Starting scraper") url = "https://www.bu.edu/cfa/news/bu-arts-initiative/" response = requests.get(url) @@ -299,21 +301,23 @@ def main(table_name: str): continue response = requests.get(event.detail_url) soup = BeautifulSoup(response.content, "html.parser") - raw_detail = scrape_event_detail_page(soup) + raw_detail = scrape_detail_page(soup) event.photo = scrape_event_image(raw_detail) event.description = scrape_event_description(raw_detail) event.event_url = scrape_event_event_link(raw_detail) + # update firebase db for i, event in 
+            cfa_event.write_event_id_hex()
+
+            cfa_events.append(cfa_event)
+        except Exception as e:
+            print(f"Error extracting slide data: {e}")
+
+    for i, event in enumerate(cfa_events):
+        if not event.detail_url:
+            continue
+        response = requests.get(event.detail_url)
+        soup = BeautifulSoup(response.content, "html.parser")
+        raw_detail = scrape_event_detail_page(soup)
+
+        event.photo = scrape_event_image(raw_detail)
+        event.description = scrape_event_description(raw_detail)
+        event.event_url = scrape_event_event_link(raw_detail)
+
+    for i, event in enumerate(cfa_events):
+
+        doc_ref = db.collection("test_events").document(event.event_id_hex)
+
+        if doc_ref.get().exists:
+            print(f"Updating event with pk {event.event_id_hex} in db")
+            doc_ref.set(event.to_dict(), merge=True)
+        else:
+            print(f"Adding event with pk {event.event_id_hex} in db")
+            doc_ref.set(event.to_dict())
+
+main()

From 087a395f892cdbd89080c7476d1d32bbb378459f Mon Sep 17 00:00:00 2001
From: Jida Li <77774296+jidalii@users.noreply.github.com>
Date: Wed, 16 Oct 2024 11:11:53 -0400
Subject: [PATCH 3/4] refactor: optimized event scraper

---
 bu_passport/scripts/cfa_event_scraper.py | 24 +++++++++++++++++++-----
 bu_passport/scripts/requirements.txt     |  4 ++++
 2 files changed, 23 insertions(+), 5 deletions(-)
 create mode 100644 bu_passport/scripts/requirements.txt

diff --git a/bu_passport/scripts/cfa_event_scraper.py b/bu_passport/scripts/cfa_event_scraper.py
index 93a28e2..5533b87 100644
--- a/bu_passport/scripts/cfa_event_scraper.py
+++ b/bu_passport/scripts/cfa_event_scraper.py
@@ -43,6 +43,20 @@ def to_dict(self) -> dict:
             "eventPoints": 30,
             "savedUsers": [],
         }
+
+    def to_dict_exist(self) -> dict:
+        return {
+            "eventID": self.event_id,
+            "eventTitle": self.title,
+            "eventCategories": self.categories,
+            "eventLocation": self.location,
+            "eventStartTime": self.start_time,
+            "eventEndTime": self.end_time,
+            "eventURL": self.event_url,
+            "eventDescription": self.description,
+            "eventPhoto": self.photo,
+            "eventPoints": 30,
+        }
 
     def write_event_id_hex(self):
         hash_object = hashlib.sha256()
@@ -248,7 +262,7 @@ def scrape_event_event_link(raw_detail) -> str | None:
         return None
 
 
-def main():
+def main(table_name: str):
     cred = credentials.Certificate("../serviceAccountKey.json")
     firebase_admin.initialize_app(cred)
     db = firestore.client()
@@ -280,7 +294,7 @@ def main(table_name: str):
         except Exception as e:
             print(f"Error extracting slide data: {e}")
 
-    for i, event in enumerate(cfa_events):
+    for _, event in enumerate(cfa_events):
         if not event.detail_url:
             continue
         response = requests.get(event.detail_url)
@@ -293,13 +307,13 @@ def main(table_name: str):
 
     for i, event in enumerate(cfa_events):
 
-        doc_ref = db.collection("test_events").document(event.event_id_hex)
+        doc_ref = db.collection(table_name).document(event.event_id_hex)
 
         if doc_ref.get().exists:
             print(f"Updating event with pk {event.event_id_hex} in db")
-            doc_ref.set(event.to_dict(), merge=True)
+            doc_ref.set(event.to_dict_exist(), merge=True)
         else:
             print(f"Adding event with pk {event.event_id_hex} in db")
             doc_ref.set(event.to_dict())
 
-main()
+main("test_events")

diff --git a/bu_passport/scripts/requirements.txt b/bu_passport/scripts/requirements.txt
new file mode 100644
index 0000000..696f964
--- /dev/null
+++ b/bu_passport/scripts/requirements.txt
@@ -0,0 +1,4 @@
+requests
+firebase-admin
+beautifulsoup4
+pytz

From 382b195cfdcb8911da32ccdb065fc88cc63fa489 Mon Sep 17 00:00:00 2001
From: Jida Li <77774296+jidalii@users.noreply.github.com>
Date: Thu, 24 Oct 2024 10:52:59 -0400
Subject: [PATCH 4/4] refactor: solved issues in comments

---
 bu_passport/scripts/cfa_event_scraper.py | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/bu_passport/scripts/cfa_event_scraper.py b/bu_passport/scripts/cfa_event_scraper.py
index 5533b87..9340d97 100644
--- a/bu_passport/scripts/cfa_event_scraper.py
+++ b/bu_passport/scripts/cfa_event_scraper.py
@@ -29,7 +29,7 @@ class CFAEvent:
     event_url: Optional[str] = ""
     detail_url: Optional[str] = ""
 
-    def to_dict(self) -> dict:
+    def to_dict_with_empty_users(self) -> dict:
         return {
             "eventID": self.event_id,
             "eventTitle": self.title,
@@ -44,7 +44,7 @@ def to_dict(self) -> dict:
             "savedUsers": [],
         }
 
-    def to_dict_exist(self) -> dict:
+    def to_dict(self) -> dict:
         return {
             "eventID": self.event_id,
             "eventTitle": self.title,
@@ -215,7 +215,7 @@ def scrape_event_detail_link(raw_event: str) -> Tuple[str, str] | Tuple[None, No
         return None, None
 
 
-def scrape_event_detail_page(soup: BeautifulSoup):
+def scrape_detail_page(soup: BeautifulSoup):
     return (
         soup.find("div", class_="wrapper")
         .find("main", class_="content")
@@ -266,6 +266,8 @@ def main(table_name: str):
     cred = credentials.Certificate("../serviceAccountKey.json")
     firebase_admin.initialize_app(cred)
     db = firestore.client()
+
+    print("Starting scraper")
 
     url = "https://www.bu.edu/cfa/news/bu-arts-initiative/"
     response = requests.get(url)
     soup = BeautifulSoup(response.content, "html.parser")
@@ -299,21 +301,23 @@ def main(table_name: str):
             continue
         response = requests.get(event.detail_url)
         soup = BeautifulSoup(response.content, "html.parser")
-        raw_detail = scrape_event_detail_page(soup)
+        raw_detail = scrape_detail_page(soup)
 
         event.photo = scrape_event_image(raw_detail)
         event.description = scrape_event_description(raw_detail)
         event.event_url = scrape_event_event_link(raw_detail)
 
+    # update firebase db
     for i, event in enumerate(cfa_events):
 
         doc_ref = db.collection(table_name).document(event.event_id_hex)
 
         if doc_ref.get().exists:
             print(f"Updating event with pk {event.event_id_hex} in db")
-            doc_ref.set(event.to_dict_exist(), merge=True)
+            doc_ref.set(event.to_dict(), merge=True)
         else:
             print(f"Adding event with pk {event.event_id_hex} in db")
-            doc_ref.set(event.to_dict())
+            doc_ref.set(event.to_dict_with_empty_users())
 
+    print("Event Scraping has completed")
 main("test_events")