diff --git a/bu_passport/scripts/bu_event_calendar_scraper.py b/bu_passport/scripts/bu_event_calendar_scraper.py
new file mode 100644
index 0000000..eda6c52
--- /dev/null
+++ b/bu_passport/scripts/bu_event_calendar_scraper.py
@@ -0,0 +1,306 @@
+import requests
+from datetime import datetime
+
+from dataclasses import dataclass, field
+from typing import List, Optional, Tuple, Dict
+
+import firebase_admin
+from firebase_admin import credentials, firestore
+
+from bs4 import BeautifulSoup
+import pytz
+from urllib.parse import urlparse, parse_qs
+
+
+@dataclass
+class EventSession:
+    session_id: str = ""
+    start_time: Optional[datetime] = None
+    end_time: Optional[datetime] = None
+
+    def to_dict(self):
+        return {
+            "sessionId": self.session_id,
+            "startTime": self.start_time,
+            "endTime": self.end_time,
+            "savedUsers": [],
+        }
+
+
+@dataclass
+class CFAEvent:
+    event_id: str = ""
+    title: Optional[str] = None
+    description: Optional[str] = None
+    categories: List[str] = field(default_factory=list)
+    location: Optional[str] = None
+    photo: Optional[str] = None
+    points: int = 0  # Default points to 0
+    event_url: Optional[str] = None
+    detail_url: Optional[str] = None
+    sessions: Dict[str, "EventSession"] = field(default_factory=dict)
+
+    def to_dict(self):
+        return {
+            "eventID": self.event_id,
+            "eventTitle": self.title,
+            "eventCategories": self.categories,
+            "eventLocation": self.location,
+            "eventURL": self.event_url,
+            "eventDescription": self.description,
+            "eventPhoto": self.photo,
+            "eventPoints": self.points,
+            "eventSessions": {
+                session_id: session.to_dict()
+                for session_id, session in self.sessions.items()
+            },
+        }
+
+    def to_dict_no_sessions(self):
+        return {
+            "eventID": self.event_id,
+            "eventTitle": self.title,
+            "eventLocation": self.location,
+            "eventURL": self.event_url,
+            "eventDescription": self.description,
+            "eventPhoto": self.photo,
+            "eventPoints": self.points,
+        }
+
+
+def scrape_raw_events_ls(soup: BeautifulSoup) -> list:
+    raw_events_ls = (
+        soup.find("div", class_="wrapper")
+        .find("main", class_="content")
+        .find("div", class_="content-container")
+        .find("article")
+        # .find("div", class_=lambda x: x and "bulp-content" in x)
+        # .find("div", class_=lambda x: x and "bulp-container" in x)
+        .find_all("ul", class_="calendar-list-events")
+    )
+    return raw_events_ls
+
+
+def scrape_raw_events(content):
+    raw_events = content.find_all("li", class_="calendar-list-event")
+    return raw_events
+
+
+def scrape_event_image(content):
+    img_tag = content.find("img")
+    img_link = img_tag["srcset"].split(", ")[0].split(" ")[0] if img_tag else None
+    return img_link
+
+
+def scrape_event_detail_link(content) -> Tuple[str, str, str] | Tuple[None, None, None]:
+    try:
+        event_details_tag = content.find("a", class_="bulp-event-readmore")
+        url = event_details_tag["href"] if event_details_tag else None
+        parsed_url = urlparse(url)
+        query_params = parse_qs(parsed_url.query)
+        eid = query_params.get("eid", [None])[0]
+        oid = query_params.get("oid", [None])[0]
+        return url, eid, oid
+    except Exception:
+        return None, None, None
+
+
+def get_session_id_from_url(url):
+    parsed_url = urlparse(url)
+    query_params = parse_qs(parsed_url.query)
+    return query_params.get("oid", [None])[0]
+
+
+def scrape_event_title(content):
+    event_title_tag = content.find("div", class_="calendar-list-event-link")
+    event_title = event_title_tag.text if event_title_tag else None
+    return event_title
+
+
+def scrape_detail_page(soup: BeautifulSoup):
+    return (
+        soup.find("div", class_="wrapper")
+        .find("main", class_="content")
+        .find("div", class_="content-container-narrow")
+        .find("article")
+        .find("div", class_="single-event")
+    )
+
+
+def scrape_event_description(raw_detail) -> str | None:
+    try:
+        raw_summary = raw_detail.find("div", class_="single-event-description")
+        text_content = raw_summary.get_text(separator=" ", strip=True)
+        return text_content
+    except Exception:
+        return None
+
+
+def scrape_event_event_link(raw_detail) -> str | None:
+    try:
+        dd_tag = (
+            raw_detail.find("div", class_="single-event-additional-details")
+            .find("dl", class_="tabular")
+            .find("dd", class_="single-event-info-url")
+        )
+        if not dd_tag:
+            return None
+
+        url = dd_tag.find("a")
+        return url["href"]
+    except Exception:
+        return None
+
+
+def scrape_session_location(raw_detail):
+    try:
+        # Find the dd tag with the location class
+        dd_tag = raw_detail.find("dd", class_="single-event-info-location")
+
+        # Check if the dd tag exists and return its text
+        if dd_tag:
+            return dd_tag.text.strip()
+        return None
+    except Exception as e:
+        print(f"Error occurred: {e}")
+        return None
+
+
+def scrape_session_datetime(raw_detail):
+    def parse_datetime(start_date, start_time, end_date, end_time):
+        # Define Boston timezone
+        try:
+            boston_tz = pytz.timezone("America/New_York")
+            # Parse the start and end times into naive datetime objects
+            start_time_naive = datetime.strptime(
+                f"{start_date} {start_time}", "%A, %B %d, %Y %I:%M %p"
+            )
+            end_time_naive = datetime.strptime(
+                f"{end_date} {end_time}", "%A, %B %d, %Y %I:%M %p"
+            )
+
+            # Localize them to Boston timezone to make them timezone-aware
+            start_time = boston_tz.localize(start_time_naive)
+            end_time = boston_tz.localize(end_time_naive)
+            return start_time, end_time
+        except Exception as e:
+            print(e)
+            return None, None
+
+    all_day_tag = raw_detail.find("li", class_="single-event-schedule-allday")
+    if all_day_tag:
+        date_text = all_day_tag.find("span", class_="single-event-date").text
+        return parse_datetime(date_text, "12:00 am", date_text, "11:59 pm")
+    else:
+        start_tag = raw_detail.find("li", class_="single-event-schedule-start")
+        end_tag = raw_detail.find("li", class_="single-event-schedule-end")
+        if start_tag and end_tag:
+            start_time = start_tag.find("span", class_="single-event-time").text
+            start_date = start_tag.find("span", class_="single-event-date").text
+
+            end_time = end_tag.find("span", class_="single-event-time").text
+            end_date = end_tag.find("span", class_="single-event-date").text
+            return parse_datetime(start_date, start_time, end_date, end_time)
+    return None, None
+
+
+def initialize_firestore() -> firestore.client:
+    """Initialize Firestore client."""
+    cred = credentials.Certificate("../serviceAccountKey.json")
+    firebase_admin.initialize_app(cred)
+    return firestore.client()
+
+
+def update_database(db, cfa_events, table_name):
+    for event in cfa_events:
+
+        doc_ref = db.collection(table_name).document(event.event_id)
+        doc = doc_ref.get()
+
+        if doc.exists:
+            print(f"Updating event with pk {event.event_id} in db")
+            existing_data = doc.to_dict()
+
+            # Update only the new sessions in eventSessions
+            existing_sessions = existing_data.get("eventSessions", {})
+            updated_sessions = event.sessions.copy()
+
+            # Skip existing sessions
+            for session_id in existing_sessions:
+                updated_sessions.pop(session_id, None)
+
+            # Persist merged sessions only when there are new ones
+            if updated_sessions:
+                existing_sessions.update({
+                    session_id: session.to_dict()
+                    for session_id, session in updated_sessions.items()
+                })
+                doc_ref.set({"eventSessions": existing_sessions}, merge=True)
+
+            # Merge attributes without overwriting eventSessions
+            doc_ref.set(event.to_dict_no_sessions(), merge=True)
+
+        else:
+            print(f"Adding event with pk {event.event_id} in db")
+            doc_ref.set(event.to_dict())
+
+
+def main(table_name: str):
+    db = initialize_firestore()
+    print("Starting scraper")
+
+    url = "https://www.bu.edu/cfa/news/calendar/?amp%3B&topic=8639&date=20241024"
+    response = requests.get(url)
+    soup = BeautifulSoup(response.content, "html.parser")
+
+    cfa_events: list[CFAEvent] = []
+
+    raw_events_ls = scrape_raw_events_ls(soup)
+    for _raw_events in raw_events_ls:
+        raw_events = scrape_raw_events(_raw_events)
+        for raw_event in raw_events:
+            cfa_event = CFAEvent()
+
+            cfa_event.photo = scrape_event_image(raw_event)
+            cfa_event.detail_url, cfa_event.event_id, session_id = (
+                scrape_event_detail_link(raw_event)
+            )
+            if cfa_event.event_id:
+                if not session_id:
+                    session_id = "0"
+                cfa_event.sessions[session_id] = EventSession(session_id=session_id)
+
+            cfa_event.title = scrape_event_title(raw_event)
+
+            cfa_events.append(cfa_event)
+
+    for event in cfa_events:
+        if not event.detail_url:
+            continue
+        response = requests.get(event.detail_url)
+        soup = BeautifulSoup(response.content, "html.parser")
+        raw_detail = scrape_detail_page(soup)
+
+        event.description = scrape_event_description(raw_detail)
+        event.event_url = scrape_event_event_link(raw_detail)
+        event.location = scrape_session_location(raw_detail)
+        session_id = get_session_id_from_url(event.detail_url)
+        # print(event.detail_url)
+        if not session_id:
+            session_id = "0"
+        event.sessions[session_id].start_time, event.sessions[session_id].end_time = (
+            scrape_session_datetime(raw_detail)
+        )
+        print(event.location)
+
+    update_database(db, cfa_events, table_name)
+
+    print("Event Scraping has completed")
+
+
+if __name__ == "__main__":
+    main("new_events1")
diff --git a/bu_passport/scripts/cfa_event_scraper.py b/bu_passport/scripts/cfa_event_scraper.py
index 9340d97..63d84b5 100644
--- a/bu_passport/scripts/cfa_event_scraper.py
+++ b/bu_passport/scripts/cfa_event_scraper.py
@@ -1,10 +1,8 @@
-import re
 import requests
 from datetime import datetime
-import hashlib
 
 from dataclasses import dataclass, field
-from typing import List, Optional, Tuple
+from typing import List, Optional, Tuple, Dict
 
 import firebase_admin
 from firebase_admin import credentials, firestore
@@ -14,59 +12,76 @@
 from urllib.parse import urlparse, parse_qs
 
 
+def get_session_id_from_url(url):
+    parsed_url = urlparse(url)
+    query_params = parse_qs(parsed_url.query)
+    return query_params.get("oid", [None])[0]
+
+
+@dataclass
+class EventSession:
+    session_id: str = ""
+    start_time: Optional[datetime] = None
+    end_time: Optional[datetime] = None
+
+    def to_dict(self):
+        return {
+            "sessionId": self.session_id,
+            "startTime": self.start_time,
+            "endTime": self.end_time,
+            "savedUsers": [],
+        }
+
+
 @dataclass
 class CFAEvent:
     event_id: str = ""
-    event_id_hex: str = ""
-    title: Optional[str] = ""
-    description: Optional[str] = ""
+    title: Optional[str] = None
+    description: Optional[str] = None
     categories: List[str] = field(default_factory=list)
-    location: Optional[str] = ""
-    photo: Optional[str] = ""
+    location: Optional[str] = None
+    photo: Optional[str] = None
     points: int = 0  # Default points to 0
     start_time: Optional[datetime] = None
    end_time: Optional[datetime] = None
-    event_url: Optional[str] = ""
-    detail_url: Optional[str] = ""
+    event_url: Optional[str] = None
+    detail_url: Optional[str] = None
+    sessions: Dict[str, "EventSession"] = field(default_factory=dict)
 
-    def to_dict_with_empty_users(self) -> dict:
+    def to_dict(self) -> dict:
         return {
             "eventID": self.event_id,
             "eventTitle": self.title,
             "eventCategories": self.categories,
             "eventLocation": self.location,
-            "eventStartTime": self.start_time,
-            "eventEndTime": self.end_time,
             "eventURL": self.event_url,
             "eventDescription": self.description,
             "eventPhoto": self.photo,
             "eventPoints": 30,
-            "savedUsers": [],
+            "eventSessions": {
+                session_id: session.to_dict()
+                for session_id, session in self.sessions.items()
+            },
         }
-
-    def to_dict(self) -> dict:
+
+    def to_dict_no_sessions(self):
         return {
             "eventID": self.event_id,
             "eventTitle": self.title,
             "eventCategories": self.categories,
             "eventLocation": self.location,
-            "eventStartTime": self.start_time,
-            "eventEndTime": self.end_time,
             "eventURL": self.event_url,
             "eventDescription": self.description,
             "eventPhoto": self.photo,
-            "eventPoints": 30,
+            "eventPoints": 0,
         }
 
-    def write_event_id_hex(self):
-        hash_object = hashlib.sha256()
-        # Encode the event_id and update the hash object
-        str_combined = f"{self.event_id}{self.start_time}"
-        hash_object.update(str_combined.encode('utf-8'))
+def fetch_and_parse_url(url: str) -> BeautifulSoup:
+    """Fetch content from a URL and parse it with BeautifulSoup."""
+    response = requests.get(url)
+    return BeautifulSoup(response.content, "html.parser")
 
-        # Get the hexadecimal representation of the hash
-        self.event_id_hex = hash_object.hexdigest()
 
 def scrape_raw_events(soup: BeautifulSoup) -> str:
     raw_events = (
@@ -88,13 +103,15 @@ def scrape_event_categories(raw_event: str) -> list[str]:
             "span", class_=lambda x: x and "bulp-event-topic" in x
         )
         if raw_topic_span:
-            event_topics = raw_topic_span.find_all("span", class_="bulp-event-topic-text")
+            event_topics = raw_topic_span.find_all(
+                "span", class_="bulp-event-topic-text"
+            )
             categories = [event.text for event in event_topics]
             return categories
         return []
     except:
         return []
-    
+
 
 def scrape_event_title(raw_event: str) -> str | None:
     try:
@@ -107,88 +124,42 @@ def scrape_event_title(raw_event: str) -> str | None:
         return None
 
 
-def scrape_event_datetime(
-    raw_event: str,
-) -> Tuple[datetime, datetime] | Tuple[None, None]:
-    raw_when_span = raw_event.find("span", class_="bulp-event-when")
-
-    def parse_date(html: str) -> str | None:
+def scrape_session_datetime(raw_detail):
+    def parse_datetime(start_date, start_time, end_date, end_time):
+        # Define Boston timezone
         try:
-            raw_date = html.find("span", class_="bulp-event-meta-date")
-            event_days_of_week = raw_date.find(class_="bulp-event-day")
-            event_month = raw_date.find(class_="bulp-event-month")
-            event_day = raw_date.find(class_="bulp-event-date")
-            event_date = (
-                f"{event_days_of_week.text} {event_month.text} {event_day.text}"
+            boston_tz = pytz.timezone("America/New_York")
+            # Parse the start and end times into naive datetime objects
+            start_time_naive = datetime.strptime(
+                f"{start_date} {start_time}", "%A, %B %d, %Y %I:%M %p"
            )
-            return event_date
-        except Exception as e:
-            print("parse_date:", e)
-            return None
-
-    def parse_time(html: str) -> Tuple[str, str] | Tuple[None, None]:
-        try:
-            raw_time: str = (
-                html.find("span", class_="bulp-event-meta-time")
-                .find("span", class_="bulp-event-time")
-                .text.strip()
+            end_time_naive = datetime.strptime(
+                f"{end_date} {end_time}", "%A, %B %d, %Y %I:%M %p"
             )
-            if raw_time.lower() == "all day":
-                return "12:00am", "11:59pm"
-            start_time, end_time = (time.strip() for time in raw_time.split("-"))
-            return start_time, end_time
-        except Exception as e:
-            print("parse_time:", e)
-            return None, None
 
-    def parse_daytime_range(
-        start_daytime: str, end_daytime: str
-    ) -> Tuple[datetime, datetime] | Tuple[None, None]:
-        boston_tz = pytz.timezone("America/New_York")
-        cur_time = datetime.now(boston_tz)
-        cur_month = cur_time.month
-        cur_year = cur_time.year
-
-        def parse_daytime(date_str: str) -> datetime | None:
-            try:
-                # Remove ordinal suffixes (e.g., '12th' -> '12')
-                cleaned_date_str = re.sub(r"(\d+)(st|nd|rd|th)", r"\1", date_str)
-                parsed_date = datetime.strptime(cleaned_date_str, "%A %b %d %I:%M%p")
-                return boston_tz.localize(parsed_date)
-            except Exception as e:
-                print("parse_daytime:", e)
-                return None
-
-        try:
-            start = parse_daytime(start_daytime)
-            end = parse_daytime(end_daytime)
-            if start is None or end is None:
-                print("empty daytime")
-                return start, end
-
-            # Adjust the year based on the current month
-            if start.month >= cur_month:
-                start = start.replace(year=cur_year)
-            else:
-                start = start.replace(year=cur_year + 1)
-
-            if end.month >= cur_month:
-                end = end.replace(year=cur_year)
-            else:
-                end = end.replace(year=cur_year + 1)
-
-            return start, end
+            # Localize them to Boston timezone to make them timezone-aware
+            start_time = boston_tz.localize(start_time_naive)
+            end_time = boston_tz.localize(end_time_naive)
+            return start_time, end_time
         except Exception as e:
-            print("parse_daytime_range:", e)
+            print(e)
             return None, None
 
-    # find date
-    event_date = parse_date(raw_when_span)
-    start_time, end_time = parse_time(raw_when_span)
+    all_day_tag = raw_detail.find("li", class_="single-event-schedule-allday")
+    if all_day_tag:
+        date_text = all_day_tag.find("span", class_="single-event-date").text
+        return parse_datetime(date_text, "12:00 am", date_text, "11:59 pm")
+    else:
+        start_tag = raw_detail.find("li", class_="single-event-schedule-start")
+        end_tag = raw_detail.find("li", class_="single-event-schedule-end")
+        if start_tag and end_tag:
+            start_time = start_tag.find("span", class_="single-event-time").text
+            start_date = start_tag.find("span", class_="single-event-date").text
 
-    start_daytime = f"{event_date} {start_time}"
-    end_daytime = f"{event_date} {end_time}"
-    return parse_daytime_range(start_daytime, end_daytime)
+            end_time = end_tag.find("span", class_="single-event-time").text
+            end_date = end_tag.find("span", class_="single-event-date").text
+            return parse_datetime(start_date, start_time, end_date, end_time)
+    return None, None
 
 
 def scrape_event_location(raw_event: str) -> str | None:
@@ -203,16 +174,17 @@ def scrape_event_detail_link(raw_event: str) -> Tuple[str, str] | Tuple[None, No
     try:
         span: str = raw_event.find("div", class_="bulp-event-buttons")
         if not span:
-            return None, None
+            return None, None, None
 
         a_tag = span.find("a", class_="bulp-event-readmore")
         href = a_tag["href"]
         parsed_url = urlparse(href)
         query_params = parse_qs(parsed_url.query)
         eid = query_params.get("eid", [None])[0]
-        return f"https://www.bu.edu{href}", eid
+        oid = query_params.get("oid", [None])[0]
+        return f"https://www.bu.edu{href}", eid, oid
     except:
-        return None, None
+        return None, None, None
 
 
 def scrape_detail_page(soup: BeautifulSoup):
@@ -262,62 +234,124 @@ def scrape_event_event_link(raw_detail) -> str | None:
         return None
 
 
-def main(table_name: str):
-    cred = credentials.Certificate("../serviceAccountKey.json")
-    firebase_admin.initialize_app(cred)
-    db = firestore.client()
-
-    print("Starting scraper")
+def extract_event_data(raw_event) -> CFAEvent:
+    """Extract data from a raw event and return a CFAEvent object."""
+    cfa_event = CFAEvent()
+    raw_event = raw_event.find("div", class_=lambda x: x and "bulp-item-content" in x)
 
-    url = "https://www.bu.edu/cfa/news/bu-arts-initiative/"
-    response = requests.get(url)
-    soup = BeautifulSoup(response.content, "html.parser")
+    cfa_event.categories = scrape_event_categories(raw_event)
+    cfa_event.title = scrape_event_title(raw_event)
+    cfa_event.location = scrape_event_location(raw_event)
+
+    cfa_event.detail_url, cfa_event.event_id, session_id = scrape_event_detail_link(
+        raw_event
+    )
+    if session_id:
+        cfa_event.sessions[session_id] = EventSession(session_id=session_id)
+    return cfa_event
+
+
+def scrape_events(soup: BeautifulSoup) -> List[CFAEvent]:
+    """Scrape events from the soup and return a list of CFAEvent objects."""
     raw_events = scrape_raw_events(soup)
-    cfa_events: list[CFAEvent] = []
 
-    # Iterate through each slick-slide and extract content
+    cfa_events: List[CFAEvent] = []
+
     for raw_event in raw_events:
         try:
-            cfa_event = CFAEvent()
-            raw_event = raw_event.find(
-                "div", class_=lambda x: x and "bulp-item-content" in x
-            )
-            cfa_event.categories = scrape_event_categories(raw_event)
-            cfa_event.title = scrape_event_title(raw_event)
-
-            cfa_event.start_time, cfa_event.end_time = scrape_event_datetime(raw_event)
-            cfa_event.location = scrape_event_location(raw_event)
-            cfa_event.detail_url, cfa_event.event_id = scrape_event_detail_link(
-                raw_event
-            )
-            cfa_event.write_event_id_hex()
-
+            cfa_event = extract_event_data(raw_event)
             cfa_events.append(cfa_event)
         except Exception as e:
-            print(f"Error extracting slide data: {e}")
+            print(f"Error extracting event data: {e}")
+
+    return cfa_events
+
 
+def initialize_firestore() -> firestore.client:
+    """Initialize Firestore client."""
+    cred = credentials.Certificate("../serviceAccountKey.json")
+    firebase_admin.initialize_app(cred)
+    return firestore.client()
+
+
+def update_database(db, cfa_events, table_name):
     for _, event in enumerate(cfa_events):
-        if not event.detail_url:
-            continue
-        response = requests.get(event.detail_url)
-        soup = BeautifulSoup(response.content, "html.parser")
-        raw_detail = scrape_detail_page(soup)
-
-        event.photo = scrape_event_image(raw_detail)
-        event.description = scrape_event_description(raw_detail)
-        event.event_url = scrape_event_event_link(raw_detail)
-
-    # update firebase db
-    for i, event in enumerate(cfa_events):
-
-        doc_ref = db.collection(table_name).document(event.event_id_hex)
-
-        if doc_ref.get().exists:
-            print(f"Updating event with pk {event.event_id_hex} in db")
-            doc_ref.set(event.to_dict(), merge=True)
+
+        doc_ref = db.collection(table_name).document(event.event_id)
+        doc = doc_ref.get()
+
+        if doc.exists:
+            print(f"Updating event with pk {event.event_id} in db")
+            existing_data = doc.to_dict()
+
+            # Update only the new sessions in eventSessions
+            existing_sessions = existing_data.get("eventSessions", {})
+            updated_sessions = event.sessions.copy()
+
+            # Skip existing sessions
+            for session_id in existing_sessions:
+                updated_sessions.pop(session_id, None)
+
+            # Persist merged sessions only when there are new ones
+            if updated_sessions:
+                existing_sessions.update(
+                    {
+                        session_id: session.to_dict()
+                        for session_id, session in updated_sessions.items()
+                    }
+                )
+                doc_ref.set({"eventSessions": existing_sessions}, merge=True)
+
+            # Merge attributes without overwriting eventSessions
+            doc_ref.set(event.to_dict_no_sessions(), merge=True)
+
         else:
-            print(f"Adding event with pk {event.event_id_hex} in db")
-            doc_ref.set(event.to_dict_with_empty_users())
+            print(f"Adding event with pk {event.event_id} in db")
+            doc_ref.set(event.to_dict())
+
+
+def update_event_details(event: CFAEvent):
+    """Update event details by scraping its detail page."""
+    if not event.detail_url:
+        return
+
+    response = requests.get(event.detail_url)
+    soup = BeautifulSoup(response.content, "html.parser")
+    raw_detail = scrape_detail_page(soup)
+
+    event.photo = scrape_event_image(raw_detail)
+    event.description = scrape_event_description(raw_detail)
+    event.event_url = scrape_event_event_link(raw_detail)
+
+    session_id = get_session_id_from_url(event.detail_url)
+    if session_id in event.sessions:
+        event.sessions[session_id].start_time, event.sessions[session_id].end_time = (
+            scrape_session_datetime(raw_detail)
+        )
+
+
+def main(table_name: str):
+    """Main function to run the scraper."""
+    db = initialize_firestore()
+
+    print("Starting scraper")
+
+    # Fetch and parse the event list page
+    soup = fetch_and_parse_url("https://www.bu.edu/cfa/news/bu-arts-initiative/")
+    cfa_events = scrape_events(soup)
+
+    # Update event details for each scraped event
+    for event in cfa_events:
+        update_event_details(event)
+
+    # Update the database with the events
+    update_database(db, cfa_events, table_name)
+
     print("Event Scraping has completed")
 
-main("test_events")
+
+if __name__ == "__main__":
+    main("new_events1")
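Note: get_session_id_from_url, initialize_firestore, scrape_session_datetime, and update_database are now duplicated verbatim across both scripts. A follow-up could hoist them into a shared module; a minimal sketch, assuming a new scraper_utils.py next to the two scripts (the module name and layout are hypothetical, not part of this diff):

    # scraper_utils.py -- hypothetical shared home for the helpers duplicated
    # across bu_event_calendar_scraper.py and cfa_event_scraper.py
    from urllib.parse import urlparse, parse_qs

    import firebase_admin
    from firebase_admin import credentials, firestore


    def get_session_id_from_url(url):
        # BU calendar links carry the session id in the "oid" query parameter
        query_params = parse_qs(urlparse(url).query)
        return query_params.get("oid", [None])[0]


    def initialize_firestore():
        # Same credential path both scripts already use
        cred = credentials.Certificate("../serviceAccountKey.json")
        firebase_admin.initialize_app(cred)
        return firestore.client()

Each script would then import these (from scraper_utils import get_session_id_from_url, initialize_firestore) and drop its local copies; scrape_session_datetime and update_database could move the same way.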