From 7533f678d33830a0f55bbfebfdd4d3ade12bbf28 Mon Sep 17 00:00:00 2001 From: Dustin Carlino Date: Fri, 24 May 2024 11:35:49 +0100 Subject: [PATCH] Understand the day-of-week when parsing GTFS. For now, just keep Mondays. Prunes giant London from 1.3GB to 720MB --- backend/src/gtfs/ids.rs | 3 ++ backend/src/gtfs/scrape.rs | 78 +++++++++++++++++++++++++++++++++++++- 2 files changed, 79 insertions(+), 2 deletions(-) diff --git a/backend/src/gtfs/ids.rs b/backend/src/gtfs/ids.rs index 376f7d1..6ca1146 100644 --- a/backend/src/gtfs/ids.rs +++ b/backend/src/gtfs/ids.rs @@ -12,6 +12,9 @@ pub mod orig_ids { #[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize)] pub struct TripID(String); + + #[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize)] + pub struct ServiceID(String); } #[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize)] diff --git a/backend/src/gtfs/scrape.rs b/backend/src/gtfs/scrape.rs index ae4e06b..2684896 100644 --- a/backend/src/gtfs/scrape.rs +++ b/backend/src/gtfs/scrape.rs @@ -11,10 +11,54 @@ use super::ids::{orig_ids, IDMapping}; use super::{GtfsModel, NextStep, Stop, StopID, Trip, TripID}; use crate::graph::RoadID; +// Move to mod after deciding to store every day +#[derive(Clone, Copy, Debug, PartialEq)] +enum Day { + Monday = 0, + Tuesday = 1, + Wednesday = 2, + Thursday = 3, + Friday = 4, + Saturday = 5, + Sunday = 6, +} + impl GtfsModel { /// Takes a path to a GTFS directory pub fn parse(dir_path: &str, mercator: &Mercator) -> Result { - println!("Scraping stops.txt"); + info!("Scraping trips.txt"); + let mut trip_to_service: BTreeMap = BTreeMap::new(); + for rec in + csv::Reader::from_reader(File::open(format!("{dir_path}/trips.txt"))?).deserialize() + { + let rec: TripRow = rec?; + trip_to_service.insert(rec.trip_id, rec.service_id); + } + + info!("Scraping calendar.txt"); + let mut service_to_days: BTreeMap> = BTreeMap::new(); + for rec in + csv::Reader::from_reader(File::open(format!("{dir_path}/calendar.txt"))?).deserialize() + { + let rec: CalendarRow = rec?; + let mut days = Vec::new(); + for (day, include) in [ + (Day::Monday, rec.monday), + (Day::Tuesday, rec.tuesday), + (Day::Wednesday, rec.wednesday), + (Day::Thursday, rec.thursday), + (Day::Friday, rec.friday), + (Day::Saturday, rec.saturday), + (Day::Sunday, rec.sunday), + ] { + if include == 1 { + days.push(day); + } + } + service_to_days.insert(rec.service_id, days); + } + + info!("Scraping stops.txt"); let mut stop_ids: IDMapping = IDMapping::new(); let mut stops: Vec = Vec::new(); for rec in @@ -39,7 +83,7 @@ impl GtfsModel { }); } - println!("Scraping stop_times.txt"); + info!("Scraping stop_times.txt"); let mut trips_table: BTreeMap = BTreeMap::new(); for rec in csv::Reader::from_reader(File::open(format!("{dir_path}/stop_times.txt"))?) .deserialize() @@ -55,6 +99,18 @@ impl GtfsModel { continue; }; + // Which days does this stop occur on? + let service = &trip_to_service[&rec.trip_id]; + let Some(days) = service_to_days.get(service) else { + warn!("Don't know what days service {service:?} is on"); + continue; + }; + + // TODO For now, only keep Monday + if !days.contains(&Day::Monday) { + continue; + } + trips_table .entry(rec.trip_id) .or_insert_with(|| Trip { @@ -90,6 +146,24 @@ impl GtfsModel { } } +#[derive(Deserialize)] +struct TripRow { + trip_id: orig_ids::TripID, + service_id: orig_ids::ServiceID, +} + +#[derive(Deserialize)] +struct CalendarRow { + service_id: orig_ids::ServiceID, + monday: usize, + tuesday: usize, + wednesday: usize, + thursday: usize, + friday: usize, + saturday: usize, + sunday: usize, +} + #[derive(Deserialize)] struct StopRow { stop_id: orig_ids::StopID,