Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

74 130 clean feed and route type cleaner #195

Closed
wants to merge 39 commits into from
Closed
Show file tree
Hide file tree
Changes from 35 commits
Commits
Show all changes
39 commits
Select commit Hold shift + click to select a range
06aa699
Add validation for invalid route type warnings; update tests; add fun…
CBROWN-ONS Oct 16, 2023
a420098
Add docstring to private function
CBROWN-ONS Oct 16, 2023
37df68a
added _remove_validation_row function to gtfs_utils
CBROWN-ONS Oct 16, 2023
a4f452b
Resolve merge conflicts
CBROWN-ONS Oct 17, 2023
70ef720
Add tests for _get_validation_warnings
CBROWN-ONS Oct 17, 2023
449b445
Added tests for _remove_validation_row
CBROWN-ONS Oct 17, 2023
85e6631
Add tests for validate_route_type_warnings
CBROWN-ONS Oct 17, 2023
e9e3c9c
Add pipeline model for validating gtfs
CBROWN-ONS Oct 17, 2023
4ec8fcd
Merge branch 'dev' into 74-130-clean-feed-and-route-type-cleaner
CBROWN-ONS Oct 17, 2023
4f9802f
Fix tests to align with new validation pipeline
CBROWN-ONS Oct 17, 2023
9131720
Add tests for new is_valid functionality; add additional test for val…
CBROWN-ONS Oct 19, 2023
cd2a5c3
Add addtional test for is_valid()
CBROWN-ONS Oct 19, 2023
5f97b53
Add core cleaner that utilises gtfs-kit cleaners
CBROWN-ONS Oct 19, 2023
ad38cac
add _function_pipeline(); updaet clean_feed to use;update tests; impl…
CBROWN-ONS Oct 19, 2023
3cb7443
Add tests for core_cleaner
CBROWN-ONS Oct 19, 2023
d6e73d2
Update core_validation docstrings and type hinting; add accepted gtfs…
CBROWN-ONS Oct 23, 2023
3beafa3
add code for validate_gtfs_files()
CBROWN-ONS Oct 23, 2023
160d7b5
add tests for validte_gtfs_file()
CBROWN-ONS Oct 23, 2023
54acd6a
Add clean_unrecognised_column_warnings function
CBROWN-ONS Oct 23, 2023
be28415
add tests for clean_unregnised_column_warnings; add new cleaner to fu…
CBROWN-ONS Oct 23, 2023
4ce54c3
patch bug in html report generation
CBROWN-ONS Oct 24, 2023
a9ea856
merge with dev
CBROWN-ONS Oct 24, 2023
3fa8aa2
182: add type defences to gtfs_utils; add fast travel tables to GtfsI…
CBROWN-ONS Oct 24, 2023
606e25d
182: cleaners tech debt; add GtfsInstance attr 'units'; passing test …
CBROWN-ONS Oct 24, 2023
5c2cd9b
182: cleaners tech debt; use pd.isna() instead of math.isnan()
CBROWN-ONS Oct 24, 2023
d324dfc
Add tests for returning speed bound of unrecognised route_type
CBROWN-ONS Oct 26, 2023
c679d23
180: Functionalise part of fast travel cleaners; remove validate opti…
CBROWN-ONS Oct 26, 2023
8c16268
fix tests
CBROWN-ONS Oct 26, 2023
eb1e7c3
Add tests for clean_duplicate_stop_times
CBROWN-ONS Oct 30, 2023
539ff66
Improve coverage
CBROWN-ONS Oct 30, 2023
728f128
Merge with dev and resolve conflicts
CBROWN-ONS Nov 6, 2023
9f3cc25
chore: Up to date with dev
Nov 13, 2023
b2a2175
chore: minor typos in gtfs_utils.py
SergioRec Nov 14, 2023
a82c8f7
chore: minor typos in Test_AddValidationRow and Test_FilterGtfsAround…
SergioRec Nov 14, 2023
0068a61
fix: change wrong path in test
SergioRec Nov 14, 2023
f6f2b09
fix: merge with dev
CBROWN-ONS Feb 1, 2024
034c6fb
fix: update tests to better align with new cleaning/validation pipelines
CBROWN-ONS Feb 1, 2024
dd1c2a3
fix: update typo in function name
CBROWN-ONS Feb 5, 2024
f9ad23d
Merge branch 'dev' into 74-130-clean-feed-and-route-type-cleaner
CBROWN-ONS Feb 26, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
!tests/data/newport-2023-06-13.osm.pbf
!tests/data/chester-20230816-small_gtfs.zip
!tests/data/gtfs/newport-20230613_gtfs.zip
!tests/data/gtfs/repeated_pair_gtfs_fixture.zip
!src/transport_performance/data/gtfs/route_lookup.pkl
!tests/data/gtfs/report/html_template.html

Expand Down
241 changes: 185 additions & 56 deletions src/transport_performance/gtfs/cleaners.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,26 @@
"""A set of functions that clean the gtfs data."""
from typing import Union
import warnings

import numpy as np

from transport_performance.utils.defence import _gtfs_defence, _check_iterable
import pandas as pd
from gtfs_kit.cleaners import (
clean_ids as clean_ids_gk,
clean_route_short_names as clean_route_short_names_gk,
clean_times as clean_times_gk,
drop_zombies as drop_zombies_gk,
)

from transport_performance.gtfs.gtfs_utils import (
_get_validation_warnings,
_remove_validation_row,
)
from transport_performance.utils.defence import (
_gtfs_defence,
_check_iterable,
_type_defence,
_check_attribute,
)


def drop_trips(gtfs, trip_id: Union[str, list, np.ndarray]) -> None:
Expand Down Expand Up @@ -32,19 +49,21 @@ def drop_trips(gtfs, trip_id: Union[str, list, np.ndarray]) -> None:
if isinstance(trip_id, str):
trip_id = [trip_id]

# _check_iterable only takes lists, therefore convert numpy arrays
if isinstance(trip_id, np.ndarray):
trip_id = list(trip_id)

# ensure trip ids are string
_check_iterable(
iterable=trip_id,
param_nm="trip_id",
iterable_type=list,
iterable_type=type(trip_id),
check_elements=True,
exp_type=str,
)

# warn users if passed one of the passed trip_id's is not present in the
# GTFS.
for _id in trip_id:
if _id not in gtfs.feed.trips.trip_id.unique():
warnings.warn(UserWarning(f"trip_id '{_id}' not found in GTFS"))

# drop relevant records from tables
gtfs.feed.trips = gtfs.feed.trips[
~gtfs.feed.trips["trip_id"].isin(trip_id)
Expand All @@ -61,44 +80,58 @@ def drop_trips(gtfs, trip_id: Union[str, list, np.ndarray]) -> None:
return None


def clean_consecutive_stop_fast_travel_warnings(
gtfs, validate: bool = False
) -> None:
"""Clean 'Fast Travel Between Consecutive Stops' warnings from validity_df.
def _clean_fast_travel_preperation(gtfs, warning_re: str) -> pd.DataFrame:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Typo: should be "preparation". Not changing this as it'll probably require other changes in the codebase where this function is used.

"""Prepare to clean fast travel errors.

At the beginning of both of the fast travel cleaners, the gtfs is type
checked, attribute checked and then warnings are obtained. Because of this,
this step has been functionalised.

Parameters
----------
gtfs : GtfsInstance
The GtfsInstance to clean warnings within
validate : bool, optional
Whether or not to validate the gtfs before carrying out this cleaning
operation
gtfs : _type_
The GtfsInstance.
warning_re : str
Regex used to obtain warnings.

Returns
-------
None
pd.DataFrame
A dataframe containing warnings.

"""
# defences
_gtfs_defence(gtfs, "gtfs")
if "validity_df" not in gtfs.__dict__.keys() and not validate:
raise AttributeError(
_type_defence(warning_re, "warning_re", str)
_check_attribute(
gtfs,
"validity_df",
message=(
"The gtfs has not been validated, therefore no"
"warnings can be identified. You can pass "
"validate=True to this function to validate the "
"gtfs."
)
),
)
needed_warning = _get_validation_warnings(gtfs, warning_re)
return needed_warning


if validate:
gtfs.is_valid()
def clean_consecutive_stop_fast_travel_warnings(gtfs) -> None:
"""Clean 'Fast Travel Between Consecutive Stops' warnings from validity_df.
Copy link
Contributor

@SergioRec SergioRec Nov 15, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

When running this function on the Wales GTFS, it seems that it also purges instances of fast travel between multiple stops.

So when running:

gtfs.is_valid()
clean_consecutive_stop_fast_travel_warnings(gtfs)
gtfs.is_valid()

It gets rid of all Fast Travel Between Consecutive Stops warnings as well as all Fast Travel Over Multiple Stops warnings. As a result, running clean_multiple_stop_fast_travel_warnings after clean_consecutive_stop_fast_travel_warnings has no effect. I'm not sure whether there are instances where this is not the case, but I've tried it with GTFS feeds from Wales, Scotland and London.

Considering this, it is unlikely that we'll run clean_consecutive_stop_fast_travel_warnings in isolation, as it could leave some problematic trips undetected. Would it make sense to merge them into a single function and apply them sequentially? Or run clean_multiple_stop_fast_travel_warnings by default, and have an optional flag to also run the other step if requested, always after the former? This way we would be refactoring these two functions into a single one and ensuring cleaning steps are applied in the right order.


needed_warning = (
gtfs.validity_df[
gtfs.validity_df["message"]
== "Fast Travel Between Consecutive Stops"
]
.copy()
.values
Parameters
----------
gtfs : GtfsInstance
The GtfsInstance to clean warnings within

Returns
-------
None

"""
# defences
needed_warning = _clean_fast_travel_preperation(
gtfs, "Fast Travel Between Consecutive Stops"
)

if len(needed_warning) < 1:
Expand All @@ -116,45 +149,22 @@ def clean_consecutive_stop_fast_travel_warnings(
return None


def clean_multiple_stop_fast_travel_warnings(
gtfs, validate: bool = False
) -> None:
def clean_multiple_stop_fast_travel_warnings(gtfs) -> None:
"""Clean 'Fast Travel Over Multiple Stops' warnings from validity_df.

Parameters
----------
gtfs : GtfsInstance
The GtfsInstance to clean warnings within
validate : bool, optional
Whether or not to validate the gtfs before carrying out this cleaning
operation

Returns
-------
None

"""
# defences
_gtfs_defence(gtfs, "gtfs")
if "validity_df" not in gtfs.__dict__.keys() and not validate:
raise AttributeError(
"The gtfs has not been validated, therefore no"
"warnings can be identified. You can pass "
"validate=True to this function to validate the "
"gtfs."
)

if validate:
gtfs.is_valid()

needed_warning = (
gtfs.validity_df[
gtfs.validity_df["message"] == "Fast Travel Over Multiple Stops"
]
.copy()
.values
needed_warning = _clean_fast_travel_preperation(
gtfs, "Fast Travel Over Multiple Stops"
)

if len(needed_warning) < 1:
return None

Expand All @@ -168,3 +178,122 @@ def clean_multiple_stop_fast_travel_warnings(
~gtfs.multiple_stops_invalid["trip_id"].isin(trip_ids)
]
return None


def core_cleaners(
    gtfs,
    clean_ids: bool = True,
    clean_times: bool = True,
    clean_route_short_names: bool = True,
    drop_zombies: bool = True,
) -> None:
    """Clean the gtfs with the core cleaners of gtfs-kit.

    The source code for the cleaners, along with detailed descriptions of the
    cleaning they are performing can be found here:
    https://github.com/mrcagney/gtfs_kit/blob/master/gtfs_kit/cleaners.py

    All credit for these cleaners goes to the creators of the gtfs_kit package.
    HOMEPAGE: https://github.com/mrcagney/gtfs_kit

    Parameters
    ----------
    gtfs : GtfsInstance
        The gtfs to clean
    clean_ids : bool, optional
        Whether or not to use clean_ids, by default True
    clean_times : bool, optional
        Whether or not to use clean_times, by default True
    clean_route_short_names : bool, optional
        Whether or not to use clean_route_short_names, by default True
    drop_zombies : bool, optional
        Whether or not to use drop_zombies, by default True

    Returns
    -------
    None

    Warns
    -----
    UserWarning
        If drop_zombies raises a KeyError. In that case the feed is left as
        it was before the drop_zombies step.

    Notes
    -----
    The gtfs-kit cleaners operate on a copy of the feed and return that copy
    rather than modifying their argument, so each result is assigned back to
    ``gtfs.feed``. Without the assignment the cleaning is silently discarded.

    """
    # defences
    _gtfs_defence(gtfs, "gtfs")
    _type_defence(clean_ids, "clean_ids", bool)
    _type_defence(clean_times, "clean_times", bool)
    _type_defence(clean_route_short_names, "clean_route_short_names", bool)
    _type_defence(drop_zombies, "drop_zombies", bool)
    # cleaning
    # NOTE(review): gtfs.feed is replaced by a new Feed object below. If the
    # GtfsInstance caches references to the old feed's tables (e.g. in
    # gtfs.table_map), those may need refreshing afterwards - confirm.
    if clean_ids:
        gtfs.feed = clean_ids_gk(gtfs.feed)
    if clean_times:
        gtfs.feed = clean_times_gk(gtfs.feed)
    if clean_route_short_names:
        gtfs.feed = clean_route_short_names_gk(gtfs.feed)
    if drop_zombies:
        try:
            # on KeyError the assignment never happens, so gtfs.feed keeps
            # the result of the previous cleaning steps
            gtfs.feed = drop_zombies_gk(gtfs.feed)
        except KeyError:
            warnings.warn(
                UserWarning(
                    "The drop_zombies cleaner was unable to operate on "
                    "clean_feed as the trips table has no shape_id column"
                )
            )
    return None


def clean_unrecognised_column_warnings(gtfs) -> None:
    """Clean warnings for unrecognised columns.

    For every validation warning matching "Unrecognized column .*", the
    named column is dropped (in place) from the table the warning refers to,
    and the warning is then removed via ``_remove_validation_row``.

    Parameters
    ----------
    gtfs : GtfsInstance
        The GtfsInstance to clean warnings from

    Returns
    -------
    None

    """
    _gtfs_defence(gtfs, "gtfs")
    # deliberately not called `warnings`, which would shadow the module-level
    # `warnings` import inside this function
    unrecognised = _get_validation_warnings(
        gtfs=gtfs, message="Unrecognized column .*"
    )
    for warning in unrecognised:
        # NOTE(review): rows are indexed positionally; index 1 is used as the
        # warning message and index 2 as the table name - confirm against
        # the layout returned by _get_validation_warnings.
        message = warning[1]
        tbl = gtfs.table_map[warning[2]]
        # parse column from warning message; split only once so that a
        # column whose own name contains "column" is kept intact
        column = message.split("column", 1)[1].strip()
        tbl.drop(column, inplace=True, axis=1)
        _remove_validation_row(gtfs, message)
    return None


def clean_duplicate_stop_times(gtfs) -> None:
    """Clean duplicated stop_times rows flagged by a repeated pair warning.

    Looks for a validation warning about the repeated pair
    (trip_id, departure_time). If one exists, rows duplicated across
    arrival_time, departure_time, trip_id and stop_id are dropped in place
    from the table named in the warning, the warning is removed and the
    gtfs is validated again.

    Parameters
    ----------
    gtfs : GtfsInstance
        The gtfs to clean

    Returns
    -------
    None

    """
    _gtfs_defence(gtfs, "gtfs")
    repeated_pair_re = r".* \(trip_id, departure_time\)"
    matches = _get_validation_warnings(gtfs, repeated_pair_re)
    # only a single matching warning is expected, so the first is used
    if len(matches) > 0:
        # NOTE(review): index 2 of a warning row is used as the table name -
        # confirm against the layout returned by _get_validation_warnings.
        table_name = matches[0][2]
        duplicate_key = ["arrival_time", "departure_time", "trip_id", "stop_id"]
        # drop from actual table
        gtfs.table_map[table_name].drop_duplicates(
            subset=duplicate_key,
            inplace=True,
        )
        _remove_validation_row(gtfs, message=repeated_pair_re)
        # re-validate with gtfs-kit validator
        gtfs.is_valid({"core_validation": None})
    return None
Loading