-
Notifications
You must be signed in to change notification settings - Fork 1
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
74 130 clean feed and route type cleaner #195
Changes from 35 commits
06aa699
a420098
37df68a
a4f452b
70ef720
449b445
85e6631
e9e3c9c
4ec8fcd
4f9802f
9131720
cd2a5c3
5f97b53
ad38cac
3cb7443
d6e73d2
3beafa3
160d7b5
54acd6a
be28415
4ce54c3
a9ea856
3fa8aa2
606e25d
5c2cd9b
d324dfc
c679d23
8c16268
eb1e7c3
539ff66
728f128
9f3cc25
b2a2175
a82c8f7
0068a61
f6f2b09
034c6fb
dd1c2a3
f9ad23d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,9 +1,26 @@ | ||
"""A set of functions that clean the gtfs data.""" | ||
from typing import Union | ||
import warnings | ||
|
||
import numpy as np | ||
|
||
from transport_performance.utils.defence import _gtfs_defence, _check_iterable | ||
import pandas as pd | ||
from gtfs_kit.cleaners import ( | ||
clean_ids as clean_ids_gk, | ||
clean_route_short_names as clean_route_short_names_gk, | ||
clean_times as clean_times_gk, | ||
drop_zombies as drop_zombies_gk, | ||
) | ||
|
||
from transport_performance.gtfs.gtfs_utils import ( | ||
_get_validation_warnings, | ||
_remove_validation_row, | ||
) | ||
from transport_performance.utils.defence import ( | ||
_gtfs_defence, | ||
_check_iterable, | ||
_type_defence, | ||
_check_attribute, | ||
) | ||
|
||
|
||
def drop_trips(gtfs, trip_id: Union[str, list, np.ndarray]) -> None: | ||
|
@@ -32,19 +49,21 @@ def drop_trips(gtfs, trip_id: Union[str, list, np.ndarray]) -> None: | |
if isinstance(trip_id, str): | ||
trip_id = [trip_id] | ||
|
||
# _check_iterable only takes lists, therefore convert numpy arrays | ||
if isinstance(trip_id, np.ndarray): | ||
trip_id = list(trip_id) | ||
|
||
# ensure trip ids are string | ||
_check_iterable( | ||
iterable=trip_id, | ||
param_nm="trip_id", | ||
iterable_type=list, | ||
iterable_type=type(trip_id), | ||
check_elements=True, | ||
exp_type=str, | ||
) | ||
|
||
# warn users if passed one of the passed trip_id's is not present in the | ||
# GTFS. | ||
for _id in trip_id: | ||
if _id not in gtfs.feed.trips.trip_id.unique(): | ||
warnings.warn(UserWarning(f"trip_id '{_id}' not found in GTFS")) | ||
|
||
# drop relevant records from tables | ||
gtfs.feed.trips = gtfs.feed.trips[ | ||
~gtfs.feed.trips["trip_id"].isin(trip_id) | ||
|
@@ -61,44 +80,58 @@ def drop_trips(gtfs, trip_id: Union[str, list, np.ndarray]) -> None: | |
return None | ||
|
||
|
||
def clean_consecutive_stop_fast_travel_warnings( | ||
gtfs, validate: bool = False | ||
) -> None: | ||
"""Clean 'Fast Travel Between Consecutive Stops' warnings from validity_df. | ||
def _clean_fast_travel_preperation(gtfs, warning_re: str) -> pd.DataFrame: | ||
"""Prepare to clean fast travel errors. | ||
|
||
At the beggining of both of the fast travel cleaners, the gtfs is type | ||
checked, attr checked and then warnings are obtained. Because of this, this | ||
has been functionalised | ||
|
||
Parameters | ||
---------- | ||
gtfs : GtfsInstance | ||
The GtfsInstance to clean warnings within | ||
validate : bool, optional | ||
Whether or not to validate the gtfs before carrying out this cleaning | ||
operation | ||
gtfs : _type_ | ||
The GtfsInstance. | ||
warning_re : str | ||
Regex used to obtain warnings. | ||
|
||
Returns | ||
------- | ||
None | ||
pd.DataFrame | ||
A dataframe containing warnings. | ||
|
||
""" | ||
# defences | ||
_gtfs_defence(gtfs, "gtfs") | ||
if "validity_df" not in gtfs.__dict__.keys() and not validate: | ||
raise AttributeError( | ||
_type_defence(warning_re, "warning_re", str) | ||
_check_attribute( | ||
gtfs, | ||
"validity_df", | ||
message=( | ||
"The gtfs has not been validated, therefore no" | ||
"warnings can be identified. You can pass " | ||
"validate=True to this function to validate the " | ||
"gtfs." | ||
) | ||
), | ||
) | ||
needed_warning = _get_validation_warnings(gtfs, warning_re) | ||
return needed_warning | ||
|
||
|
||
if validate: | ||
gtfs.is_valid() | ||
def clean_consecutive_stop_fast_travel_warnings(gtfs) -> None: | ||
"""Clean 'Fast Travel Between Consecutive Stops' warnings from validity_df. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. When running this function in the Wales GTFS, it seems that it also purges instances of fast travel between multiple stops. So when running:
It gets rid of all Considering this, it is unlikely that we'll run |
||
|
||
needed_warning = ( | ||
gtfs.validity_df[ | ||
gtfs.validity_df["message"] | ||
== "Fast Travel Between Consecutive Stops" | ||
] | ||
.copy() | ||
.values | ||
Parameters | ||
---------- | ||
gtfs : GtfsInstance | ||
The GtfsInstance to clean warnings within | ||
|
||
Returns | ||
------- | ||
None | ||
|
||
""" | ||
# defences | ||
needed_warning = _clean_fast_travel_preperation( | ||
gtfs, "Fast Travel Between Consecutive Stops" | ||
) | ||
|
||
if len(needed_warning) < 1: | ||
|
@@ -116,45 +149,22 @@ def clean_consecutive_stop_fast_travel_warnings( | |
return None | ||
|
||
|
||
def clean_multiple_stop_fast_travel_warnings( | ||
gtfs, validate: bool = False | ||
) -> None: | ||
def clean_multiple_stop_fast_travel_warnings(gtfs) -> None: | ||
"""Clean 'Fast Travel Over Multiple Stops' warnings from validity_df. | ||
|
||
Parameters | ||
---------- | ||
gtfs : GtfsInstance | ||
The GtfsInstance to clean warnings within | ||
validate : bool, optional | ||
Whether or not to validate the gtfs before carrying out this cleaning | ||
operation | ||
|
||
Returns | ||
------- | ||
None | ||
|
||
""" | ||
# defences | ||
_gtfs_defence(gtfs, "gtfs") | ||
if "validity_df" not in gtfs.__dict__.keys() and not validate: | ||
raise AttributeError( | ||
"The gtfs has not been validated, therefore no" | ||
"warnings can be identified. You can pass " | ||
"validate=True to this function to validate the " | ||
"gtfs." | ||
) | ||
|
||
if validate: | ||
gtfs.is_valid() | ||
|
||
needed_warning = ( | ||
gtfs.validity_df[ | ||
gtfs.validity_df["message"] == "Fast Travel Over Multiple Stops" | ||
] | ||
.copy() | ||
.values | ||
needed_warning = _clean_fast_travel_preperation( | ||
gtfs, "Fast Travel Over Multiple Stops" | ||
) | ||
|
||
if len(needed_warning) < 1: | ||
return None | ||
|
||
|
@@ -168,3 +178,122 @@ def clean_multiple_stop_fast_travel_warnings( | |
~gtfs.multiple_stops_invalid["trip_id"].isin(trip_ids) | ||
] | ||
return None | ||
|
||
|
||
def core_cleaners( | ||
gtfs, | ||
clean_ids: bool = True, | ||
clean_times: bool = True, | ||
clean_route_short_names: bool = True, | ||
drop_zombies: bool = True, | ||
) -> None: | ||
"""Clean the gtfs with the core cleaners of gtfs-kit. | ||
|
||
The source code for the cleaners, along with detailed descriptions of the | ||
cleaning they are performing can be found here: | ||
https://github.com/mrcagney/gtfs_kit/blob/master/gtfs_kit/cleaners.py | ||
|
||
All credit for these cleaners goes to the creators of the gtfs_kit package. | ||
HOMEPAGE: https://github.com/mrcagney/gtfs_kit | ||
|
||
Parameters | ||
---------- | ||
gtfs : GtfsInstance | ||
The gtfs to clean | ||
clean_ids : bool, optional | ||
Whether or not to use clean_ids, by default True | ||
clean_times : bool, optional | ||
Whether or not to use clean_times, by default True | ||
clean_route_short_names : bool, optional | ||
Whether or not to use clean_route_short_names, by default True | ||
drop_zombies : bool, optional | ||
Whether or not to use drop_zombies, by default True | ||
|
||
Returns | ||
------- | ||
None | ||
|
||
""" | ||
# defences | ||
_gtfs_defence(gtfs, "gtfs") | ||
_type_defence(clean_ids, "clean_ids", bool) | ||
_type_defence(clean_times, "clean_times", bool) | ||
_type_defence(clean_route_short_names, "clean_route_short_names", bool) | ||
_type_defence(drop_zombies, "drop_zombies", bool) | ||
# cleaning | ||
if clean_ids: | ||
clean_ids_gk(gtfs.feed) | ||
if clean_times: | ||
clean_times_gk(gtfs.feed) | ||
if clean_route_short_names: | ||
clean_route_short_names_gk(gtfs.feed) | ||
if drop_zombies: | ||
try: | ||
drop_zombies_gk(gtfs.feed) | ||
except KeyError: | ||
warnings.warn( | ||
UserWarning( | ||
"The drop_zombies cleaner was unable to operate on " | ||
"clean_feed as the trips table has no shape_id column" | ||
) | ||
) | ||
return None | ||
|
||
|
||
def clean_unrecognised_column_warnings(gtfs) -> None: | ||
"""Clean warnings for unrecognised columns. | ||
|
||
Parameters | ||
---------- | ||
gtfs : GtfsInstance | ||
The GtfsInstance to clean warnings from | ||
|
||
Returns | ||
------- | ||
None | ||
|
||
""" | ||
_gtfs_defence(gtfs, "gtfs") | ||
warnings = _get_validation_warnings( | ||
gtfs=gtfs, message="Unrecognized column .*" | ||
) | ||
for warning in warnings: | ||
tbl = gtfs.table_map[warning[2]] | ||
# parse column from warning message | ||
column = warning[1].split("column")[1].strip() | ||
tbl.drop(column, inplace=True, axis=1) | ||
_remove_validation_row(gtfs, warning[1]) | ||
return None | ||
|
||
|
||
def clean_duplicate_stop_times(gtfs) -> None: | ||
"""Clean duplicates from stop_times with repeated pair (trip_id, ... | ||
|
||
departure_time. | ||
|
||
Parameters | ||
---------- | ||
gtfs : GtfsInstance | ||
The gtfs to clean | ||
|
||
Returns | ||
------- | ||
None | ||
|
||
""" | ||
_gtfs_defence(gtfs, "gtfs") | ||
warning_re = r".* \(trip_id, departure_time\)" | ||
# we are only expecting one warning here | ||
warning = _get_validation_warnings(gtfs, warning_re) | ||
if len(warning) == 0: | ||
return None | ||
warning = warning[0] | ||
# drop from actual table | ||
gtfs.table_map[warning[2]].drop_duplicates( | ||
subset=["arrival_time", "departure_time", "trip_id", "stop_id"], | ||
inplace=True, | ||
) | ||
_remove_validation_row(gtfs, message=warning_re) | ||
# re-validate with gtfs-kit validator | ||
gtfs.is_valid({"core_validation": None}) | ||
return None |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Typo: should be "preparation". Not changing this as it'll probably require other changes in the codebase where this function is used.