Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Tests for invalid data in GTFS dataset #115

Merged
merged 8 commits into from
Sep 20, 2023
20 changes: 20 additions & 0 deletions conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,12 @@ def pytest_addoption(parser):
default=False,
help="run expensive tests",
)
parser.addoption(
"--sanitycheck",
action="store_true",
default=False,
help="run sanity checks",
)


def pytest_configure(config):
Expand All @@ -39,6 +45,10 @@ def pytest_configure(config):
config.addinivalue_line(
"markers", "runexpensive: mark test to run expensive tests"
)
config.addinivalue_line(
"markers",
"sanitycheck: mark test to run checks in dependencies' code.",
)


def pytest_collection_modifyitems(config, items): # noqa:C901
Expand All @@ -47,6 +57,7 @@ def pytest_collection_modifyitems(config, items): # noqa:C901
config.getoption("--runsetup")
& config.getoption("--runinteg")
& config.getoption("--runexpensive")
& config.getoption("--sanitycheck")
):
# do full test suite when all flags are given
return
Expand Down Expand Up @@ -75,3 +86,12 @@ def pytest_collection_modifyitems(config, items): # noqa:C901
for item in items:
if "runexpensive" in item.keywords:
item.add_marker(skip_runexpensive)

# do not add sanitycheck marks when the sanitycheck flag is given
if not config.getoption("--sanitycheck"):
skip_sanitycheck = pytest.mark.skip(
reason="need --sanitycheck option to run"
)
for item in items:
if "sanitycheck" in item.keywords:
item.add_marker(skip_sanitycheck)
9 changes: 5 additions & 4 deletions notebooks/gtfs/check_unmatched_id_warnings.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,8 +46,8 @@
feed.trips,
pd.DataFrame(
{
"service_id": [101],
"route_id": [20304],
"service_id": ["101023"],
"route_id": ["2030445"],
"trip_id": ["VJbedb4cfd0673348e017d42435abbdff3ddacbf89"],
"trip_headsign": ["Newport"],
"block_id": [np.nan],
Expand All @@ -70,8 +70,7 @@
feed.routes,
pd.DataFrame(
{
"service_id": [101],
"route_id": [20304],
"route_id": ["20304"],
"agency_id": ["OL5060"],
"route_short_name": ["X145"],
"route_long_name": [np.nan],
Expand All @@ -91,3 +90,5 @@
# the pipeline and therefore the user will be made aware. It is also flagged
# as an error which means that 'the GTFS is violated'
# (https://mrcagney.github.io/gtfs_kit_docs/).

# %%
142 changes: 142 additions & 0 deletions tests/gtfs/test_validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,148 @@ def test_is_valid(self, gtfs_fixture):
found_cols == exp_cols
).all(), f"Expected columns {exp_cols}. Found: {found_cols}"

@pytest.mark.sanitycheck
def test_trips_unmatched_ids(self, gtfs_fixture):
"""Tests to evaluate gtfs-klt's reaction to invalid IDs in trips.

Parameters
----------
gtfs_fixture : GtfsInstance
a GtfsInstance test fixure

"""
feed = gtfs_fixture.feed

# add row to tripas table with invald trip_id, route_id, service_id
feed.trips = pd.concat(
[
feed.trips,
pd.DataFrame(
{
"service_id": ["101023"],
"route_id": ["2030445"],
"trip_id": [
"VJbedb4cfd0673348e017d42435abbdff3ddacbf89"
],
"trip_headsign": ["Newport"],
"block_id": [np.nan],
"shape_id": [
"RPSPc4c99ac6aff7e4648cbbef785f88427a48efa80f"
],
"wheelchair_accessible": [0],
"trip_direction_name": [np.nan],
"vehicle_journey_code": ["VJ109"],
}
),
],
axis=0,
)

# assert different errors/warnings haave been raised
new_valid = feed.validate()
assert (
len(new_valid[new_valid.message == "Undefined route_id"]) == 1
), "gtfs-kit failed to recognise invalid route_id"
assert (
len(new_valid[new_valid.message == "Undefined service_id"]) == 1
), "gtfs-kit failed to recognise invalid service_id"
assert (
len(new_valid[new_valid.message == "Trip has no stop times"]) == 1
), "gtfs-kit failed to recognise invalid service_id"
assert len(new_valid) == 10, "Validation table not expected size"

@pytest.mark.sanitycheck
def test_routes_unmatched_ids(self, gtfs_fixture):
"""Tests to evaluate gtfs-klt's reaction to invalid IDs in routes.

Parameters
----------
gtfs_fixture : GtfsInstance
a GtfsInstance test fixure

"""
feed = gtfs_fixture.feed

# add row to tripas table with invald trip_id, route_id, service_id
feed.routes = pd.concat(
[
feed.routes,
pd.DataFrame(
{
"route_id": ["20304"],
"agency_id": ["OL5060"],
"route_short_name": ["X145"],
"route_long_name": [np.nan],
"route_type": [200],
}
),
],
axis=0,
)

# assert different errors/warnings haave been raised
new_valid = feed.validate()
assert (
len(new_valid[new_valid.message == "Undefined agency_id"]) == 1
), "gtfs-kit failed to recognise invalid agency_id"
assert (
len(new_valid[new_valid.message == "Route has no trips"]) == 1
), "gtfs-kit failed to recognise that there are routes with no trips"
assert len(new_valid) == 9, "Validation table not expected size"

@pytest.mark.sanitycheck
def test_unmatched_service_id_behaviour(self, gtfs_fixture):
"""Tests to evaluate gtfs-klt's reaction to invalid IDs in calendar.

Parameters
----------
gtfs_fixture : GtfsInstance
a GtfsInstance test fixure

Notes
-----
'gtfs-kit' does not care about unmatched service IDs in the calendar
table. The Calendar table can have data with any service_id as long as
the datatypes are string. However, gtfs_kit will state an error if the
calendar table contains duplicate service_ids.

"""
feed = gtfs_fixture.feed
original_error_count = len(feed.validate())

# introduce a dummy row with a non matching service_id
feed.calendar = pd.concat(
[
feed.calendar,
pd.DataFrame(
{
"service_id": ["1018872"],
"monday": [0],
"tuesday": [0],
"wednesday": [0],
"thursday": [0],
"friday": [0],
"saturday": [0],
"sunday": [0],
"start_date": ["20200104"],
"end_date": ["20230301"],
}
),
],
axis=0,
)
new_error_count = len(feed.validate())
assert (
new_error_count == original_error_count
), "Unrecognised error in validaation table"

# drop a row from the calendar table
feed.calendar.drop(3, inplace=True)
new_valid = feed.validate()
assert (
len(new_valid[new_valid.message == "Undefined service_id"]) == 1
), "gtfs-kit failed to identify missing service_id"

def test_print_alerts_defence(self, gtfs_fixture):
"""Check defensive behaviour of print_alerts()."""
with pytest.raises(
Expand Down