Skip to content

Commit

Permalink
Merge pull request #1095 from dirac-institute/cvs_robustness
Browse files Browse the repository at this point in the history
Improve error messaging in CSVReader
  • Loading branch information
mschwamb authored Jan 13, 2025
2 parents 9e4594b + 75b2fdf commit 06f2a44
Show file tree
Hide file tree
Showing 3 changed files with 82 additions and 14 deletions.
64 changes: 52 additions & 12 deletions src/sorcha/readers/CSVReader.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ def __init__(self, filename, sep="csv", header=-1, **kwargs):
sys.exit(f"ERROR: Unrecognized delimiter ({sep})")
self.sep = sep

# To pre-validation and collect the header information.
self.header_row = self._find_and_validate_header_line(header)

# A table holding just the object ID for each row. Only populated
Expand Down Expand Up @@ -131,6 +132,37 @@ def _check_header_line(self, header_line):
pplogger.error(error_str)
sys.exit(error_str)

def _validate_csv(self, header):
"""Perform a validation of the CSV file, such as checking for blank lines.
This is an expensive test and should only be performed when something
has gone wrong. This is needed because panda's read_csv() function can
given vague errors (such as failing with an index error if the file
has blank lines at the end).
Parameters
----------
header : integer
The row number of the header.
Returns
-------
: bool
True indicating success.
"""
pplogger = logging.getLogger(__name__)

with open(self.filename) as fh:
for i, line in enumerate(fh):
if i >= header:
# Check for blank lines. We do this explicitly because pandas read_csv()
# has problems when skipping lines and finding blank lines at the end.
if len(line) == 0 or line.isspace():
error_str = f"ERROR: CSVReader: found a blank line on line {i} of {self.filename}."
pplogger.error(error_str)
sys.exit(error_str)
return True

def _read_rows_internal(self, block_start=0, block_size=None, **kwargs):
"""Reads in a set number of rows from the input.
Expand Down Expand Up @@ -177,6 +209,7 @@ def _read_rows_internal(self, block_start=0, block_size=None, **kwargs):
skiprows=skip_rows,
nrows=block_size,
)

return res_df

def _build_id_map(self):
Expand Down Expand Up @@ -225,18 +258,25 @@ def _read_objects_internal(self, obj_ids, **kwargs):
skipped_row.extend(~self.obj_id_table["ObjID"].isin(obj_ids).values)

# Read the rows.
if self.sep == "whitespace":
res_df = pd.read_csv(
self.filename,
sep="\\s+",
skiprows=(lambda x: skipped_row[x]),
)
else:
res_df = pd.read_csv(
self.filename,
delimiter=",",
skiprows=(lambda x: skipped_row[x]),
)
try:
if self.sep == "whitespace":
res_df = pd.read_csv(
self.filename,
sep="\\s+",
skiprows=(lambda x: skipped_row[x]),
)
else:
res_df = pd.read_csv(
self.filename,
delimiter=",",
skiprows=(lambda x: skipped_row[x]),
)
except IndexError as current_exc:
# Check if there is a more understandable error we can raise.
self._validate_csv(self.header_row)

# If we do not detect a problem with _validate_csv, reraise the error.
raise current_exc
return res_df

def _process_and_validate_input_table(self, input_table, **kwargs):
Expand Down
29 changes: 29 additions & 0 deletions tests/readers/test_CSVReader.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
import numpy as np
import os
import pandas as pd
import pytest
from numpy.testing import assert_equal
from pandas.testing import assert_frame_equal
import tempfile

from sorcha.readers.CSVReader import CSVDataReader
from sorcha.utilities.dataUtilitiesForTests import get_test_filepath
Expand Down Expand Up @@ -392,3 +394,30 @@ def test_CSVDataReader_delims():
with pytest.raises(SystemExit) as e2:
_ = CSVDataReader(get_test_filepath("testcolour.txt"), "")
assert e2.type == SystemExit


def test_CSVDataReader_blank_lines():
"""Test that we fail if the input file has blank lines."""
with tempfile.TemporaryDirectory() as dir_name:
file_name = os.path.join(dir_name, "test.ecsv")
with open(file_name, "w") as output:
output.write("# My comment\n")
output.write("ObjID,b,c\n")
output.write("0,1,2\n")
output.write("1,1,2\n")
output.write("2,1,2\n")

# The checks pass.
reader = CSVDataReader(file_name, sep="csv", cache_table=False)
data = reader.read_objects(["1", "2"])
assert len(data) == 2

with open(file_name, "a") as output:
output.write("3,1,2\n")
output.write("\n") # add another blank line
output.write("\n") # add another blank line

# The code now fails by default.
reader2 = CSVDataReader(file_name, sep="csv", cache_table=False)
with pytest.raises(SystemExit):
_ = reader2.read_objects(["1", "2"])
3 changes: 1 addition & 2 deletions tests/readers/test_OrbitAuxReader.py
Original file line number Diff line number Diff line change
Expand Up @@ -307,6 +307,7 @@ def test_orbit_reader_wrong_delim():
with pytest.raises(SystemExit) as err:
_ = OrbitAuxReader(get_test_filepath("testorb.des"), "csv")


def test_orbit_sanity_check_kep():
"""If an orbit parameter is undefined, raise an exception"""

Expand Down Expand Up @@ -335,7 +336,6 @@ def test_orbit_sanity_check_kep():
assert err.value.code == "ERROR: Invalid Keplerian elements detected for one or more objects (check log for information)"



def test_orbit_sanity_check_com():
"""If an orbit parameter is undefined, raise an exception"""

Expand All @@ -350,4 +350,3 @@ def test_orbit_sanity_check_com():
_ = OrbitAuxReader(get_test_filepath("orbit_test_files/orbit_sanity_check_com_e<0.csv"),"csv")
_.read_rows()
assert err.value.code == "ERROR: Invalid cometary elements detected for one or more objects (check log for information)"

0 comments on commit 06f2a44

Please sign in to comment.