From 75d6d5f7dce1457f605a95b127d1505b795859be Mon Sep 17 00:00:00 2001 From: Jeremy Kubica <104161096+jeremykubica@users.noreply.github.com> Date: Fri, 10 Jan 2025 14:27:45 -0500 Subject: [PATCH 1/5] Add blank line checking to the CSV reader --- src/sorcha/readers/CSVReader.py | 33 +++++++++++++++++++++++++++- tests/readers/test_CSVReader.py | 24 ++++++++++++++++++++ tests/readers/test_OrbitAuxReader.py | 5 ++--- 3 files changed, 58 insertions(+), 4 deletions(-) diff --git a/src/sorcha/readers/CSVReader.py b/src/sorcha/readers/CSVReader.py index e74e8b36..f108bef5 100755 --- a/src/sorcha/readers/CSVReader.py +++ b/src/sorcha/readers/CSVReader.py @@ -12,7 +12,7 @@ class CSVDataReader(ObjectDataReader): Requires that the file's first column is ObjID. """ - def __init__(self, filename, sep="csv", header=-1, **kwargs): + def __init__(self, filename, sep="csv", header=-1, full_checks=True, **kwargs): """A class for reading the object data from a CSV file. Parameters @@ -28,6 +28,11 @@ def __init__(self, filename, sep="csv", header=-1, **kwargs): The row number of the header. If not provided, does an automatic search. Default = -1 + full_checks : bool, optional + Scan the file pre-validating the format. This may be expensive for + larger files. + Default = True + **kwargs: dictionary, optional Extra arguments """ @@ -40,7 +45,10 @@ def __init__(self, filename, sep="csv", header=-1, **kwargs): sys.exit(f"ERROR: Unrecognized delimiter ({sep})") self.sep = sep + # To pre-validation and collect the header information. self.header_row = self._find_and_validate_header_line(header) + if full_checks: + self._prevalidate_csv(self.header_row) # A table holding just the object ID for each row. Only populated # if we try to read data for specific object IDs. @@ -131,6 +139,29 @@ def _check_header_line(self, header_line): pplogger.error(error_str) sys.exit(error_str) + def _prevalidate_csv(self, header): + """Perform a pre-validation of the CSV file, such as checking + for blank lines. + + Parameters + ---------- + header : integer + The row number of the header. + """ + pplogger = logging.getLogger(__name__) + + with open(self.filename) as fh: + for i, line in enumerate(fh): + if i >= header: + # Check for blank lines. We do this explicitly because pandas read_csv() + # has problems when skipping lines and finding blank lines at the end. + if len(line) == 0 or line.isspace(): + error_str = ( + f"ERROR: CSVReader: found a blank line on line {i} " f" of {self.filename}." + ) + pplogger.error(error_str) + sys.exit(error_str) + def _read_rows_internal(self, block_start=0, block_size=None, **kwargs): """Reads in a set number of rows from the input. diff --git a/tests/readers/test_CSVReader.py b/tests/readers/test_CSVReader.py index 19b23090..b3a438c1 100755 --- a/tests/readers/test_CSVReader.py +++ b/tests/readers/test_CSVReader.py @@ -1,8 +1,10 @@ import numpy as np +import os import pandas as pd import pytest from numpy.testing import assert_equal from pandas.testing import assert_frame_equal +import tempfile from sorcha.readers.CSVReader import CSVDataReader from sorcha.utilities.dataUtilitiesForTests import get_test_filepath @@ -392,3 +394,25 @@ def test_CSVDataReader_delims(): with pytest.raises(SystemExit) as e2: _ = CSVDataReader(get_test_filepath("testcolour.txt"), "") assert e2.type == SystemExit + + +def test_CSVDataReader_blank_lines(): + """Test that we fail if the input file has blank lines.""" + with tempfile.TemporaryDirectory() as dir_name: + file_name = os.path.join(dir_name, "test.csv") + with open(file_name, 'w') as output: + output.write("ObjID,b,c\n") + output.write("0,1,2\n") + output.write("1,2,3\n") + + # The checks pass. + reader = CSVDataReader(file_name, sep="csv", full_checks=False) + with open(file_name, 'a') as output: + output.write("\n") # add a blank line + + # The code now fails by default. + with pytest.raises(SystemExit): + _ = CSVDataReader(file_name, sep="csv") + + # Checking can be turned off. + reader = CSVDataReader(file_name, sep="csv", full_checks=False) diff --git a/tests/readers/test_OrbitAuxReader.py b/tests/readers/test_OrbitAuxReader.py index 2938e4dd..9be75cbf 100755 --- a/tests/readers/test_OrbitAuxReader.py +++ b/tests/readers/test_OrbitAuxReader.py @@ -343,11 +343,10 @@ def test_orbit_sanity_check_com(): with pytest.raises(SystemExit) as err: _ = OrbitAuxReader(get_test_filepath("orbit_test_files/orbit_sanity_check_com_q<0.csv"),"csv") _.read_rows() - assert err.value.code == "ERROR: Invalid cometary elements detected for one or more objects (check log for information)" + assert err.value.code.startswith("ERROR: CSVReader: found a blank line") #for comentary e<0 with pytest.raises(SystemExit) as err: _ = OrbitAuxReader(get_test_filepath("orbit_test_files/orbit_sanity_check_com_e<0.csv"),"csv") _.read_rows() - assert err.value.code == "ERROR: Invalid cometary elements detected for one or more objects (check log for information)" - + assert err.value.code.startswith("ERROR: CSVReader: found a blank line") From 62d1e71c3abdc00785d3736640618ecde4549a85 Mon Sep 17 00:00:00 2001 From: Jeremy Kubica <104161096+jeremykubica@users.noreply.github.com> Date: Mon, 13 Jan 2025 14:03:03 -0500 Subject: [PATCH 2/5] Revert previous changes --- src/sorcha/readers/CSVReader.py | 27 +++++++++++++++++---------- tests/readers/test_CSVReader.py | 24 +++++++++++++++--------- tests/readers/test_OrbitAuxReader.py | 4 ++-- 3 files changed, 34 insertions(+), 21 deletions(-) diff --git a/src/sorcha/readers/CSVReader.py b/src/sorcha/readers/CSVReader.py index f108bef5..171a8207 100755 --- a/src/sorcha/readers/CSVReader.py +++ b/src/sorcha/readers/CSVReader.py @@ -12,7 +12,7 @@ class CSVDataReader(ObjectDataReader): Requires that the file's first column is ObjID. """ - def __init__(self, filename, sep="csv", header=-1, full_checks=True, **kwargs): + def __init__(self, filename, sep="csv", header=-1, **kwargs): """A class for reading the object data from a CSV file. Parameters @@ -28,11 +28,6 @@ def __init__(self, filename, sep="csv", header=-1, full_checks=True, **kwargs): The row number of the header. If not provided, does an automatic search. Default = -1 - full_checks : bool, optional - Scan the file pre-validating the format. This may be expensive for - larger files. - Default = True - **kwargs: dictionary, optional Extra arguments """ @@ -47,8 +42,6 @@ def __init__(self, filename, sep="csv", header=-1, full_checks=True, **kwargs): # To pre-validation and collect the header information. self.header_row = self._find_and_validate_header_line(header) - if full_checks: - self._prevalidate_csv(self.header_row) # A table holding just the object ID for each row. Only populated # if we try to read data for specific object IDs. @@ -139,14 +132,19 @@ def _check_header_line(self, header_line): pplogger.error(error_str) sys.exit(error_str) - def _prevalidate_csv(self, header): - """Perform a pre-validation of the CSV file, such as checking + def _validate_csv(self, header): + """Perform a validation of the CSV file, such as checking for blank lines. Parameters ---------- header : integer The row number of the header. + + Returns + ------- + : bool + True indicating success. """ pplogger = logging.getLogger(__name__) @@ -161,6 +159,7 @@ def _prevalidate_csv(self, header): ) pplogger.error(error_str) sys.exit(error_str) + return True def _read_rows_internal(self, block_start=0, block_size=None, **kwargs): """Reads in a set number of rows from the input. @@ -194,6 +193,7 @@ def _read_rows_internal(self, block_start=0, block_size=None, **kwargs): skip_rows.extend([i for i in range(self.header_row + 1, self.header_row + 1 + block_start)]) # Read the rows. + #try: if self.sep == "whitespace": res_df = pd.read_csv( self.filename, @@ -208,6 +208,13 @@ def _read_rows_internal(self, block_start=0, block_size=None, **kwargs): skiprows=skip_rows, nrows=block_size, ) + #except IndexError as current_exc: + # # Check if there is a more understandable error we can raise. + # self._validate_csv(self.header_row) + # + # # If we do not detect the a problem with _validate_csv, reraise the error. + # raise current_exc + return res_df def _build_id_map(self): diff --git a/tests/readers/test_CSVReader.py b/tests/readers/test_CSVReader.py index b3a438c1..e7d143e5 100755 --- a/tests/readers/test_CSVReader.py +++ b/tests/readers/test_CSVReader.py @@ -399,20 +399,26 @@ def test_CSVDataReader_delims(): def test_CSVDataReader_blank_lines(): """Test that we fail if the input file has blank lines.""" with tempfile.TemporaryDirectory() as dir_name: - file_name = os.path.join(dir_name, "test.csv") + file_name = os.path.join(dir_name, "test.ecsv") with open(file_name, 'w') as output: + output.write("# My comment\n") output.write("ObjID,b,c\n") output.write("0,1,2\n") - output.write("1,2,3\n") + output.write("1,1,2\n") + output.write("2,1,2\n") # The checks pass. - reader = CSVDataReader(file_name, sep="csv", full_checks=False) + reader = CSVDataReader(file_name, sep="csv", cache_table=False) + data = reader.read_objects([1, 2]) + assert len(data) == 2 + with open(file_name, 'a') as output: - output.write("\n") # add a blank line + output.write("3,1,2\n") + output.write("\n") # add another blank line + output.write("\n") # add another blank line + output.write("\n") # add another blank line # The code now fails by default. - with pytest.raises(SystemExit): - _ = CSVDataReader(file_name, sep="csv") - - # Checking can be turned off. - reader = CSVDataReader(file_name, sep="csv", full_checks=False) + reader2 = CSVDataReader(file_name, sep="csv", cache_table=False) + data2 = reader2.read_objects([1, 2]) + assert len(data2) == 2 diff --git a/tests/readers/test_OrbitAuxReader.py b/tests/readers/test_OrbitAuxReader.py index 9be75cbf..66d0cd96 100755 --- a/tests/readers/test_OrbitAuxReader.py +++ b/tests/readers/test_OrbitAuxReader.py @@ -343,10 +343,10 @@ def test_orbit_sanity_check_com(): with pytest.raises(SystemExit) as err: _ = OrbitAuxReader(get_test_filepath("orbit_test_files/orbit_sanity_check_com_q<0.csv"),"csv") _.read_rows() - assert err.value.code.startswith("ERROR: CSVReader: found a blank line") + assert err.value.code == "ERROR: Invalid cometary elements detected for one or more objects (check log for information)" #for comentary e<0 with pytest.raises(SystemExit) as err: _ = OrbitAuxReader(get_test_filepath("orbit_test_files/orbit_sanity_check_com_e<0.csv"),"csv") _.read_rows() - assert err.value.code.startswith("ERROR: CSVReader: found a blank line") + assert err.value.code == "ERROR: Invalid cometary elements detected for one or more objects (check log for information)" From 8bf96c016c54a5331ec491ed4803e48f2d068484 Mon Sep 17 00:00:00 2001 From: Jeremy Kubica <104161096+jeremykubica@users.noreply.github.com> Date: Mon, 13 Jan 2025 14:34:09 -0500 Subject: [PATCH 3/5] Only execute check if read_csv fails --- src/sorcha/readers/CSVReader.py | 52 +++++++++++++++++---------------- tests/readers/test_CSVReader.py | 13 ++++----- 2 files changed, 33 insertions(+), 32 deletions(-) diff --git a/src/sorcha/readers/CSVReader.py b/src/sorcha/readers/CSVReader.py index 171a8207..ffca500d 100755 --- a/src/sorcha/readers/CSVReader.py +++ b/src/sorcha/readers/CSVReader.py @@ -133,14 +133,18 @@ def _check_header_line(self, header_line): sys.exit(error_str) def _validate_csv(self, header): - """Perform a validation of the CSV file, such as checking - for blank lines. + """Perform a validation of the CSV file, such as checking for blank lines. + + This is an expensive test and should only be performed when something + has gone wrong. This is needed because panda's read_csv() function can + given vague errors (such as failing with an index error if the file + has blank lines at the end). Parameters ---------- header : integer The row number of the header. - + Returns ------- : bool @@ -154,9 +158,7 @@ def _validate_csv(self, header): # Check for blank lines. We do this explicitly because pandas read_csv() # has problems when skipping lines and finding blank lines at the end. if len(line) == 0 or line.isspace(): - error_str = ( - f"ERROR: CSVReader: found a blank line on line {i} " f" of {self.filename}." - ) + error_str = f"ERROR: CSVReader: found a blank line on line {i} of {self.filename}." pplogger.error(error_str) sys.exit(error_str) return True @@ -193,7 +195,6 @@ def _read_rows_internal(self, block_start=0, block_size=None, **kwargs): skip_rows.extend([i for i in range(self.header_row + 1, self.header_row + 1 + block_start)]) # Read the rows. - #try: if self.sep == "whitespace": res_df = pd.read_csv( self.filename, @@ -208,12 +209,6 @@ def _read_rows_internal(self, block_start=0, block_size=None, **kwargs): skiprows=skip_rows, nrows=block_size, ) - #except IndexError as current_exc: - # # Check if there is a more understandable error we can raise. - # self._validate_csv(self.header_row) - # - # # If we do not detect the a problem with _validate_csv, reraise the error. - # raise current_exc return res_df @@ -263,18 +258,25 @@ def _read_objects_internal(self, obj_ids, **kwargs): skipped_row.extend(~self.obj_id_table["ObjID"].isin(obj_ids).values) # Read the rows. - if self.sep == "whitespace": - res_df = pd.read_csv( - self.filename, - sep="\\s+", - skiprows=(lambda x: skipped_row[x]), - ) - else: - res_df = pd.read_csv( - self.filename, - delimiter=",", - skiprows=(lambda x: skipped_row[x]), - ) + try: + if self.sep == "whitespace": + res_df = pd.read_csv( + self.filename, + sep="\\s+", + skiprows=(lambda x: skipped_row[x]), + ) + else: + res_df = pd.read_csv( + self.filename, + delimiter=",", + skiprows=(lambda x: skipped_row[x]), + ) + except IndexError as current_exc: + # Check if there is a more understandable error we can raise. + self._validate_csv(self.header_row) + + # If we do not detect the a problem with _validate_csv, reraise the error. + raise current_exc return res_df def _process_and_validate_input_table(self, input_table, **kwargs): diff --git a/tests/readers/test_CSVReader.py b/tests/readers/test_CSVReader.py index e7d143e5..42fb0daf 100755 --- a/tests/readers/test_CSVReader.py +++ b/tests/readers/test_CSVReader.py @@ -400,7 +400,7 @@ def test_CSVDataReader_blank_lines(): """Test that we fail if the input file has blank lines.""" with tempfile.TemporaryDirectory() as dir_name: file_name = os.path.join(dir_name, "test.ecsv") - with open(file_name, 'w') as output: + with open(file_name, "w") as output: output.write("# My comment\n") output.write("ObjID,b,c\n") output.write("0,1,2\n") @@ -409,16 +409,15 @@ def test_CSVDataReader_blank_lines(): # The checks pass. reader = CSVDataReader(file_name, sep="csv", cache_table=False) - data = reader.read_objects([1, 2]) + data = reader.read_objects(["1", "2"]) assert len(data) == 2 - with open(file_name, 'a') as output: + with open(file_name, "a") as output: output.write("3,1,2\n") output.write("\n") # add another blank line output.write("\n") # add another blank line - output.write("\n") # add another blank line - + # The code now fails by default. reader2 = CSVDataReader(file_name, sep="csv", cache_table=False) - data2 = reader2.read_objects([1, 2]) - assert len(data2) == 2 + with pytest.raises(SystemExit): + _ = reader2.read_objects(["1", "2"]) From 1270ae04312613f19fb42dc33e2431a84b710ad7 Mon Sep 17 00:00:00 2001 From: Jeremy Kubica <104161096+jeremykubica@users.noreply.github.com> Date: Mon, 13 Jan 2025 14:35:12 -0500 Subject: [PATCH 4/5] Update test_OrbitAuxReader.py --- tests/readers/test_OrbitAuxReader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/readers/test_OrbitAuxReader.py b/tests/readers/test_OrbitAuxReader.py index 66d0cd96..550415a1 100755 --- a/tests/readers/test_OrbitAuxReader.py +++ b/tests/readers/test_OrbitAuxReader.py @@ -307,6 +307,7 @@ def test_orbit_reader_wrong_delim(): with pytest.raises(SystemExit) as err: _ = OrbitAuxReader(get_test_filepath("testorb.des"), "csv") + def test_orbit_sanity_check_kep(): """If an orbit parameter is undefined, raise an exception""" @@ -335,7 +336,6 @@ def test_orbit_sanity_check_kep(): assert err.value.code == "ERROR: Invalid Keplerian elements detected for one or more objects (check log for information)" - def test_orbit_sanity_check_com(): """If an orbit parameter is undefined, raise an exception""" From 75b2fdf7a06e3b8c0163f3302a6631584963ae6f Mon Sep 17 00:00:00 2001 From: Meg Schwamb Date: Mon, 13 Jan 2025 20:12:03 +0000 Subject: [PATCH 5/5] Update CSVReader.py fix typo in comments --- src/sorcha/readers/CSVReader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/sorcha/readers/CSVReader.py b/src/sorcha/readers/CSVReader.py index ffca500d..9e633a0f 100755 --- a/src/sorcha/readers/CSVReader.py +++ b/src/sorcha/readers/CSVReader.py @@ -275,7 +275,7 @@ def _read_objects_internal(self, obj_ids, **kwargs): # Check if there is a more understandable error we can raise. self._validate_csv(self.header_row) - # If we do not detect the a problem with _validate_csv, reraise the error. + # If we do not detect a problem with _validate_csv, reraise the error. raise current_exc return res_df