Merge pull request #1095 from dirac-institute/cvs_robustness

Improve error messaging in CSVReader
dirac-institute · Jan 13, 2025 · 06f2a44 · 06f2a44
2 parents 9e4594b + 75b2fdf
commit 06f2a44
Show file tree

Hide file tree

Showing 3 changed files with 82 additions and 14 deletions.
diff --git a/src/sorcha/readers/CSVReader.py b/src/sorcha/readers/CSVReader.py
@@ -40,6 +40,7 @@ def __init__(self, filename, sep="csv", header=-1, **kwargs):
             sys.exit(f"ERROR: Unrecognized delimiter ({sep})")
         self.sep = sep
 
+        # To pre-validation and collect the header information.
         self.header_row = self._find_and_validate_header_line(header)
 
         # A table holding just the object ID for each row. Only populated
@@ -131,6 +132,37 @@ def _check_header_line(self, header_line):
             pplogger.error(error_str)
             sys.exit(error_str)
 
+    def _validate_csv(self, header):
+        """Perform a validation of the CSV file, such as checking for blank lines.
+
+        This is an expensive test and should only be performed when something
+        has gone wrong.  This is needed because panda's read_csv() function can
+        given vague errors (such as failing with an index error if the file
+        has blank lines at the end).
+
+        Parameters
+        ----------
+        header : integer
+            The row number of the header.
+
+        Returns
+        -------
+        : bool
+            True indicating success.
+        """
+        pplogger = logging.getLogger(__name__)
+
+        with open(self.filename) as fh:
+            for i, line in enumerate(fh):
+                if i >= header:
+                    # Check for blank lines. We do this explicitly because pandas read_csv()
+                    # has problems when skipping lines and finding blank lines at the end.
+                    if len(line) == 0 or line.isspace():
+                        error_str = f"ERROR: CSVReader: found a blank line on line {i} of {self.filename}."
+                        pplogger.error(error_str)
+                        sys.exit(error_str)
+        return True
+
     def _read_rows_internal(self, block_start=0, block_size=None, **kwargs):
         """Reads in a set number of rows from the input.
 
@@ -177,6 +209,7 @@ def _read_rows_internal(self, block_start=0, block_size=None, **kwargs):
                 skiprows=skip_rows,
                 nrows=block_size,
             )
+
         return res_df
 
     def _build_id_map(self):
@@ -225,18 +258,25 @@ def _read_objects_internal(self, obj_ids, **kwargs):
         skipped_row.extend(~self.obj_id_table["ObjID"].isin(obj_ids).values)
 
         # Read the rows.
-        if self.sep == "whitespace":
-            res_df = pd.read_csv(
-                self.filename,
-                sep="\\s+",
-                skiprows=(lambda x: skipped_row[x]),
-            )
-        else:
-            res_df = pd.read_csv(
-                self.filename,
-                delimiter=",",
-                skiprows=(lambda x: skipped_row[x]),
-            )
+        try:
+            if self.sep == "whitespace":
+                res_df = pd.read_csv(
+                    self.filename,
+                    sep="\\s+",
+                    skiprows=(lambda x: skipped_row[x]),
+                )
+            else:
+                res_df = pd.read_csv(
+                    self.filename,
+                    delimiter=",",
+                    skiprows=(lambda x: skipped_row[x]),
+                )
+        except IndexError as current_exc:
+            # Check if there is a more understandable error we can raise.
+            self._validate_csv(self.header_row)
+
+            # If we do not detect a problem with _validate_csv, reraise the error.
+            raise current_exc
         return res_df
 
     def _process_and_validate_input_table(self, input_table, **kwargs):

diff --git a/tests/readers/test_CSVReader.py b/tests/readers/test_CSVReader.py
@@ -1,8 +1,10 @@
 import numpy as np
+import os
 import pandas as pd
 import pytest
 from numpy.testing import assert_equal
 from pandas.testing import assert_frame_equal
+import tempfile
 
 from sorcha.readers.CSVReader import CSVDataReader
 from sorcha.utilities.dataUtilitiesForTests import get_test_filepath
@@ -392,3 +394,30 @@ def test_CSVDataReader_delims():
     with pytest.raises(SystemExit) as e2:
         _ = CSVDataReader(get_test_filepath("testcolour.txt"), "")
     assert e2.type == SystemExit
+
+
+def test_CSVDataReader_blank_lines():
+    """Test that we fail if the input file has blank lines."""
+    with tempfile.TemporaryDirectory() as dir_name:
+        file_name = os.path.join(dir_name, "test.ecsv")
+        with open(file_name, "w") as output:
+            output.write("# My comment\n")
+            output.write("ObjID,b,c\n")
+            output.write("0,1,2\n")
+            output.write("1,1,2\n")
+            output.write("2,1,2\n")
+
+        # The checks pass.
+        reader = CSVDataReader(file_name, sep="csv", cache_table=False)
+        data = reader.read_objects(["1", "2"])
+        assert len(data) == 2
+
+        with open(file_name, "a") as output:
+            output.write("3,1,2\n")
+            output.write("\n")  # add another blank line
+            output.write("\n")  # add another blank line
+
+        # The code now fails by default.
+        reader2 = CSVDataReader(file_name, sep="csv", cache_table=False)
+        with pytest.raises(SystemExit):
+            _ = reader2.read_objects(["1", "2"])
diff --git a/tests/readers/test_OrbitAuxReader.py b/tests/readers/test_OrbitAuxReader.py
@@ -307,6 +307,7 @@ def test_orbit_reader_wrong_delim():
     with pytest.raises(SystemExit) as err:
         _ = OrbitAuxReader(get_test_filepath("testorb.des"), "csv")
 
+
 def test_orbit_sanity_check_kep():
     """If an orbit parameter is undefined, raise an exception"""
 
@@ -335,7 +336,6 @@ def test_orbit_sanity_check_kep():
     assert err.value.code == "ERROR: Invalid Keplerian elements detected for one or more objects (check log for information)"
 
 
-
 def test_orbit_sanity_check_com():
     """If an orbit parameter is undefined, raise an exception"""
 
@@ -350,4 +350,3 @@ def test_orbit_sanity_check_com():
         _ = OrbitAuxReader(get_test_filepath("orbit_test_files/orbit_sanity_check_com_e<0.csv"),"csv")
         _.read_rows()
     assert err.value.code == "ERROR: Invalid cometary elements detected for one or more objects (check log for information)"
-