Ujson and chunking (#183)

Closes #181 Closes #182
cfpb · May 15, 2024 · 2da1812 · 2da1812
1 parent 190a3aa
commit 2da1812
Show file tree

Hide file tree

Showing 4 changed files with 148 additions and 65 deletions.
diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -16,6 +16,7 @@ pandas = "^2.2.2"
 pandera = "^0.19.3"
 requests = "^2.31.0"
 tabulate = "^0.9.0"
+ujson = "^5.9.0"
 
 [tool.poetry.group.dev.dependencies]
 pytest = "8.2.0"

diff --git a/src/regtech_data_validator/create_schemas.py b/src/regtech_data_validator/create_schemas.py
@@ -126,10 +126,11 @@ def validate(schema: DataFrameSchema, submission_df: pd.DataFrame) -> tuple[bool
         schema(submission_df, lazy=True)
     except SchemaErrors as err:
         is_valid = False
-
+        check_findings = []
         # NOTE: `type: ignore` because SchemaErrors.schema_errors is supposed to be
         #       `list[dict[str,Any]]`, but it's actually of type `SchemaError`
         schema_error: SchemaError
+
         for schema_error in err.schema_errors:  # type: ignore
             check = schema_error.check
             column_name = schema_error.schema.name
@@ -145,27 +146,21 @@ def validate(schema: DataFrameSchema, submission_df: pd.DataFrame) -> tuple[bool
                 raise RuntimeError(
                     f'Check {check} type on {column_name} column not supported. Must be of type {SBLCheck}'
                 ) from schema_error
-
             fields = _get_check_fields(check, column_name)
-
             check_output: pd.Series | None = schema_error.check_output
 
             if check_output is not None:
                 # Filter data not associated with failed Check, and update index for merging with findings_df
                 failed_records_df = _filter_valid_records(submission_df, check_output, fields)
                 failed_records_df.index += next_finding_no
                 next_finding_no = failed_records_df.tail(1).index + 1  # type: ignore
-
                 failed_record_fields_df = _records_to_fields(failed_records_df)
-                check_findings_df = _add_validation_metadata(failed_record_fields_df, check)
-
-                findings_df = pd.concat([findings_df, check_findings_df])
+                check_findings.append(_add_validation_metadata(failed_record_fields_df, check))
             else:
                 # The above exception handling _should_ prevent this from ever happening, but...just in case.
                 raise RuntimeError(f'No check output for "{check.name}" check.  Pandera SchemaError: {schema_error}')
-
+        findings_df = pd.concat(check_findings)
     updated_df = add_uid(findings_df, submission_df)
-
     return is_valid, updated_df
 
 
@@ -182,6 +177,7 @@ def add_uid(results_df: pd.DataFrame, submission_df: pd.DataFrame) -> pd.DataFra
 
 
 def validate_phases(df: pd.DataFrame, context: dict[str, str] | None = None) -> tuple[bool, pd.DataFrame]:
+
     p1_is_valid, p1_findings = validate(get_phase_1_schema_for_lei(context), df)
 
     if not p1_is_valid: