Skip to content

Commit

Permalink
Ujson and chunking (#183)
Browse files Browse the repository at this point in the history
Closes #181 
Closes #182
  • Loading branch information
jcadam14 authored May 15, 2024
1 parent 190a3aa commit 2da1812
Show file tree
Hide file tree
Showing 4 changed files with 148 additions and 65 deletions.
95 changes: 91 additions & 4 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ pandas = "^2.2.2"
pandera = "^0.19.3"
requests = "^2.31.0"
tabulate = "^0.9.0"
ujson = "^5.9.0"

[tool.poetry.group.dev.dependencies]
pytest = "8.2.0"
Expand Down
14 changes: 5 additions & 9 deletions src/regtech_data_validator/create_schemas.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,10 +126,11 @@ def validate(schema: DataFrameSchema, submission_df: pd.DataFrame) -> tuple[bool
schema(submission_df, lazy=True)
except SchemaErrors as err:
is_valid = False

check_findings = []
# NOTE: `type: ignore` because SchemaErrors.schema_errors is supposed to be
# `list[dict[str,Any]]`, but its items are actually `SchemaError` objects
schema_error: SchemaError

for schema_error in err.schema_errors: # type: ignore
check = schema_error.check
column_name = schema_error.schema.name
Expand All @@ -145,27 +146,21 @@ def validate(schema: DataFrameSchema, submission_df: pd.DataFrame) -> tuple[bool
raise RuntimeError(
f'Check {check} type on {column_name} column not supported. Must be of type {SBLCheck}'
) from schema_error

fields = _get_check_fields(check, column_name)

check_output: pd.Series | None = schema_error.check_output

if check_output is not None:
# Filter data not associated with failed Check, and update index for merging with findings_df
failed_records_df = _filter_valid_records(submission_df, check_output, fields)
failed_records_df.index += next_finding_no
next_finding_no = failed_records_df.tail(1).index + 1 # type: ignore

failed_record_fields_df = _records_to_fields(failed_records_df)
check_findings_df = _add_validation_metadata(failed_record_fields_df, check)

findings_df = pd.concat([findings_df, check_findings_df])
check_findings.append(_add_validation_metadata(failed_record_fields_df, check))
else:
# The above exception handling _should_ prevent this from ever happening, but...just in case.
raise RuntimeError(f'No check output for "{check.name}" check. Pandera SchemaError: {schema_error}')

findings_df = pd.concat(check_findings)
updated_df = add_uid(findings_df, submission_df)

return is_valid, updated_df


Expand All @@ -182,6 +177,7 @@ def add_uid(results_df: pd.DataFrame, submission_df: pd.DataFrame) -> pd.DataFra


def validate_phases(df: pd.DataFrame, context: dict[str, str] | None = None) -> tuple[bool, pd.DataFrame]:

p1_is_valid, p1_findings = validate(get_phase_1_schema_for_lei(context), df)

if not p1_is_valid:
Expand Down
Loading

0 comments on commit 2da1812

Please sign in to comment.