Skip to content

Commit

Permalink
Merge pull request #38 from cfpb/features/31_switch_to_two-phased_val…
Browse files Browse the repository at this point in the history
…idation_scheme

Switched to two phase validation and deleted schema.py
  • Loading branch information
nargis-sultani authored Aug 26, 2023
2 parents 30e4650 + 44b1f50 commit 810de45
Show file tree
Hide file tree
Showing 5 changed files with 115 additions and 3,400 deletions.
26 changes: 25 additions & 1 deletion src/validator/create_schemas.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
with validations listed in phase 1 and phase 2."""

from pandera import DataFrameSchema
from pandera.errors import SchemaErrors
from phase_validations import get_phase_1_and_2_validations_for_lei
from schema_template import get_template

Expand All @@ -13,11 +14,34 @@


def get_schema_by_phase_for_lei(template: dict, phase: str, lei: str = None):
    """Build a pandera schema that carries only the checks of one phase.

    Args:
        template: Mapping of column name to a column definition exposing a
            ``checks`` attribute. It is modified in place: each column's
            ``checks`` is overwritten with that column's checks for ``phase``.
        phase: Key selecting which list of checks to attach, e.g.
            ``"phase_1"`` or ``"phase_2"``.
        lei: Legal Entity Identifier forwarded to the validation builder.

    Returns:
        A ``DataFrameSchema`` constructed from the updated ``template``.
    """
    # Build the column -> {phase: checks} mapping a single time. Re-invoking
    # the builder for every column would reconstruct every check object on
    # each iteration for no benefit.
    validations_by_column = get_phase_1_and_2_validations_for_lei(lei)

    # Iterate key/value pairs explicitly; looping over the mapping itself
    # yields only the column names.
    for column, validations in validations_by_column.items():
        template[column].checks = validations[phase]

    return DataFrameSchema(template)


def print_schema_errors(errors: SchemaErrors, phase: str):
    """Print a short report for every failure collected in ``errors``.

    Args:
        errors: Exception whose ``schema_errors`` entries each hold the
            underlying error object under the ``"error"`` key.
        phase: Label prefixed to each report line (e.g. ``"Phase 1"``).
    """
    for entry in errors.schema_errors:
        failure = entry["error"]
        # The schema attached to the failure names the dataframe column
        # that was being checked.
        column_name = failure.schema.name

        try:
            # Custom checks are objects exposing ``name``; their output is
            # either a boolean series or a single bool.
            check_name, check_output = failure.check.name, failure.check_output
        except AttributeError:
            # Built-in checks such as unique=True have no ``name`` on the
            # check, so use the check itself as the name and fall back to
            # the raw message string (which would need manual parsing).
            check_name, check_output = failure.check, failure.args[0]

        print(f"{phase} Validation `{check_name}` failed for column `{column_name}`")
        print(check_output)
        print("")


def get_phase_1_schema_for_lei(lei: str = None):
    """Return the schema holding the phase 1 checks for the given LEI.

    Delegates to ``get_schema_by_phase_for_lei`` with the module-level
    ``phase_1_template`` and the ``"phase_1"`` key.
    """
    return get_schema_by_phase_for_lei(
        template=phase_1_template,
        phase="phase_1",
        lei=lei,
    )

Expand Down
36 changes: 16 additions & 20 deletions src/validator/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,12 @@
import sys

import pandas as pd
from create_schemas import (
get_phase_1_schema_for_lei,
get_phase_2_schema_for_lei,
print_schema_errors,
)
from pandera.errors import SchemaErrors
from schema import get_schema_for_lei


def csv_to_df(path: str) -> pd.DataFrame:
Expand All @@ -28,29 +32,21 @@ def run_validation_on_df(df: pd.DataFrame, lei: str) -> None:
print(df)
print("")

sblar_schema = get_schema_for_lei(lei)
phase_1_failure_cases = None

phase_1_sblar_schema = get_phase_1_schema_for_lei(lei)
try:
sblar_schema(df, lazy=True)
phase_1_sblar_schema(df, lazy=True)
except SchemaErrors as errors:
for error in errors.schema_errors:
# Name of the column in the dataframe being checked
column_name = error["error"].schema.name
phase_1_failure_cases = errors.failure_cases
print_schema_errors(errors, "Phase 1")

# built in checks such as unique=True are different than custom
# checks unfortunately so the name needs to be accessed differently
try:
check_name = error["error"].check.name
# This will either be a boolean series or a single bool
check_output = error["error"].check_output
except AttributeError:
check_name = error["error"].check
# this is just a string that we'd need to parse manually
check_output = error["error"].args[0]

print(f"Validation `{check_name}` failed for column `{column_name}`")
print(check_output)
print("")
if phase_1_failure_cases is None:
phase_2_sblar_schema = get_phase_2_schema_for_lei(lei)
try:
phase_2_sblar_schema(df, lazy=True)
except SchemaErrors as errors:
print_schema_errors(errors, "Phase 2")


if __name__ == "__main__":
Expand Down
88 changes: 70 additions & 18 deletions src/validator/phase_validations.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,8 +75,9 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None):
string_contains,
name="uid.invalid_uid_lei",
description=(
"The first 20 characters of the 'unique identifier' should match "
"the Legal Entity Identifier (LEI) for the financial institution."
"The first 20 characters of the 'unique identifier' should"
" match the Legal Entity Identifier (LEI) for the financial"
" institution."
),
element_wise=True,
containing_value=lei,
Expand Down Expand Up @@ -950,11 +951,11 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None):
description=(
"When 'interest rate type' does not equal 1"
" (adjustable interest rate, no initial rate period),"
" 3 (initial rate period > 12 months, adjustable interest rate),"
" or 5 (initial rate period <= 12 months, variable interest"
" rate), 'adjustable rate transaction: margin' must be blank."
" When 'interest rate type' equals 1, 3, or 5, 'variable"
" rate transaction: margin' must not be blank."
" 3 (initial rate period > 12 months, adjustable interest"
" rate), or 5 (initial rate period <= 12 months, variable "
"interest rate), 'adjustable rate transaction: margin' must "
"be blank. When 'interest rate type' equals 1, 3, or 5, "
"'variable rate transaction: margin' must not be blank."
),
groupby="pricing_interest_rate_type",
condition_values={"1", "3", "5"},
Expand All @@ -978,8 +979,8 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None):
is_valid_enum,
name="pricing_adj_index_name.invalid_enum_value",
description=(
"'Adjustable rate transaction: index name' must equal 1, 2, 3, 4,"
"5, 6, 7, 8, 9, 10, 977, or 999."
"'Adjustable rate transaction: index name' must equal "
"1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 977, or 999."
),
element_wise=True,
accepted_values=[
Expand Down Expand Up @@ -1036,8 +1037,8 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None):
max_value=300,
name="pricing_adj_index_name_ff.invalid_text_length",
description=(
"'Adjustable rate transaction: index name: other' must not exceed"
"300 characters in length."
"'Adjustable rate transaction: index name: other' must not"
" exceed 300 characters in length."
),
),
],
Expand Down Expand Up @@ -1131,20 +1132,19 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None):
],
"phase_2": [],
},
"pricing_mca_addcost_flag": {"phase_1": [], "phase_2": []},
"pricing_mca_addcost": {"phase_1": [], "phase_2": []},
"pricing_prepenalty_allowed": {
"pricing_mca_addcost_flag": {
"phase_1": [
SBLCheck(
is_valid_enum,
name="pricing_prepenalty_allowed.invalid_enum_value",
name="pricing_mca_addcost_flag.invalid_enum_value",
description=(
"'Prepayment penalty could be imposed' must equal 1, 2, or 999."
"'MCA/sales-based: additional cost for merchant cash "
"advances or other sales-based financing: NA flag' "
"must equal 900 or 999."
),
element_wise=True,
accepted_values=[
"1",
"2",
"900",
"999",
],
),
Expand Down Expand Up @@ -1172,6 +1172,58 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None):
),
],
},
"pricing_mca_addcost": {
"phase_1": [
SBLCheck(
is_number,
name="pricing_mca_addcost.invalid_numeric_format",
description=(
"When present, 'MCA/sales-based: additional cost for "
"merchant cash advances or other sales-based financing' "
"must be a numeric value"
),
element_wise=True,
accept_blank=True,
),
],
"phase_2": [
SBLCheck(
has_no_conditional_field_conflict,
name="pricing_mca_addcost.conditional_field_conflict",
description=(
"When 'MCA/sales-based: additional cost for merchant "
"cash advances or other sales-based financing: NA flag' "
"does not equal 900 (applicable), 'MCA/sales-based: "
"additional cost for merchant cash advances or other "
"sales-based financing' must be blank. When 'MCA/sales-based: "
"additional cost for merchant cash advances or other "
"sales-based financing: NA flag' equals 900, MCA/sales-based: "
"additional cost for merchant cash advances or other "
"sales-based financing’ must not be blank."
),
groupby="pricing_mca_addcost_flag",
condition_values={"900"},
),
],
},
"pricing_prepenalty_allowed": {
"phase_1": [
SBLCheck(
is_valid_enum,
name="pricing_prepenalty_allowed.invalid_enum_value",
description=(
"'Prepayment penalty could be imposed' must equal 1, 2, or 999."
),
element_wise=True,
accepted_values=[
"1",
"2",
"999",
],
),
],
"phase_2": [],
},
"pricing_prepenalty_exists": {
"phase_1": [
SBLCheck(
Expand Down
Loading

0 comments on commit 810de45

Please sign in to comment.