Skip to content

Commit

Permalink
168 see if the df to json can be improved using pivot tables similar …
Browse files Browse the repository at this point in the history
…to the df to download (#170)

Closes #168 

I wasn't able to use pivot tables, but after looking at the df.to_json()
output and understanding its structure, I found a much quicker way to
translate it into the JSON we want using plain Python dict lookups
instead of DataFrame operations. This is a significant improvement in
processing time, especially as the dataset grows.
  • Loading branch information
jcadam14 authored Apr 30, 2024
1 parent 7b0aeff commit 971dc91
Showing 1 changed file with 44 additions and 23 deletions.
67 changes: 44 additions & 23 deletions src/regtech_data_validator/data_formatters.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,32 +104,53 @@ def df_to_table(df: pd.DataFrame) -> str:


def df_to_json(df: pd.DataFrame) -> str:
    """Serialize a findings DataFrame to a JSON string grouped by validation.

    Each row of ``df`` is one (validation, record, field) finding. Rows are
    grouped first by ``validation_id`` and then by ``record_no``; the result
    is a JSON array with one object per validation::

        {
            "validation": {"id", "name", "description", "severity",
                           "scope", "fig_link"},
            "records": [{"record_no", "uid",
                         "fields": [{"name", "value"}, ...]}, ...]
        }

    Args:
        df: DataFrame that provides the columns ``validation_id``,
            ``validation_name``, ``validation_desc``, ``validation_severity``,
            ``scope``, ``fig_link``, ``record_no``, ``uid``, ``field_name``
            and ``field_value``. It is not modified.

    Returns:
        The JSON text (``indent=4``), with validations sorted by id. An empty
        DataFrame yields ``"[]"``.

    Raises:
        KeyError: If a required column is missing from a non-empty ``df``.
    """
    output_json = []

    if not df.empty:
        # Re-index a copy (not in place) so the caller's DataFrame is left
        # untouched. A fresh 0..n-1 index also guarantees unique row keys,
        # which ``to_json(orient='columns')`` requires.
        # Round-tripping through ``to_json`` yields plain Python values in the
        # shape {column: {row_key: value}} with row keys in row order.
        findings = json.loads(df.reset_index(drop=True).to_json(orient='columns'))

        validation_ids = findings['validation_id']
        record_nos = findings['record_no']
        uids = findings['uid']
        field_names = findings['field_name']
        field_values = findings['field_value']

        # Single pass over the rows: remember the first row of every
        # validation (its metadata is read from that row below) and collect
        # the fields per record. Dicts keep insertion order, so validations
        # and records stay in order of first appearance.
        first_row_key = {}
        records_by_validation = {}
        for row_key, validation_id in validation_ids.items():
            first_row_key.setdefault(validation_id, row_key)
            records = records_by_validation.setdefault(validation_id, {})

            record_no = record_nos[row_key]
            record = records.get(record_no)
            if record is None:
                record = {'record_no': record_no, 'uid': uids[row_key], 'fields': []}
                records[record_no] = record
            record['fields'].append({'name': field_names[row_key], 'value': field_values[row_key]})

        for validation_id, records in records_by_validation.items():
            head_key = first_row_key[validation_id]
            record_list = list(records.values())

            # Records with exactly two fields are emitted in reverse row order.
            # NOTE(review): presumably this keeps the field ordering consumers
            # already expect for two-field validations - confirm before changing.
            for record in record_list:
                if len(record['fields']) == 2:
                    record['fields'].reverse()

            output_json.append(
                {
                    'validation': {
                        'id': validation_id,
                        'name': findings['validation_name'][head_key],
                        'description': findings['validation_desc'][head_key],
                        'severity': findings['validation_severity'][head_key],
                        'scope': findings['scope'][head_key],
                        'fig_link': findings['fig_link'][head_key],
                    },
                    'records': record_list,
                }
            )

        # Stable sort, so the output order is deterministic by validation id.
        output_json.sort(key=lambda item: item['validation']['id'])

    return json.dumps(output_json, indent=4)

0 comments on commit 971dc91

Please sign in to comment.