From 971dc9156cb2c8f9f93d36682f933facd1e70aed Mon Sep 17 00:00:00 2001 From: jcadam14 <41971533+jcadam14@users.noreply.github.com> Date: Tue, 30 Apr 2024 12:00:08 -0600 Subject: [PATCH] 168 see if the df to json can be improved using pivot tables similar to the df to download (#170) Closes #168 Pivot tables turned out not to be usable here, but after studying the structure of the df.to_json() output I found a much quicker way to translate it into the JSON we want, using plain Python dict lookups instead of DataFrame operations. This significantly improves processing time, especially as the dataset grows. --- src/regtech_data_validator/data_formatters.py | 67 ++++++++++++------- 1 file changed, 44 insertions(+), 23 deletions(-) diff --git a/src/regtech_data_validator/data_formatters.py b/src/regtech_data_validator/data_formatters.py index 093e10c3..96393379 100644 --- a/src/regtech_data_validator/data_formatters.py +++ b/src/regtech_data_validator/data_formatters.py @@ -104,32 +104,53 @@ def df_to_table(df: pd.DataFrame) -> str: def df_to_json(df: pd.DataFrame) -> str: - findings_json = [] + output_json = [] if not df.empty: - - for _, group_df in df.groupby(['validation_id']): - v_head = group_df.iloc[0] - - finding_json = { + df.reset_index(drop=True, inplace=True) + findings_json = json.loads(df.to_json(orient='columns')) + + grouped_data = {} + for i in range(len(findings_json['record_no'])): + validation_id = findings_json['validation_id'][str(i)] + if validation_id not in grouped_data: + grouped_data[validation_id] = [] + grouped_data[validation_id].append( + { + 'record_no': findings_json['record_no'][str(i)], + 'uid': findings_json['uid'][str(i)], + 'field_name': findings_json['field_name'][str(i)], + 'field_value': findings_json['field_value'][str(i)], + } + ) + + for validation_id, records in grouped_data.items(): + for key, value in findings_json['validation_id'].items(): + if validation_id == value: + validation_key = key + break + validation_info = { 
'validation': { - 'id': v_head['validation_id'], - 'name': v_head['validation_name'], - 'description': v_head['validation_desc'], - 'severity': v_head['validation_severity'], - 'scope': v_head['scope'], - 'fig_link': v_head['fig_link'], + 'id': validation_id, + 'name': findings_json['validation_name'][validation_key], + 'description': findings_json['validation_desc'][validation_key], + 'severity': findings_json['validation_severity'][validation_key], + 'scope': findings_json['scope'][validation_key], + 'fig_link': findings_json['fig_link'][validation_key], }, 'records': [], } - findings_json.append(finding_json) - - for _, rec_df in group_df.groupby(by='record_no'): - rec = rec_df.iloc[0] - record_json = {'record_no': int(rec['record_no']), 'uid': rec['uid'], 'fields': []} - finding_json['records'].append(record_json) - - fields = rec_df.iterrows() if v_head['validation_id'] in more_than_2_fields else rec_df[::-1].iterrows() - for _, field_data in fields: - record_json['fields'].append({'name': field_data['field_name'], 'value': field_data['field_value']}) - return json.dumps(findings_json, indent=4) + records_dict = {} + for record in records: + record_no = record['record_no'] + if record_no not in records_dict: + records_dict[record_no] = {'record_no': record['record_no'], 'uid': record['uid'], 'fields': []} + records_dict[record_no]['fields'].append({'name': record['field_name'], 'value': record['field_value']}) + validation_info['records'] = list(records_dict.values()) + for record in validation_info['records']: + if len(record['fields']) == 2: + record['fields'][0], record['fields'][1] = record['fields'][1], record['fields'][0] + output_json.append(validation_info) + + output_json = sorted(output_json, key=lambda x: x['validation']['id']) + return json.dumps(output_json, indent=4)