Skip to content

Commit

Permalink
[DC-3367] Add the standard mappings checks
Browse files Browse the repository at this point in the history
  • Loading branch information
brendagutman committed Jan 8, 2024
1 parent b518b83 commit 82174b4
Showing 1 changed file with 23 additions and 53 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -165,16 +165,9 @@ def read_csv_file(file_path, table_file_name, custom_column_names=None):
OR concept_id IN UNNEST ({search_for_standards_concept_list})
''')

# +
# initial cleaning of the current ppi vocabulary for analysis use.
current_vocabulary['concept_code']=current_vocabulary['concept_code'].str.lower()

# # create filter_vocabulary. Use when joining via concept_code or concept_name
# filter_vocabulary= current_vocabulary[['concept_code','concept_name']]

# # create filter_vocabulary_ids. Use when joining via concept_code or concept_id
# filter_vocabulary_ids= current_vocabulary[['concept_code','concept_id']]


# -

Expand Down Expand Up @@ -245,7 +238,6 @@ def get_concept_data(columns:list):

# cleaning the generated mock concept data
mock_concept = build_concept.drop_duplicates(keep='first')
mock_concept

# +
# Create and insert the Module Concept
Expand Down Expand Up @@ -289,7 +281,6 @@ def get_concept_data(columns:list):
#
# TODO: descriptive concepts can have branching logic. These mappings do not need to be created. Use only branching logic where neither concept is a descriptive field_type

# +
# Create half of the CONCEPT_RELATIONSHIP table
# These are all relationships from parent to child and concept to standard.
build_concept_relationship_query = (f'''
Expand Down Expand Up @@ -389,12 +380,8 @@ def get_concept_data(columns:list):
build_cr = execute(final_build_query)
half_concept_relationship = build_cr.drop_duplicates(keep='first')

half_concept_relationship
# -

# # copy to make a base df for the reverse mappings
reverse_half_cr = half_concept_relationship.copy()
reverse_half_cr

# +
# create the reverse mappings
Expand Down Expand Up @@ -437,8 +424,6 @@ def get_concept_data(columns:list):
print(f'length of concept_relationship without duplicates and ignored relationships {len(mock_concept_relationship)}')
# -

mock_concept_relationship

mock_cr_w_ids = mock_concept_relationship.copy()

# +
Expand Down Expand Up @@ -472,18 +457,15 @@ def get_concept_data(columns:list):
reformat_cr['concept_id_2'] = reformat_cr['concept_id_2'].fillna(0).astype(int)

reformat_cr
# -

# +
# remove the 'mapping to self' relationships (Maps to, Mapped from) for the concepts Odysseus has not flagged as Standard
# list non-standards
ody_nonstandards = ody_concept.loc[ody_concept['standard_concept'].isna(), 'concept_id'].to_list()
# df without the non-standard mapping to self relationships
cr_removed_ns_mapping = reformat_cr[~((reformat_cr['concept_id_1'] == reformat_cr['concept_id_2'])
& reformat_cr['concept_id_1'].isin(ody_nonstandards))]

cr_removed_ns_mapping
# -

mock_fin = cr_removed_ns_mapping.copy()

# # Concept table checks
Expand Down Expand Up @@ -619,10 +601,6 @@ def get_concept_data(columns:list):
already_exist_in_athena = athena_join2[athena_join2['concept_code_exists_in_athena'] == 'True']
already_exist_in_athena

athena_join2

# Concept check: Not mapped concepts, already mapped concepts, ids not unique

# # Print Feedback concept

# +
Expand All @@ -647,6 +625,8 @@ def get_concept_data(columns:list):
feedback_with_athena.to_csv(f'data_storage/mapping_files/{feedback_c}')
# -

# # Concept table checks

# ## concept check: concept_class_id

# Ignore module codes
Expand Down Expand Up @@ -683,19 +663,12 @@ def get_concept_data(columns:list):
standards_check
# -



# ## TODO Add second standard check. Are the concepts marked standard mapped to themselves and are the non-standard ones mapped to another vocabulary.
#

# # TODO descriptive types are seen in the cr file. stop this
# # TODO nulls are present in cr file. stop this
#
#
# # cr feedback concept_id_1 for ace concepts not showing up?
#

# # Concept_relationship table checks
# ## TODO
# * Add second standard check. Are the concepts marked standard mapped to themselves and are the non-standard ones mapped to another vocabulary.
# * descriptive types are seen in the cr file. stop this
# * nulls are present in cr file. stop this
# * cr feedback concept_id_1 for ace concepts not showing up?
# * Concept_relationship table checks

mock_vs_ody = mock_fin.merge(ody_cr,
how = 'outer',
Expand Down Expand Up @@ -725,20 +698,10 @@ def get_concept_data(columns:list):
approved_rel=approved_rel.sort_values(by='concept_id_1', ascending=True).reset_index(drop=True)
approved_rel

# # sanity check pop up analysis. Many of the results above were linked to these two concept_ids which are a known issue for odysseus to fix.
# # look at the ones that are not linked to the ids
# id_list = [903079.0,903087.0]
# issues_added = added_rel[
# ( ~added_rel['concept_id_1'].isin(id_list) )&
# (~added_rel['concept_id_2'].isin(id_list))
# ].reset_index(drop=True)

# # Transformation
# manual edits to send back to odysseus
# Create a table that shows the status of each concept_code (accepted, concept_code_does not match dictionary, no matching concept_id)

ody_cr

# step 1 make a copy to work off of
ody_cr_no_ids = ody_cr.copy()

Expand Down Expand Up @@ -868,9 +831,6 @@ def get_concept_data(columns:list):
cr_merge_missing['concept_code_2'] = cr_merge_missing['concept_code_2_x'].combine_first(cr_merge_missing['concept_code_2_y'])
cr_merge_missing['status'] = cr_merge_missing['status_x'].combine_first(cr_merge_missing['status_y'])
cr_merge_missing = cr_merge_missing.drop(columns={'concept_code_1_x', 'concept_code_1_y', 'concept_code_2_x', 'concept_code_2_y', 'status_x', 'status_y'})

cr_merge_missing

# -

base_cr_with_status = cr_merge_missing.copy()
Expand Down Expand Up @@ -898,14 +858,15 @@ def get_concept_data(columns:list):
concepts_w_incorrect_domain = vocab_filter.loc[vocab_filter['domain_id'] != 'Observation', 'concept_id'].to_list()


standard_domain_failures = base_cr_with_status[base_cr_with_status['concept_id_1'].isin(concepts_w_incorrect_domain)]
standard_domain_failures = base_cr_with_status[(base_cr_with_status['concept_id_1'].isin(concepts_w_incorrect_domain))
|(base_cr_with_status['concept_id_2'].isin(concepts_w_incorrect_domain))]

base_cr_with_status['status'] = np.where(
base_cr_with_status['concept_id_1'].isin(concepts_w_incorrect_domain),
base_cr_with_status['concept_id_1'].isin(concepts_w_incorrect_domain)
|base_cr_with_status['concept_id_2'].isin(concepts_w_incorrect_domain),
'ody generated. Standard concept domain issue',
base_cr_with_status['status']
)
base_cr_with_status

# +
# These mappings are generally decided by odysseus. This is created as a visual check of these mappings.
Expand Down Expand Up @@ -967,5 +928,14 @@ def get_concept_data(columns:list):

feedback_with_athena_cr.to_csv(f'data_storage/mapping_files/{feedback_cr}')
# -

# Check for:
# concept_feedback
# 1. No concepts should already exist in Athena.
# 2. All new questions and answers should be given a new concept. None should have the status 'no matching concept_id'
# 3. concept_ids that are not unique
#
# concept_relationship_feedback
# 1. Concepts that are not mapped
# 2. Concepts that are already mapped
#

0 comments on commit 82174b4

Please sign in to comment.