# -*- coding: utf-8 -*-
# ---
# jupyter:
#   jupytext:
#     text_representation:
#       extension: .py
#       format_name: light
#       format_version: '1.5'
#     jupytext_version: 1.7.1
#   kernelspec:
#     display_name: Python 3
#     language: python
#     name: python3
# ---

# Purpose: Use this notebook to search for ids in sandbox datasets

# + tags=["parameters"]
project_id = ''
sandbox_dataset_id = ''  # Sandbox dataset to search in for the problem ids
search_field = ''  # field in the sandbox tables expected to contain the ids. Example: observation_id
run_as = ''  # account to impersonate when running the queries

# +
from utils import auth
import pandas as pd
from gcloud.bq import BigQueryClient
from common import JINJA_ENV
from analytics.cdr_ops.notebook_utils import execute, IMPERSONATION_SCOPES, render_message

# Show every row of any DataFrame rendered in this notebook (no truncation).
pd.set_option('display.max_rows', None)
# -

# Queries below run as the `run_as` account via impersonated credentials.
impersonation_creds = auth.get_impersonation_credentials(
    run_as, target_scopes=IMPERSONATION_SCOPES)

client = BigQueryClient(project_id, credentials=impersonation_creds)

# # Create list of ids to search
# Run the following cell to create a list of ids to search for. Recommend using a LIMIT if the list is quite large.
# OR
# Manually create a list of ids called ids_list

# +
# Fill in a query that returns one column named after `search_field`
# containing the ids to look for.
tpl = JINJA_ENV.from_string('''
{INSERT QUERY HERE}
''')
query = tpl.render()
ids = execute(client, query)

ids_list = ids[search_field].to_list()


# -

# # Get the tables that contain the search_field, from the sandbox dataset
#
# The query will return the sandbox tables in the order of their creation time. Earliest to latest.

# +
tpl = JINJA_ENV.from_string('''
SELECT
  c.*, t.creation_time
  , ROW_NUMBER() OVER (ORDER BY t.creation_time) as run_order
FROM
  `{{project_id}}.{{sandbox_dataset_id}}.INFORMATION_SCHEMA.COLUMNS` AS c
JOIN
  `{{project_id}}.{{sandbox_dataset_id}}.INFORMATION_SCHEMA.TABLES` AS t
ON
  c.table_name = t.table_name
WHERE
  c.column_name = '{{search_field}}'
ORDER BY
  t.creation_time;
''')
query = tpl.render(sandbox_dataset_id=sandbox_dataset_id,
                   project_id=project_id,
                   search_field=search_field)
tables_in_dataset = execute(client, query)

tables_list = tables_in_dataset['table_name'].to_list()
tables_list
# -

# # Search in each sandbox table and print results

# The per-table count query differs only in the table name, so compile the
# template once outside the loop instead of on every iteration.
# NOTE(review): `ids_list` is rendered into the SQL via its Python repr —
# this works for lists of ints and simple strings, but ids containing quotes
# would break the query; confirm the id domain before reuse.
search_tpl = JINJA_ENV.from_string('''
    SELECT
        '{{table}}' as table,
        COUNT(*) AS n_{{search_field}}s_found
    FROM
        `{{project_id}}.{{sandbox_dataset_id}}.{{table}}`
    WHERE {{search_field}} IN UNNEST ({{ids_list}})
    ''')
queries = []
for table in tables_list:
    queries.append(
        search_tpl.render(sandbox_dataset_id=sandbox_dataset_id,
                          project_id=project_id,
                          table=table,
                          ids_list=ids_list,
                          search_field=search_field))
df = execute(client, '\nUNION ALL\n'.join(queries))


# # Order and view the results

# +
# Define the run order: tables_list is already sorted by creation_time, so a
# Categorical on it sorts result rows into the tables' creation order.
df['run_order'] = pd.Categorical(df['table'],
                                 categories=tables_list,
                                 ordered=True)

# Sort the results and keep only the reporting columns, selected by name
# rather than position so the result does not depend on column order.
ordered_df = (
    df.sort_values(by='run_order')
    .loc[:, ['table', f'n_{search_field}s_found']]
    .reset_index(drop=True)
)

ordered_df
# -