import duckdb
import gzip
import json
import os

from clinvar_gk_pilot.gcs import (
    _local_file_path_for,
    download_to_local_file,
    already_downloaded,
)

# NDJSON exports of the ClinVar GKS pilot data on GCS.
# Records of the catvar file are CategoricalVariation objects; records of
# the SCV file are VariationPathogenicity (a sub-class of Statement).
catvar_blob_uri = (
    "gs://clinvar-gk-pilot/2024-04-07/dev/combined-catvar_output.ndjson.gz"
)
scv_blob_uri = "gs://clinvar-gk-pilot/2024-04-07/dev/combined-scv_output.ndjson.gz"


def _ensure_local(blob_uri: str) -> str:
    """Download ``blob_uri`` to its canonical local path if not already
    cached, and return that local path.

    Relies on clinvar_gk_pilot.gcs helpers for the path mapping, the
    cache check, and the download itself.
    """
    local_path = _local_file_path_for(blob_uri)
    if not already_downloaded(blob_uri):
        print(f"Downloading {blob_uri} to {local_path}")
        downloaded_path = download_to_local_file(blob_uri)
        # The download helper should land the file at the canonical path.
        assert downloaded_path == local_path
    return local_path


# Local paths used by the rest of the notebook.
# (The draft also pre-assigned catvar_file to a bare filename literal that
# was immediately overwritten; that dead assignment is removed.)
variation_local_file_path = _ensure_local(catvar_blob_uri)
scv_local_file_path = _ensure_local(scv_blob_uri)

catvar_file = variation_local_file_path
scv_file = scv_local_file_path
################################
# Inputs: pick ONE example variant to search for.
#
# Each example provides the ClinVar Variation ID (for looking up the GKS
# Categorical Variation / CatVar record) and a GA4GH VRS Variation ID (for
# VRS-based lookup).
#
# The draft assigned the CanonicalAllele pair and then unconditionally
# overwrote it with the CategoricalCnv pair — dead assignments. The choice
# is now explicit; the effective values are unchanged.
################################
EXAMPLES = {
    # A CanonicalAllele
    "canonical_allele": {
        "clinvar_id": "563765",
        "vrs_id": "ga4gh:VA.hf_va4AnlG99NuOjtaXJzh_XvszWWOO9",
    },
    # A CategoricalCnv (the VRS ID is the GRCh38 member)
    "categorical_cnv": {
        "clinvar_id": "599353",
        "vrs_id": "ga4gh:CX.5iqyOA4L5njh5FpymTPcwQ8oHTilQFmo",
    },
}
_example = EXAMPLES["categorical_cnv"]  # same effective choice as the draft
clinvar_id = _example["clinvar_id"]
vrs_id = _example["vrs_id"]

catvar_file = "combined-catvar_output.ndjson.gz"
scv_file = "combined-scv_output.ndjson.gz"
################################
assert os.path.exists(catvar_file)
assert os.path.exists(scv_file)
catvar_id = f"clinvar:{clinvar_id}"
Input: ga4gh:CX.5iqyOA4L5njh5FpymTPcwQ8oHTilQFmo", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mConversionException\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[7], line 78\u001b[0m\n\u001b[1;32m 52\u001b[0m \u001b[38;5;66;03m# # Get the actual SCV records\u001b[39;00m\n\u001b[1;32m 53\u001b[0m \u001b[38;5;66;03m# with gzip.open(scv_file, \"rt\") as f:\u001b[39;00m\n\u001b[1;32m 54\u001b[0m \u001b[38;5;66;03m# for line in f:\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 72\u001b[0m \u001b[38;5;66;03m# for row in batch:\u001b[39;00m\n\u001b[1;32m 73\u001b[0m \u001b[38;5;66;03m# scvs.append(row)\u001b[39;00m\n\u001b[1;32m 75\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m scvs\n\u001b[0;32m---> 78\u001b[0m scvs \u001b[38;5;241m=\u001b[39m \u001b[43mquery_scvs_by_vrs_id\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvrs_id\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mscv_file\u001b[49m\u001b[43m)\u001b[49m\n", + "Cell \u001b[0;32mIn[7], line 43\u001b[0m, in \u001b[0;36mquery_scvs_by_vrs_id\u001b[0;34m(vrs_id, scv_file)\u001b[0m\n\u001b[1;32m 41\u001b[0m scv_ids \u001b[38;5;241m=\u001b[39m []\n\u001b[1;32m 42\u001b[0m results \u001b[38;5;241m=\u001b[39m duckdb\u001b[38;5;241m.\u001b[39msql(sql)\n\u001b[0;32m---> 43\u001b[0m \u001b[38;5;28;01mwhile\u001b[39;00m batch \u001b[38;5;241m:=\u001b[39m \u001b[43mresults\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfetchmany\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m100\u001b[39;49m\u001b[43m)\u001b[49m:\n\u001b[1;32m 44\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m row \u001b[38;5;129;01min\u001b[39;00m batch:\n\u001b[1;32m 45\u001b[0m \u001b[38;5;28mid\u001b[39m, defining_context_id, member, rec \u001b[38;5;241m=\u001b[39m row\n", + "\u001b[0;31mConversionException\u001b[0m: Conversion Error: Malformed JSON at byte 0 of input: unexpected character. 
################################
# Query the SCV file for the matching VRS ID using DuckDB as a query
# engine to obtain the list of SCVs we are interested in.
################################


def query_scvs_by_vrs_id(vrs_id: str, scv_file: str) -> list[dict]:
    """Return the full SCV records whose variation references ``vrs_id``.

    Two passes:
      1. DuckDB scans the gzipped NDJSON directly to find matching SCV ids
         (matching either ``variation.definingContext.id`` for
         CanonicalAllele records, or any ``variation.members[*].id`` for
         CategoricalCnv records) without loading the file into Python.
      2. A line scan pulls the full records for those ids as parsed dicts,
         so downstream cells can index fields like ``scv["classification"]``.

    Notes on changes from the draft:
      - The draft built two SQL strings; the first was dead (immediately
        reassigned) and the second, ``json_extract``-based one raised the
        ConversionException captured in the saved output. Only the working
        struct-based query is kept.
      - ``vrs_id`` is passed as a prepared-statement parameter instead of
        being interpolated into the SQL string.
      - The draft returned an always-empty ``scvs`` list because record
        retrieval was commented out; retrieval is now enabled.
      - NOTE(review): ``unnest(variation.members)`` emits zero rows for
        records with no members, so such records match only via
        ``definingContext_id`` — confirm this covers all record shapes.
    """
    # Pass 1: locate matching SCV ids. (The file path cannot be a bound
    # parameter in read_json, so it stays interpolated.)
    sql = f"""
    SELECT id, definingContext_id, member.id AS member_id
    FROM
    (
        SELECT
            id,
            variation.definingContext.id AS definingContext_id,
            unnest(variation.members) AS member
        FROM read_json('{scv_file}', format='newline_delimited', ignore_errors=true)
    )
    WHERE definingContext_id = ?
       OR member_id = ?
    """
    scv_ids: set[str] = set()
    results = duckdb.execute(sql, [vrs_id, vrs_id])
    while batch := results.fetchmany(100):
        for scv_id, defining_context_id, member_id in batch:
            print(
                {
                    "id": scv_id,
                    "definingContext_id": defining_context_id,
                    "member_id": member_id,
                }
            )
            scv_ids.add(scv_id)
    print(f"Found {len(scv_ids)} SCVs for VRS id {vrs_id}")
    print(f"SCV IDs: {sorted(scv_ids)}")

    # Pass 2: fetch the full records. A cheap substring check before
    # json.loads keeps this fast: with a small set of scv_ids, most lines
    # are rejected without being parsed.
    scvs = []
    if scv_ids:
        with gzip.open(scv_file, "rt") as f:
            for line in f:
                if any(scv_id in line for scv_id in scv_ids):
                    record = json.loads(line)
                    if record["id"] in scv_ids:
                        scvs.append(record)
    return scvs


if __name__ == "__main__":
    # Notebook driver: vrs_id / scv_file come from the inputs cell.
    scvs = query_scvs_by_vrs_id(vrs_id, scv_file)
from clinvar_gk_pilot.gcs import (
    _local_file_path_for,
    download_to_local_file,
    already_downloaded,
)

import os
import gzip
import json

################################
# Download the ClinVar GKS pilot NDJSON exports (cached locally).
# Records of the catvar file are CategoricalVariation objects; records of
# the SCV file are VariationPathogenicity (a sub-class of Statement).
################################
catvar_blob_uri = (
    "gs://clinvar-gk-pilot/2024-04-07/dev/combined-catvar_output.ndjson.gz"
)
scv_blob_uri = "gs://clinvar-gk-pilot/2024-04-07/dev/combined-scv_output.ndjson.gz"

for _blob_uri in (catvar_blob_uri, scv_blob_uri):
    if not already_downloaded(_blob_uri):
        print(f"Downloading {_blob_uri} to {_local_file_path_for(_blob_uri)}")
        _downloaded = download_to_local_file(_blob_uri)
        # The download helper should land the file at the canonical path.
        assert _downloaded == _local_file_path_for(_blob_uri)

variation_local_file_path = _local_file_path_for(catvar_blob_uri)
scv_local_file_path = _local_file_path_for(scv_blob_uri)

################################
# Inputs: pick ONE example variant to search for.
# (The draft assigned the CanonicalAllele pair and then silently overwrote
# it with the CategoricalCnv pair; the choice is now explicit and the
# effective values are unchanged.)
################################
# A CanonicalAllele example:
# clinvar_id = "563765"
# vrs_id = "ga4gh:VA.hf_va4AnlG99NuOjtaXJzh_XvszWWOO9"

# A CategoricalCnv example (the VRS ID is the GRCh38 member) — active:
clinvar_id = "599353"
vrs_id = "ga4gh:CX.5iqyOA4L5njh5FpymTPcwQ8oHTilQFmo"

catvar_file = "combined-catvar_output.ndjson.gz"
scv_file = "combined-scv_output.ndjson.gz"
################################
assert os.path.exists(catvar_file)
os.path.exists(scv_file)\n", + "catvar_id = f\"clinvar:{clinvar_id}\"" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "################################\n", + "# Query the SCV file for the matching VRS ID\n", + "################################\n", + "\n", + "\n", + "def query_scvs_by_vrs_id(vrs_id: str, scv_file: str):\n", + " scvs = []\n", + " catvars = []\n", + " with gzip.open(scv_file, \"rt\") as f:\n", + " for line in f:\n", + " record = json.loads(line)\n", + " variation = record[\"variation\"]\n", + " processing_errors = [\n", + " e\n", + " for e in variation.get(\"extensions\", [])\n", + " if e[\"name\"] == \"vrs processing errors\"\n", + " ]\n", + " if len(processing_errors) > 0:\n", + " # print(f\"Skipping SCV record with VRS processing errors: {line}\")\n", + " continue\n", + "\n", + " match variation[\"type\"]:\n", + " case \"CategoricalCnv\":\n", + " if \"members\" not in variation:\n", + " # Unsupported?\n", + " # e.g. \"clinvar:1878325\"\n", + " # \"NC_000018.9:g.(48556994_48573289)_48573471dup\"\n", + " # raise ValueError(f\"CategoricalCnv missing members field: {line}\")\n", + " continue\n", + " members = variation[\"members\"]\n", + " member_vrs_ids = [m[\"id\"] for m in members]\n", + " if vrs_id in member_vrs_ids:\n", + " scvs.append(record)\n", + "\n", + " case \"CanonicalAllele\":\n", + " if \"definingContext\" not in variation:\n", + " # Unsupported allele type?\n", + " # e.g. 
clinvar:215984\n", + " # \"NM_000251.2(MSH2):c.212-?_366+?dup\"\n", + " # raise ValueError(f\"CanonicalAllele missing definingContext field: {line}\")\n", + " continue\n", + " if variation[\"definingContext\"][\"id\"] == vrs_id:\n", + " scvs.append(record)\n", + " case \"DescribedVariation\":\n", + " # not an error in processing, but does not have any VRS IDs\n", + " continue\n", + " # raise ValueError(f\"DescribedVariation not yet implemented: {line}\")\n", + " case _:\n", + " raise ValueError(f\"Unexpected variation type: {variation['type']}\")\n", + " return scvs\n", + "\n", + "\n", + "scvs = query_scvs_by_vrs_id(vrs_id, scv_file)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "SCV: SCV000864190.1 \n", + " Classification: Likely pathogenic\n", + " Condition: Squalene synthase deficiency\n", + "\n" + ] + } + ], + "source": [ + "for scv in scvs:\n", + " classification = scv[\"classification\"][\"label\"]\n", + " condition = scv[\"condition\"][\"label\"]\n", + " print(f\"SCV: {scv['id']} \")\n", + " print(f\" Classification: {classification}\")\n", + " print(f\" Condition: {condition}\")\n", + " print()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "################################\n", + "# load the SCV ndjson into a sqlite database with a few indexes on the scv, catvar, and vrs IDs\n", + "################################\n", + "# # sqlite schema:\n", + "# # scv_id: str (index)\n", + "# # catvar_id: str (index)\n", + "# # vrs_id: str (index) # Can have multiple records if a record has multiple defining VRS IDs\n", + "# # value: the blob of the SCV record gziped and as bytes\n", + "\n", + "# import sqlite3\n", + "\n", + "# db_name = \"clinvar.db\"\n", + "# conn = sqlite3.connect(db_name)\n", + "\n", + "# # Create the schema and the indexes on columns\n", + "# c = conn.cursor()\n", + "# 
################################
# (Sketch, intentionally commented out / not executed.)
# Load the SCV NDJSON into a SQLite database with indexes on the scv,
# catvar, and VRS ids for fast point lookups.
#
# Schema:
#   scv_id    TEXT (indexed)
#   catvar_id TEXT (indexed)
#   vrs_id    TEXT (indexed; one row per VRS id a record references)
#   value     BLOB (the SCV record, gzipped)
################################
# import sqlite3
#
# conn = sqlite3.connect("clinvar.db")
# c = conn.cursor()
# # NOTE(review): the draft DDL had a trailing comma after "value BLOB",
# # which is invalid SQLite syntax; removed here.
# c.execute(
#     """
#     CREATE TABLE IF NOT EXISTS scv (
#         scv_id TEXT,
#         catvar_id TEXT,
#         vrs_id TEXT,
#         value BLOB
#     )
#     """
# )
# c.execute("CREATE INDEX IF NOT EXISTS scv_id_index ON scv (scv_id)")
# c.execute("CREATE INDEX IF NOT EXISTS catvar_id_index ON scv (catvar_id)")
# c.execute("CREATE INDEX IF NOT EXISTS vrs_id_index ON scv (vrs_id)")
# # Uniqueness constraint so (scv, catvar, vrs) triples cannot duplicate.
# c.execute(
#     "CREATE UNIQUE INDEX IF NOT EXISTS scv_catvar_vrs_id_index"
#     " ON scv (scv_id, catvar_id, vrs_id)"
# )
# conn.commit()
#
# # TODO(review): the draft's load loop below was copy-pasted from the
# # query function — it filtered records by vrs_id and appended matches to
# # `scvs` instead of INSERTing one (scv_id, catvar_id, vrs_id, value) row
# # per referenced VRS id. The intended loop is:
# #
# # with gzip.open(scv_file, "rt") as f:
# #     for line in f:
# #         record = json.loads(line)
# #         variation = record["variation"]
# #         if any(e.get("name") == "vrs processing errors"
# #                for e in variation.get("extensions", [])):
# #             continue  # no usable VRS ids
# #         # Collect VRS ids: definingContext.id for CanonicalAllele,
# #         # members[*].id for CategoricalCnv; DescribedVariation has none.
# #         # Then, for each vrs_id collected:
# #         # c.execute("INSERT OR IGNORE INTO scv VALUES (?, ?, ?, ?)",
# #         #           (record["id"], variation["id"], vrs_id,
# #         #            gzip.compress(line.encode())))
# --- Querying the NDJSON files with Polars ---
# Polars is similar to pandas, but it is written in Rust and supports
# lazily querying files without loading all contents into memory.
#
# To read a whole NDJSON file eagerly, use `polars.read_ndjson`. To read
# it lazily — only materializing the fields of rows that match applied
# filters — use `polars.scan_ndjson`, as below.
# (The draft markdown said `pandas.read_ndjson` / `pandas.scan_ndjson`;
# these are Polars APIs, not pandas.)

import polars as pl

# Only the fields we filter/select on need to be declared.
schema = pl.Schema({"id": pl.Utf8, "definingContext": pl.Struct({"id": pl.Utf8})})

# Eager load of the whole file into memory.
# NOT RECOMMENDED unless you have a very large amount of memory: costs the
# data itself plus Python/Polars object overhead.
# with gzip.open("combined-catvar_output.ndjson.gz", "rb") as f:
#     catvar_df = pl.read_ndjson(f, schema=schema)

# Lazy scan — recommended for large files.
# NOTE(review): scan_ndjson here reads the uncompressed file.
catvar_file = "combined-catvar_output.ndjson"
catvar_df = pl.scan_ndjson(
    catvar_file,
    schema=schema,
)

# Filter on the CatVar ID and materialize the matching rows.
# (Fix: the draft called `df.filter(...)`, but no `df` was ever defined —
# the lazy frame is named `catvar_df`.)
catvar_df.filter(pl.col("id") == "clinvar:209226").collect()

# The SCV file can be scanned the same way (schema inferred).
scv_file = "combined-scv_output.ndjson"
scv_df = pl.scan_ndjson(scv_file)
scv_df.head().collect()
'NC_000017.11:g.43041893A>T'}, {'syntax': 'gnomad', 'value': '17-43041893-A-T'}], 'extensions': [{'name': 'clinvar vcf', 'value': '\"17-43041893-A-T\"'}, {'name': 'clinvar hgvs type', 'value': '\"genomic, top-level\"'}], 'id': 'ga4gh:VA.iTe7Ew7If4vdPnhfcT2qXQQY-tNu7tcF', 'label': 'NC_000017.11:43041892:A:T', 'location': {'digest': 'zdH_x64Ewl4jnJSjPQrb9ENIPb8GZOG-', 'end': '43041893', 'id': 'ga4gh:SL.zdH_x64Ewl4jnJSjPQrb9ENIPb8GZOG-', 'sequenceReference': {'extensions': [{'name': 'assembly', 'value': 'GRCh38'}, {'name': 'chromosome', 'value': '17'}], 'id': 'NC_000017.11', 'refgetAccession': 'SQ.dLZ15tNO1Ur0IcGjwc3Sdi_0A6Yf4zm7', 'residueAlphabet': 'na', 'type': 'SequenceReference'}, 'start': '43041892', 'type': 'SequenceLocation'}, 'state': '{\"sequence\":\"T\",\"type\":\"LiteralSequenceExpression\"}', 'type': 'Allele', 'copies': None, 'copyChange': None}, 'classification': {'code': 'cg000001', 'label': 'Benign', 'system': 'https://dataexchange.clinicalgenome.org/codes/'}, 'condition': {'id': 'clinvarTrait:4711', 'label': 'Breast-ovarian cancer, familial, susceptibility to, 1', 'mappings': [{'coding': {'code': 'C2676676', 'system': 'https://www.ncbi.nlm.nih.gov/medgen/'}, 'relation': 'exactMatch'}], 'type': 'Disease', 'extensions': None, 'traits': None}, 'contributions': [{'activity': {'code': 'CRO_0000105', 'label': 'submitter role', 'system': 'http://purl.obolibrary.org/obo/'}, 'agent': {'id': 'clinvar.submitter:504863', 'label': 'Evidence-based Network for the Interpretation of Germline Mutant Alleles (ENIGMA)', 'type': 'Agent'}, 'date': datetime.date(2015, 9, 29), 'label': 'Last Updated', 'type': 'Contribution'}, {'activity': {'code': 'CRO_0000105', 'label': 'submitter role', 'system': 'http://purl.obolibrary.org/obo/'}, 'agent': {'id': 'clinvar.submitter:504863', 'label': 'Evidence-based Network for the Interpretation of Germline Mutant Alleles (ENIGMA)', 'type': 'Agent'}, 'date': datetime.date(2015, 9, 29), 'label': 'First in ClinVar', 'type': 
'Contribution'}, {'activity': {'code': 'CRO_0000001', 'label': 'author role', 'system': 'http://purl.obolibrary.org/obo/'}, 'agent': {'id': 'clinvar.submitter:504863', 'label': 'Evidence-based Network for the Interpretation of Germline Mutant Alleles (ENIGMA)', 'type': 'Agent'}, 'date': datetime.date(2015, 1, 12), 'label': 'Last Evaluated', 'type': 'Contribution'}], 'direction': 'refutes', 'extensions': [{'name': 'localKey', 'value': 'NM_007294.3:c.*3785T>A|OMIM:604370'}, {'name': 'clinvarMethodCategory', 'value': 'curation'}, {'name': 'submittedClassification', 'value': 'Benign'}, {'name': 'clinvarReviewStatus', 'value': 'reviewed by expert panel'}], 'id_1': 'SCV000244507.1', 'method': {'isReportedIn': {'pmid': None, 'type': 'Document', 'url': 'https://submit.ncbi.nlm.nih.gov/ft/byid/c2ffjci6/enigma_rules_2015-03-26.pdf', 'doi': None}, 'label': 'ENIGMA BRCA1/2 Classification Criteria (2015)', 'type': 'Method'}, 'predicate': 'isCausalFor', 'scv_id': 'SCV000244507', 'scv_ver': 1, 'strength': {'code': 'cg000101', 'label': 'definitive', 'system': 'https://dataexchange.clinicalgenome.org/codes/'}, 'type': 'VariationPathogenicity', 'variation': {'definingContext': {'digest': 'iTe7Ew7If4vdPnhfcT2qXQQY-tNu7tcF', 'expressions': [{'syntax': 'spdi', 'value': 'NC_000017.11:43041892:A:T'}, {'syntax': 'hgvs.g', 'value': 'NC_000017.11:g.43041893A>T'}, {'syntax': 'gnomad', 'value': '17-43041893-A-T'}], 'extensions': [{'name': 'clinvar vcf', 'value': '17-43041893-A-T'}, {'name': 'clinvar hgvs type', 'value': 'genomic, top-level'}], 'id': 'ga4gh:VA.iTe7Ew7If4vdPnhfcT2qXQQY-tNu7tcF', 'label': 'NC_000017.11:43041892:A:T', 'location': {'digest': 'zdH_x64Ewl4jnJSjPQrb9ENIPb8GZOG-', 'end': 43041893, 'id': 'ga4gh:SL.zdH_x64Ewl4jnJSjPQrb9ENIPb8GZOG-', 'sequenceReference': {'extensions': [{'name': 'assembly', 'value': 'GRCh38'}, {'name': 'chromosome', 'value': '17'}], 'id': 'NC_000017.11', 'refgetAccession': 'SQ.dLZ15tNO1Ur0IcGjwc3Sdi_0A6Yf4zm7', 'residueAlphabet': 'na', 'type': 
'SequenceReference'}, 'start': 43041892, 'type': 'SequenceLocation'}, 'state': {'sequence': 'T', 'type': 'LiteralSequenceExpression', 'length': None, 'repeatSubunitLength': None}, 'type': 'Allele'}, 'extensions': [{'name': 'cytogenetic location', 'value': '17q21.31'}, {'name': 'clinvar variation type', 'value': 'single nucleotide variant'}, {'name': 'clinvar subclass type', 'value': 'SimpleAllele'}], 'id': 'clinvar:209226', 'label': 'NM_007294.3(BRCA1):c.*3785T>A', 'members': [{'digest': 'iTe7Ew7If4vdPnhfcT2qXQQY-tNu7tcF', 'expressions': [{'syntax': 'spdi', 'value': 'NC_000017.11:43041892:A:T'}, {'syntax': 'hgvs.g', 'value': 'NC_000017.11:g.43041893A>T'}, {'syntax': 'gnomad', 'value': '17-43041893-A-T'}], 'extensions': [{'name': 'clinvar vcf', 'value': '\"17-43041893-A-T\"'}, {'name': 'clinvar hgvs type', 'value': '\"genomic, top-level\"'}], 'id': 'ga4gh:VA.iTe7Ew7If4vdPnhfcT2qXQQY-tNu7tcF', 'label': 'NC_000017.11:43041892:A:T', 'location': {'digest': 'zdH_x64Ewl4jnJSjPQrb9ENIPb8GZOG-', 'end': '43041893', 'id': 'ga4gh:SL.zdH_x64Ewl4jnJSjPQrb9ENIPb8GZOG-', 'sequenceReference': {'extensions': [{'name': 'assembly', 'value': 'GRCh38'}, {'name': 'chromosome', 'value': '17'}], 'id': 'NC_000017.11', 'refgetAccession': 'SQ.dLZ15tNO1Ur0IcGjwc3Sdi_0A6Yf4zm7', 'residueAlphabet': 'na', 'type': 'SequenceReference'}, 'start': '43041892', 'type': 'SequenceLocation'}, 'state': '{\"sequence\":\"T\",\"type\":\"LiteralSequenceExpression\"}', 'type': 'Allele', 'copies': None, 'copyChange': None}, {'digest': None, 'expressions': [{'syntax': 'hgvs.g', 'value': 'NC_000017.10:g.41193910A>T'}, {'syntax': 'gnomad', 'value': '17-41193910-A-T'}], 'extensions': [{'name': 'clinvar vcf', 'value': '\"17-41193910-A-T\"'}, {'name': 'clinvar hgvs type', 'value': '\"genomic, top-level\"'}], 'id': 'clinvar:209226-NC_000017.10', 'label': 'NC_000017.10:g.41193910A>T', 'location': {'digest': None, 'end': '41193910', 'id': None, 'sequenceReference': {'extensions': [{'name': 'assembly', 'value': 'GRCh37'}, 
{'name': 'chromosome', 'value': '17'}], 'id': 'NC_000017.10', 'refgetAccession': None, 'residueAlphabet': 'na', 'type': 'SequenceReference'}, 'start': '41193910', 'type': 'SequenceLocation'}, 'state': None, 'type': 'Allele', 'copies': None, 'copyChange': None}, {'digest': None, 'expressions': [{'syntax': 'hgvs.g', 'value': 'NG_005905.2:g.176091T>A'}], 'extensions': [{'name': 'clinvar hgvs type', 'value': '\"genomic\"'}], 'id': 'clinvar:209226-NG_005905.2', 'label': 'NG_005905.2:g.176091T>A', 'location': {'digest': None, 'end': None, 'id': None, 'sequenceReference': {'extensions': None, 'id': 'NG_005905.2', 'refgetAccession': None, 'residueAlphabet': 'na', 'type': 'SequenceReference'}, 'start': None, 'type': 'SequenceLocation'}, 'state': None, 'type': 'Allele', 'copies': None, 'copyChange': None}, {'digest': None, 'expressions': [{'syntax': 'hgvs.g', 'value': 'LRG_292:g.176091T>A'}], 'extensions': [{'name': 'clinvar hgvs type', 'value': '\"genomic\"'}, {'name': 'pre-processing vrs issue', 'value': '\"sequence for accession not supported by vrs-python release\"'}], 'id': 'clinvar:209226-LRG_292', 'label': 'LRG_292:g.176091T>A', 'location': {'digest': None, 'end': None, 'id': None, 'sequenceReference': {'extensions': None, 'id': 'LRG_292', 'refgetAccession': None, 'residueAlphabet': 'na', 'type': 'SequenceReference'}, 'start': None, 'type': 'SequenceLocation'}, 'state': None, 'type': 'Allele', 'copies': None, 'copyChange': None}, {'digest': None, 'expressions': [{'syntax': 'hgvs.c', 'value': 'LRG_292t1:c.*3785T>A'}], 'extensions': [{'name': 'pre-processing vrs issue', 'value': '\"sequence for accession not supported by vrs-python release\"'}, {'name': 'clinvar hgvs type', 'value': '\"coding\"'}], 'id': 'clinvar:209226-LRG_292t1', 'label': 'LRG_292t1:c.*3785T>A', 'location': {'digest': None, 'end': None, 'id': None, 'sequenceReference': {'extensions': None, 'id': 'LRG_292t1', 'refgetAccession': None, 'residueAlphabet': 'na', 'type': 'SequenceReference'}, 'start': None, 
# Query the SCV NDJSON file with DuckDB: UNNEST each record's variation.members
# and keep rows whose member VRS id matches the target allele.
import gzip
import duckdb

scv_file = "combined-scv_output.ndjson.gz"
vrs_id = "ga4gh:VA.iTe7Ew7If4vdPnhfcT2qXQQY-tNu7tcF"

# Use the scv_file variable in the SQL (the original duplicated the hard-coded
# path inside read_json). vrs_id is a trusted constant here; for user-supplied
# values prefer DuckDB parameter binding over f-string interpolation.
query = f"""
SELECT id, member, a
FROM
    (SELECT id, UNNEST(variation.members) AS member, s.*
     FROM read_json('{scv_file}', format='newline_delimited', ignore_errors=true) s
    ) a
WHERE member.id = '{vrs_id}'
LIMIT 10
"""

results = duckdb.query(query)

# Stream results in batches, accumulating the SCV ids as we go.
scv_ids = []
while batch := results.fetchmany(100):
    for row in batch:
        print(row)
        scv_ids.append(row[0])
# BUG FIX: the original re-assigned `scv_ids = [row[0] for row in batch]` after
# the loop; `batch` is empty (falsy) when the while-loop exits, so that line
# clobbered the accumulated list with [].
database\n", + "# conn = psycopg2.connect(\n", + "# host=\"localhost\", dbname=\"postgres\", user=\"postgres\", password=\"mysecretpassword\"\n", + "# )\n", + "# conn.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT)\n", + "\n", + "# # Create a new database\n", + "# cur = conn.cursor()\n", + "# # cur.execute(\"DROP DATABASE IF EXISTS clinvar\")\n", + "# db_name = \"clinvar\"\n", + "# cur.execute(\"SELECT 1 FROM pg_catalog.pg_database WHERE datname = %s\", (db_name,))\n", + "# db_exists = cur.fetchone()\n", + "# if not db_exists:\n", + "# cur.execute(\"CREATE DATABASE IF NOT EXISTS clinvar\")\n", + "# cur.close()\n", + "# conn.close()\n", + "\n", + "# # with psycopg2.connect(\n", + "# # host=\"localhost\", dbname=\"postgres\", user=\"postgres\", password=\"mysecretpassword\"\n", + "# # ) as conn:\n", + "# # conn.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT)\n", + "# # cur = conn.cursor()\n", + "# # cur.execute(\"CREATE DATABASE clinvar\")\n", + "\n", + "\n", + "# with psycopg2.connect(\n", + "# host=\"localhost\", dbname=\"clinvar\", user=\"postgres\", password=\"mysecretpassword\"\n", + "# ) as conn:\n", + "# cur = conn.cursor()\n", + "# cur.execute(\"DROP TABLE IF EXISTS scv\")\n", + "# cur.execute(\n", + "# \"\"\"\n", + "# CREATE TABLE IF NOT EXISTS scv (\n", + "# id TEXT PRIMARY KEY,\n", + "# value JSONB\n", + "# )\n", + "# \"\"\"\n", + "# )\n", + "# cur.execute(\n", + "# # Create a GIN index on value.definingContext.id\n", + "# \"\"\"\n", + "# CREATE INDEX IF NOT EXISTS scv_definingContext_id_index ON scv ((value->'definingContext'->>'id'))\n", + "# \"\"\"\n", + "# )\n", + "\n", + "# import time\n", + "\n", + "# start = time.time()\n", + "# last_printed = start\n", + "\n", + "\n", + "# def file_batch_lines(fileio, batch_size=10000):\n", + "# \"\"\"\n", + "# Generator function to read a file and yield batches of lines.\n", + "\n", + "# :param file_path: Path to the file to be read.\n", + "# :param batch_size: Number of lines per batch.\n", + "# \"\"\"\n", + "# 
batch = []\n", + "# for line in fileio:\n", + "# batch.append(line)\n", + "# if len(batch) == batch_size:\n", + "# yield batch\n", + "# batch = []\n", + "# if batch:\n", + "# yield batch\n", + "\n", + "\n", + "# with psycopg2.connect(\n", + "# host=\"localhost\", dbname=\"clinvar\", user=\"postgres\", password=\"mysecretpassword\"\n", + "# ) as conn:\n", + "# cur = conn.cursor()\n", + "# with gzip.open(scv_file, \"rt\") as f:\n", + "# idx = 0\n", + "# for line_batch in file_batch_lines(f, batch_size=2000):\n", + "# # mega SQL string strategy\n", + "# records = [json.loads(line) for line in line_batch]\n", + "# query_base = \"INSERT INTO scv (id, value) VALUES \"\n", + "# values_templ = \",\".join([\"(%s, %s)\"] * len(records))\n", + "# values = [(r[\"id\"], line) for r, line in zip(records, line_batch)]\n", + "# # flatten values by 1 level\n", + "# values = [v for sublist in values for v in sublist]\n", + "# query = query_base + values_templ\n", + "# cur.execute(query, values)\n", + "\n", + "# # executemany strategy\n", + "# # cur.executemany(\n", + "# # \"INSERT INTO scv (id, value) VALUES (%s, %s)\",\n", + "# # [(r[\"id\"], line) for r, line in zip(records, line_batch)],\n", + "# # )\n", + "\n", + "# # record = json.loads(line)\n", + "# # cur.execute(\n", + "# # \"INSERT INTO scv (id, value) VALUES (%s, %s)\",\n", + "# # (record[\"id\"], json.dumps(record)),\n", + "# # )\n", + "# idx += len(records)\n", + "# if time.time() - last_printed > 10:\n", + "# print(f\"Inserted {idx} records\")\n", + "# last_printed = time.time()\n", + "# cur.execute(\"COMMIT\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + 
"nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.9" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/notebooks/ASHG-clinvar-GKS_DRAFT.ipynb b/notebooks/ASHG-clinvar-GKS_DRAFT.ipynb new file mode 100644 index 0000000..d0e4ab5 --- /dev/null +++ b/notebooks/ASHG-clinvar-GKS_DRAFT.ipynb @@ -0,0 +1,736 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "from clinvar_gk_pilot.gcs import (\n", + " _local_file_path_for,\n", + " download_to_local_file,\n", + " already_downloaded,\n", + ")\n", + "\n", + "# variation_blob_uri = \"gs://clingen-public/clinvar-gk-pilot/2024-04-07/dev/clinvar-variation-20240407.json.gz\"\n", + "# scv_blob_uri = (\n", + "# \"gs://clingen-public/clinvar-gk-pilot/2024-04-07/dev/clinvar-scvs-20240407.json.gz\"\n", + "# )\n", + "\n", + "catvar_file = \"combined-catvar_output.ndjson.gz\"\n", + "\n", + "\n", + "# variation_local_file_path = _local_file_path_for(variation_blob_uri)\n", + "# if not already_downloaded(variation_blob_uri):\n", + "# dl_variation_local_file_path = download_to_local_file(variation_blob_uri)\n", + "# assert dl_variation_local_file_path == variation_local_file_path\n", + "\n", + "# scv_local_file_path = _local_file_path_for(scv_blob_uri)\n", + "# if not already_downloaded(scv_blob_uri):\n", + "# dl_scv_local_file_path = download_to_local_file(scv_blob_uri)\n", + "# assert dl_scv_local_file_path == scv_local_file_path" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Our ClinVar datasets are available as NDJSON files. There is a variation file and an SCV file. 
The records of the variation file are CategoricalVariation objects, and the records of the SCV file are `VariationPathogenicity` (sub-class of `Statement`)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'definingContext': {'digest': 'iTe7Ew7If4vdPnhfcT2qXQQY-tNu7tcF', 'expressions': [{'syntax': 'spdi', 'value': 'NC_000017.11:43041892:A:T'}, {'syntax': 'hgvs.g', 'value': 'NC_000017.11:g.43041893A>T'}, {'syntax': 'gnomad', 'value': '17-43041893-A-T'}], 'extensions': [{'name': 'clinvar vcf', 'value': '17-43041893-A-T'}, {'name': 'clinvar hgvs type', 'value': 'genomic, top-level'}], 'id': 'ga4gh:VA.iTe7Ew7If4vdPnhfcT2qXQQY-tNu7tcF', 'label': 'NC_000017.11:43041892:A:T', 'location': {'digest': 'zdH_x64Ewl4jnJSjPQrb9ENIPb8GZOG-', 'end': 43041893, 'id': 'ga4gh:SL.zdH_x64Ewl4jnJSjPQrb9ENIPb8GZOG-', 'sequenceReference': {'extensions': [{'name': 'assembly', 'value': 'GRCh38'}, {'name': 'chromosome', 'value': '17'}], 'id': 'NC_000017.11', 'refgetAccession': 'SQ.dLZ15tNO1Ur0IcGjwc3Sdi_0A6Yf4zm7', 'residueAlphabet': 'na', 'type': 'SequenceReference'}, 'start': 43041892, 'type': 'SequenceLocation'}, 'state': {'sequence': 'T', 'type': 'LiteralSequenceExpression'}, 'type': 'Allele'}, 'extensions': [{'name': 'cytogenetic location', 'value': '17q21.31'}, {'name': 'clinvar variation type', 'value': 'single nucleotide variant'}, {'name': 'clinvar subclass type', 'value': 'SimpleAllele'}], 'id': 'clinvar:209226', 'label': 'NM_007294.3(BRCA1):c.*3785T>A', 'members': [{'digest': 'iTe7Ew7If4vdPnhfcT2qXQQY-tNu7tcF', 'expressions': [{'syntax': 'spdi', 'value': 'NC_000017.11:43041892:A:T'}, {'syntax': 'hgvs.g', 'value': 'NC_000017.11:g.43041893A>T'}, {'syntax': 'gnomad', 'value': '17-43041893-A-T'}], 'extensions': [{'name': 'clinvar vcf', 'value': '17-43041893-A-T'}, {'name': 'clinvar hgvs type', 'value': 'genomic, top-level'}], 'id': 'ga4gh:VA.iTe7Ew7If4vdPnhfcT2qXQQY-tNu7tcF', 
'label': 'NC_000017.11:43041892:A:T', 'location': {'digest': 'zdH_x64Ewl4jnJSjPQrb9ENIPb8GZOG-', 'end': 43041893, 'id': 'ga4gh:SL.zdH_x64Ewl4jnJSjPQrb9ENIPb8GZOG-', 'sequenceReference': {'extensions': [{'name': 'assembly', 'value': 'GRCh38'}, {'name': 'chromosome', 'value': '17'}], 'id': 'NC_000017.11', 'refgetAccession': 'SQ.dLZ15tNO1Ur0IcGjwc3Sdi_0A6Yf4zm7', 'residueAlphabet': 'na', 'type': 'SequenceReference'}, 'start': 43041892, 'type': 'SequenceLocation'}, 'state': {'sequence': 'T', 'type': 'LiteralSequenceExpression'}, 'type': 'Allele'}, {'expressions': [{'syntax': 'hgvs.g', 'value': 'NC_000017.10:g.41193910A>T'}, {'syntax': 'gnomad', 'value': '17-41193910-A-T'}], 'extensions': [{'name': 'clinvar vcf', 'value': '17-41193910-A-T'}, {'name': 'clinvar hgvs type', 'value': 'genomic, top-level'}], 'id': 'clinvar:209226-NC_000017.10', 'label': 'NC_000017.10:g.41193910A>T', 'location': {'end': 41193910, 'sequenceReference': {'extensions': [{'name': 'assembly', 'value': 'GRCh37'}, {'name': 'chromosome', 'value': '17'}], 'id': 'NC_000017.10', 'residueAlphabet': 'na', 'type': 'SequenceReference'}, 'start': 41193910, 'type': 'SequenceLocation'}, 'type': 'Allele'}, {'expressions': [{'syntax': 'hgvs.g', 'value': 'NG_005905.2:g.176091T>A'}], 'extensions': [{'name': 'clinvar hgvs type', 'value': 'genomic'}], 'id': 'clinvar:209226-NG_005905.2', 'label': 'NG_005905.2:g.176091T>A', 'location': {'sequenceReference': {'id': 'NG_005905.2', 'residueAlphabet': 'na', 'type': 'SequenceReference'}, 'type': 'SequenceLocation'}, 'type': 'Allele'}, {'expressions': [{'syntax': 'hgvs.g', 'value': 'LRG_292:g.176091T>A'}], 'extensions': [{'name': 'clinvar hgvs type', 'value': 'genomic'}, {'name': 'pre-processing vrs issue', 'value': 'sequence for accession not supported by vrs-python release'}], 'id': 'clinvar:209226-LRG_292', 'label': 'LRG_292:g.176091T>A', 'location': {'sequenceReference': {'id': 'LRG_292', 'residueAlphabet': 'na', 'type': 'SequenceReference'}, 'type': 'SequenceLocation'}, 
'type': 'Allele'}, {'expressions': [{'syntax': 'hgvs.c', 'value': 'LRG_292t1:c.*3785T>A'}], 'extensions': [{'name': 'pre-processing vrs issue', 'value': 'sequence for accession not supported by vrs-python release'}, {'name': 'clinvar hgvs type', 'value': 'coding'}], 'id': 'clinvar:209226-LRG_292t1', 'label': 'LRG_292t1:c.*3785T>A', 'location': {'sequenceReference': {'id': 'LRG_292t1', 'residueAlphabet': 'na', 'type': 'SequenceReference'}, 'type': 'SequenceLocation'}, 'type': 'Allele'}], 'type': 'CanonicalAllele'}\n", + "VRS variant ID: ga4gh:VA.iTe7Ew7If4vdPnhfcT2qXQQY-tNu7tcF\n" + ] + } + ], + "source": [ + "import os\n", + "import gzip\n", + "import json\n", + "\n", + "################################\n", + "# Query the files using vanilla Python\n", + "#\n", + "# - for a given ClinVar Variation ID, find the corresponding GA4GH CatVar record in the CatVar\n", + "# file and find the SCVs which reference that variant in the SCV file\n", + "#\n", + "# (NOTE: the SCV file also contains the full CatVar definition in the \"variation\" field, but\n", + "# this example illustrates how to query across both files, since the SCV file can be\n", + "# relationally normalized to extract that redundant entity and refer to the variant\n", + "# by the CatVar ID as a foreign key)\n", + "#\n", + "# - print the SCV interpretations for that variant\n", + "#\n", + "################################\n", + "################################\n", + "# Inputs\n", + "clinvar_id = \"209226\"\n", + "catvar_file = \"combined-catvar_output.ndjson.gz\"\n", + "scv_file = \"combined-scv_output.ndjson.gz\"\n", + "################################\n", + "assert os.path.exists(catvar_file)\n", + "assert os.path.exists(scv_file)\n", + "catvar_id = f\"clinvar:{clinvar_id}\"\n", + "catvar = None\n", + "\n", + "with gzip.open(catvar_file, \"rt\") as f:\n", + " for line in f:\n", + " record = json.loads(line)\n", + " if record[\"id\"] == catvar_id:\n", + " print(record)\n", + " catvar = record\n", + " 
break\n", + "\n", + "assert catvar is not None\n", + "\n", + "vrs_id = catvar[\"definingContext\"][\"id\"]\n", + "print(f\"VRS variant ID: {vrs_id}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [], + "source": [ + "################################\n", + "# Query the SCV file for the matching variant\n", + "################################\n", + "scvs = []\n", + "with gzip.open(scv_file, \"rt\") as f:\n", + " for line in f:\n", + " record = json.loads(line)\n", + "\n", + " variation = record[\"variation\"]\n", + " processing_errors = [\n", + " e\n", + " for e in variation.get(\"extensions\", [])\n", + " if e[\"name\"] == \"vrs processing errors\"\n", + " ]\n", + " if len(processing_errors) > 0:\n", + " # print(\n", + " # f\"Skipping SCV record with VRS processing errors: {json.dumps(record)}\"\n", + " # )\n", + " continue\n", + "\n", + " match variation[\"type\"]:\n", + " case \"CategoricalCnv\":\n", + " if \"members\" not in variation:\n", + " # Unsupported?\n", + " # e.g. \"clinvar:1878325\"\n", + " # \"NC_000018.9:g.(48556994_48573289)_48573471dup\"\n", + " # raise ValueError(\n", + " # f\"CategoricalCnv missing members field: {json.dumps(record)}\"\n", + " # )\n", + " continue\n", + " members = variation[\"members\"]\n", + " member_vrs_ids = [m[\"id\"] for m in members]\n", + " if vrs_id in member_vrs_ids:\n", + " scvs.append(record)\n", + "\n", + " case \"CanonicalAllele\":\n", + " if \"definingContext\" not in variation:\n", + " # Unsupported allele type?\n", + " # e.g. 
clinvar:215984\n", + " # \"NM_000251.2(MSH2):c.212-?_366+?dup\"\n", + " # raise ValueError(\n", + " # f\"CanonicalAllele missing definingContext field: {json.dumps(record)}\"\n", + " # )\n", + " continue\n", + " if variation[\"definingContext\"][\"id\"] == vrs_id:\n", + " scvs.append(record)\n", + " case \"DescribedVariation\":\n", + " # do not have any VRS IDs\n", + " continue\n", + " # raise ValueError(\n", + " # f\"DescribedVariation not yet implemented: {json.dumps(record)}\"\n", + " # )\n", + " case _:\n", + " raise ValueError(f\"Unexpected variation type: {variation['type']}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "SCV: SCV000244507.1 \n", + " Classification: Benign\n", + " Condition: Breast-ovarian cancer, familial, susceptibility to, 1\n", + "\n" + ] + } + ], + "source": [ + "for scv in scvs:\n", + " classification = scv[\"classification\"][\"label\"]\n", + " condition = scv[\"condition\"][\"label\"]\n", + " print(f\"SCV: {scv['id']} \")\n", + " print(f\" Classification: {classification}\")\n", + " print(f\" Condition: {condition}\")\n", + " print()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# sqlite schema:\n", + "# scv_id: str (index)\n", + "# catvar_id: str (index)\n", + "# vrs_id: str (index) # Can have multiple records if a record has multiple defining VRS IDs\n", + "# value: the blob of the SCV record gziped and as bytes\n", + "\n", + "import sqlite3\n", + "\n", + "db_name = \"clinvar.db\"\n", + "conn = sqlite3.connect(db_name)\n", + "\n", + "# Create the schema and the indexes on columns\n", + "c = conn.cursor()\n", + "c.execute(\n", + " \"\"\"\n", + " CREATE TABLE IF NOT EXISTS scv (\n", + " scv_id TEXT,\n", + " catvar_id TEXT,\n", + " vrs_id TEXT,\n", + " value BLOB,\n", + " )\n", + " \"\"\"\n", + ")\n", + "c.execute(\"CREATE INDEX IF NOT EXISTS 
scv_id_index ON scv (scv_id)\")\n", + "c.execute(\"CREATE INDEX IF NOT EXISTS catvar_id_index ON scv (catvar_id)\")\n", + "c.execute(\"CREATE INDEX IF NOT EXISTS vrs_id_index ON scv (vrs_id)\")\n", + "# Create a uniqueness constraint so these cannot be duplicated\n", + "c.execute(\n", + " \"CREATE UNIQUE INDEX IF NOT EXISTS scv_catvar_vrs_id_index ON scv (scv_id, catvar_id, vrs_id)\"\n", + ")\n", + "conn.commit()\n", + "\n", + "with gzip.open(scv_file, \"rt\") as f:\n", + " for line in f:\n", + " record = json.loads(line)\n", + "\n", + " variation = record[\"variation\"]\n", + " processing_errors = [\n", + " e\n", + " for e in variation.get(\"extensions\", [])\n", + " if e[\"name\"] == \"vrs processing errors\"\n", + " ]\n", + " if len(processing_errors) > 0:\n", + " # print(\n", + " # f\"Skipping SCV record with VRS processing errors: {json.dumps(record)}\"\n", + " # )\n", + " continue\n", + "\n", + " match variation[\"type\"]:\n", + " case \"CategoricalCnv\":\n", + " if \"members\" not in variation:\n", + " # Unsupported?\n", + " # e.g. \"clinvar:1878325\"\n", + " # \"NC_000018.9:g.(48556994_48573289)_48573471dup\"\n", + " # raise ValueError(\n", + " # f\"CategoricalCnv missing members field: {json.dumps(record)}\"\n", + " # )\n", + " continue\n", + " members = variation[\"members\"]\n", + " member_vrs_ids = [m[\"id\"] for m in members]\n", + " if vrs_id in member_vrs_ids:\n", + " scvs.append(record)\n", + "\n", + " case \"CanonicalAllele\":\n", + " if \"definingContext\" not in variation:\n", + " # Unsupported allele type?\n", + " # e.g. 
clinvar:215984\n", + " # \"NM_000251.2(MSH2):c.212-?_366+?dup\"\n", + " # raise ValueError(\n", + " # f\"CanonicalAllele missing definingContext field: {json.dumps(record)}\"\n", + " # )\n", + " continue\n", + " if variation[\"definingContext\"][\"id\"] == vrs_id:\n", + " scvs.append(record)\n", + " case \"DescribedVariation\":\n", + " # do not have any VRS IDs\n", + " continue\n", + " # raise ValueError(\n", + " # f\"DescribedVariation not yet implemented: {json.dumps(record)}\"\n", + " # )\n", + " case _:\n", + " raise ValueError(f\"Unexpected variation type: {variation['type']}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "While these can be read with vanilla python by iterating lines and parsing each as JSON, there are also libraries which can make querying the files simpler and potentially more performant.\n", + "\n", + "One option is DuckDB. DuckDB achieves fast speeds and low memory utilization by memory mapping files and dropping rows from memory that don't match filter criteria. It also has the benefit of being able to query GZIP-compressed NDJSON files directly, so disk storage stays minimal, and presenting a SQL interface to the data, with full support of nested structs so we can access fields from nested JSON objects without manipulating the files. Another benefit is that it gracefully handles heterogeneous record schemas, automatically nulling values that don't exist in particular rows." 
# Look up a CatVar record by id with DuckDB. DuckDB reads the gzipped NDJSON
# directly and exposes nested JSON fields via struct access (definingContext.id).
import duckdb

clinvar_id = "clinvar:209226"

sql = f"""
SELECT id, definingContext.id as vrs_id
FROM '{catvar_file}'
WHERE id = '{clinvar_id}'
LIMIT 10
"""

results = duckdb.sql(sql)
while batch := results.fetchmany(100):
    for row in batch:
        d = {"id": row[0], "vrs_id": row[1]}
        print(d)

# Polars alternative. NOTE: the reader functions are pl.read_ndjson (eager,
# whole file in memory) and pl.scan_ndjson (lazy, only materializes fields of
# rows that match applied filters) — they are Polars, not pandas, functions.
import polars as pl

# Only declare the fields we query; other fields are ignored by the scan.
schema = pl.Schema({"id": pl.Utf8, "definingContext": pl.Struct({"id": pl.Utf8})})

# Eager load (pl.read_ndjson on the gzipped file) is NOT RECOMMENDED unless you
# have a very large amount of memory: requirements include the data itself plus
# the overhead of the Python and Polars data structures.

# Lazy scan — recommended for large files. scan_ndjson reads the uncompressed
# file here.
catvar_file = "combined-catvar_output.ndjson"
catvar_df = pl.scan_ndjson(
    catvar_file,
    schema=schema,
)

# BUG FIX: the original called `df.filter(...)`, but `df` is never defined on a
# fresh kernel (it only appears in commented-out code); the lazy frame is named
# catvar_df.
catvar_df.filter(pl.col("id") == "clinvar:209226").collect()

# Lazy scan of the SCV file with an inferred schema; peek at the first rows.
scv_file = "combined-scv_output.ndjson"
scv_df = pl.scan_ndjson(scv_file)
scv_df.head().collect()
'NC_000017.11:g.43041893A>T'}, {'syntax': 'gnomad', 'value': '17-43041893-A-T'}], 'extensions': [{'name': 'clinvar vcf', 'value': '\"17-43041893-A-T\"'}, {'name': 'clinvar hgvs type', 'value': '\"genomic, top-level\"'}], 'id': 'ga4gh:VA.iTe7Ew7If4vdPnhfcT2qXQQY-tNu7tcF', 'label': 'NC_000017.11:43041892:A:T', 'location': {'digest': 'zdH_x64Ewl4jnJSjPQrb9ENIPb8GZOG-', 'end': '43041893', 'id': 'ga4gh:SL.zdH_x64Ewl4jnJSjPQrb9ENIPb8GZOG-', 'sequenceReference': {'extensions': [{'name': 'assembly', 'value': 'GRCh38'}, {'name': 'chromosome', 'value': '17'}], 'id': 'NC_000017.11', 'refgetAccession': 'SQ.dLZ15tNO1Ur0IcGjwc3Sdi_0A6Yf4zm7', 'residueAlphabet': 'na', 'type': 'SequenceReference'}, 'start': '43041892', 'type': 'SequenceLocation'}, 'state': '{\"sequence\":\"T\",\"type\":\"LiteralSequenceExpression\"}', 'type': 'Allele', 'copies': None, 'copyChange': None}, 'classification': {'code': 'cg000001', 'label': 'Benign', 'system': 'https://dataexchange.clinicalgenome.org/codes/'}, 'condition': {'id': 'clinvarTrait:4711', 'label': 'Breast-ovarian cancer, familial, susceptibility to, 1', 'mappings': [{'coding': {'code': 'C2676676', 'system': 'https://www.ncbi.nlm.nih.gov/medgen/'}, 'relation': 'exactMatch'}], 'type': 'Disease', 'extensions': None, 'traits': None}, 'contributions': [{'activity': {'code': 'CRO_0000105', 'label': 'submitter role', 'system': 'http://purl.obolibrary.org/obo/'}, 'agent': {'id': 'clinvar.submitter:504863', 'label': 'Evidence-based Network for the Interpretation of Germline Mutant Alleles (ENIGMA)', 'type': 'Agent'}, 'date': datetime.date(2015, 9, 29), 'label': 'Last Updated', 'type': 'Contribution'}, {'activity': {'code': 'CRO_0000105', 'label': 'submitter role', 'system': 'http://purl.obolibrary.org/obo/'}, 'agent': {'id': 'clinvar.submitter:504863', 'label': 'Evidence-based Network for the Interpretation of Germline Mutant Alleles (ENIGMA)', 'type': 'Agent'}, 'date': datetime.date(2015, 9, 29), 'label': 'First in ClinVar', 'type': 
'Contribution'}, {'activity': {'code': 'CRO_0000001', 'label': 'author role', 'system': 'http://purl.obolibrary.org/obo/'}, 'agent': {'id': 'clinvar.submitter:504863', 'label': 'Evidence-based Network for the Interpretation of Germline Mutant Alleles (ENIGMA)', 'type': 'Agent'}, 'date': datetime.date(2015, 1, 12), 'label': 'Last Evaluated', 'type': 'Contribution'}], 'direction': 'refutes', 'extensions': [{'name': 'localKey', 'value': 'NM_007294.3:c.*3785T>A|OMIM:604370'}, {'name': 'clinvarMethodCategory', 'value': 'curation'}, {'name': 'submittedClassification', 'value': 'Benign'}, {'name': 'clinvarReviewStatus', 'value': 'reviewed by expert panel'}], 'id_1': 'SCV000244507.1', 'method': {'isReportedIn': {'pmid': None, 'type': 'Document', 'url': 'https://submit.ncbi.nlm.nih.gov/ft/byid/c2ffjci6/enigma_rules_2015-03-26.pdf', 'doi': None}, 'label': 'ENIGMA BRCA1/2 Classification Criteria (2015)', 'type': 'Method'}, 'predicate': 'isCausalFor', 'scv_id': 'SCV000244507', 'scv_ver': 1, 'strength': {'code': 'cg000101', 'label': 'definitive', 'system': 'https://dataexchange.clinicalgenome.org/codes/'}, 'type': 'VariationPathogenicity', 'variation': {'definingContext': {'digest': 'iTe7Ew7If4vdPnhfcT2qXQQY-tNu7tcF', 'expressions': [{'syntax': 'spdi', 'value': 'NC_000017.11:43041892:A:T'}, {'syntax': 'hgvs.g', 'value': 'NC_000017.11:g.43041893A>T'}, {'syntax': 'gnomad', 'value': '17-43041893-A-T'}], 'extensions': [{'name': 'clinvar vcf', 'value': '17-43041893-A-T'}, {'name': 'clinvar hgvs type', 'value': 'genomic, top-level'}], 'id': 'ga4gh:VA.iTe7Ew7If4vdPnhfcT2qXQQY-tNu7tcF', 'label': 'NC_000017.11:43041892:A:T', 'location': {'digest': 'zdH_x64Ewl4jnJSjPQrb9ENIPb8GZOG-', 'end': 43041893, 'id': 'ga4gh:SL.zdH_x64Ewl4jnJSjPQrb9ENIPb8GZOG-', 'sequenceReference': {'extensions': [{'name': 'assembly', 'value': 'GRCh38'}, {'name': 'chromosome', 'value': '17'}], 'id': 'NC_000017.11', 'refgetAccession': 'SQ.dLZ15tNO1Ur0IcGjwc3Sdi_0A6Yf4zm7', 'residueAlphabet': 'na', 'type': 
'SequenceReference'}, 'start': 43041892, 'type': 'SequenceLocation'}, 'state': {'sequence': 'T', 'type': 'LiteralSequenceExpression', 'length': None, 'repeatSubunitLength': None}, 'type': 'Allele'}, 'extensions': [{'name': 'cytogenetic location', 'value': '17q21.31'}, {'name': 'clinvar variation type', 'value': 'single nucleotide variant'}, {'name': 'clinvar subclass type', 'value': 'SimpleAllele'}], 'id': 'clinvar:209226', 'label': 'NM_007294.3(BRCA1):c.*3785T>A', 'members': [{'digest': 'iTe7Ew7If4vdPnhfcT2qXQQY-tNu7tcF', 'expressions': [{'syntax': 'spdi', 'value': 'NC_000017.11:43041892:A:T'}, {'syntax': 'hgvs.g', 'value': 'NC_000017.11:g.43041893A>T'}, {'syntax': 'gnomad', 'value': '17-43041893-A-T'}], 'extensions': [{'name': 'clinvar vcf', 'value': '\"17-43041893-A-T\"'}, {'name': 'clinvar hgvs type', 'value': '\"genomic, top-level\"'}], 'id': 'ga4gh:VA.iTe7Ew7If4vdPnhfcT2qXQQY-tNu7tcF', 'label': 'NC_000017.11:43041892:A:T', 'location': {'digest': 'zdH_x64Ewl4jnJSjPQrb9ENIPb8GZOG-', 'end': '43041893', 'id': 'ga4gh:SL.zdH_x64Ewl4jnJSjPQrb9ENIPb8GZOG-', 'sequenceReference': {'extensions': [{'name': 'assembly', 'value': 'GRCh38'}, {'name': 'chromosome', 'value': '17'}], 'id': 'NC_000017.11', 'refgetAccession': 'SQ.dLZ15tNO1Ur0IcGjwc3Sdi_0A6Yf4zm7', 'residueAlphabet': 'na', 'type': 'SequenceReference'}, 'start': '43041892', 'type': 'SequenceLocation'}, 'state': '{\"sequence\":\"T\",\"type\":\"LiteralSequenceExpression\"}', 'type': 'Allele', 'copies': None, 'copyChange': None}, {'digest': None, 'expressions': [{'syntax': 'hgvs.g', 'value': 'NC_000017.10:g.41193910A>T'}, {'syntax': 'gnomad', 'value': '17-41193910-A-T'}], 'extensions': [{'name': 'clinvar vcf', 'value': '\"17-41193910-A-T\"'}, {'name': 'clinvar hgvs type', 'value': '\"genomic, top-level\"'}], 'id': 'clinvar:209226-NC_000017.10', 'label': 'NC_000017.10:g.41193910A>T', 'location': {'digest': None, 'end': '41193910', 'id': None, 'sequenceReference': {'extensions': [{'name': 'assembly', 'value': 'GRCh37'}, 
{'name': 'chromosome', 'value': '17'}], 'id': 'NC_000017.10', 'refgetAccession': None, 'residueAlphabet': 'na', 'type': 'SequenceReference'}, 'start': '41193910', 'type': 'SequenceLocation'}, 'state': None, 'type': 'Allele', 'copies': None, 'copyChange': None}, {'digest': None, 'expressions': [{'syntax': 'hgvs.g', 'value': 'NG_005905.2:g.176091T>A'}], 'extensions': [{'name': 'clinvar hgvs type', 'value': '\"genomic\"'}], 'id': 'clinvar:209226-NG_005905.2', 'label': 'NG_005905.2:g.176091T>A', 'location': {'digest': None, 'end': None, 'id': None, 'sequenceReference': {'extensions': None, 'id': 'NG_005905.2', 'refgetAccession': None, 'residueAlphabet': 'na', 'type': 'SequenceReference'}, 'start': None, 'type': 'SequenceLocation'}, 'state': None, 'type': 'Allele', 'copies': None, 'copyChange': None}, {'digest': None, 'expressions': [{'syntax': 'hgvs.g', 'value': 'LRG_292:g.176091T>A'}], 'extensions': [{'name': 'clinvar hgvs type', 'value': '\"genomic\"'}, {'name': 'pre-processing vrs issue', 'value': '\"sequence for accession not supported by vrs-python release\"'}], 'id': 'clinvar:209226-LRG_292', 'label': 'LRG_292:g.176091T>A', 'location': {'digest': None, 'end': None, 'id': None, 'sequenceReference': {'extensions': None, 'id': 'LRG_292', 'refgetAccession': None, 'residueAlphabet': 'na', 'type': 'SequenceReference'}, 'start': None, 'type': 'SequenceLocation'}, 'state': None, 'type': 'Allele', 'copies': None, 'copyChange': None}, {'digest': None, 'expressions': [{'syntax': 'hgvs.c', 'value': 'LRG_292t1:c.*3785T>A'}], 'extensions': [{'name': 'pre-processing vrs issue', 'value': '\"sequence for accession not supported by vrs-python release\"'}, {'name': 'clinvar hgvs type', 'value': '\"coding\"'}], 'id': 'clinvar:209226-LRG_292t1', 'label': 'LRG_292t1:c.*3785T>A', 'location': {'digest': None, 'end': None, 'id': None, 'sequenceReference': {'extensions': None, 'id': 'LRG_292t1', 'refgetAccession': None, 'residueAlphabet': 'na', 'type': 'SequenceReference'}, 'start': None, 
'type': 'SequenceLocation'}, 'state': None, 'type': 'Allele', 'copies': None, 'copyChange': None}], 'type': 'CanonicalAllele', 'mappings': None, 'copies': None, 'location': None, 'copyChange': None}, 'description': 'Class 1 not pathogenic based on frequency >1% in an outbred sampleset. Frequency 0.33 (Asian), 0.87 (African), 0.37 (European), derived from 1000 genomes (2012-04-30).', 'isReportedIn': None, 'qualifiers': None})\n" + ] + } + ], + "source": [ + "import gzip\n", + "import duckdb\n", + "\n", + "scv_file = \"combined-scv_output.ndjson.gz\"\n", + "vrs_id = \"ga4gh:VA.iTe7Ew7If4vdPnhfcT2qXQQY-tNu7tcF\"\n", + "# d = duckdb.connect()\n", + "query = f\"\"\"\n", + "SELECT id, member, a\n", + "FROM\n", + " (SELECT id, UNNEST(variation.members) AS member, s.*\n", + " FROM read_json('combined-scv_output.ndjson.gz', format='newline_delimited', ignore_errors=true) s\n", + " ) a\n", + "WHERE member.id = '{vrs_id}'\n", + "\n", + "LIMIT 10\n", + "\"\"\"\n", + "# AND a.variation.type = 'CategoricalCnv'\n", + "# WHERE (variation.type = 'CategoricalCnv')\n", + "# LIMIT 10\n", + "# AND member.id = '{vrs_id}'\n", + "\n", + "results = duckdb.query(query)\n", + "\n", + "scv_ids = []\n", + "while batch := results.fetchmany(100):\n", + " for row in batch:\n", + " print(row)\n", + " scv_ids.append(row[0])\n", + "\n", + "scv_ids = [row[0] for row in batch]\n", + "\n", + "# with gzip.open(scv_file) as f:\n", + "# for line in f:\n", + "# pass" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Inserted 6000 records\n", + "Inserted 16000 records\n", + "Inserted 28000 records\n", + "Inserted 40000 records\n", + "Inserted 54000 records\n", + "Inserted 68000 records\n", + "Inserted 82000 records\n", + "Inserted 98000 records\n", + "Inserted 114000 records\n", + "Inserted 124000 records\n", + "Inserted 138000 records\n", + "Inserted 154000 records\n", + "Inserted 168000 records\n", 
+ "Inserted 184000 records\n", + "Inserted 200000 records\n", + "Inserted 214000 records\n", + "Inserted 230000 records\n", + "Inserted 246000 records\n", + "Inserted 258000 records\n", + "Inserted 274000 records\n", + "Inserted 290000 records\n", + "Inserted 306000 records\n", + "Inserted 320000 records\n", + "Inserted 334000 records\n", + "Inserted 350000 records\n", + "Inserted 364000 records\n", + "Inserted 380000 records\n", + "Inserted 392000 records\n", + "Inserted 408000 records\n", + "Inserted 422000 records\n", + "Inserted 438000 records\n", + "Inserted 454000 records\n", + "Inserted 468000 records\n", + "Inserted 482000 records\n", + "Inserted 496000 records\n", + "Inserted 510000 records\n", + "Inserted 526000 records\n", + "Inserted 540000 records\n", + "Inserted 556000 records\n", + "Inserted 570000 records\n", + "Inserted 586000 records\n", + "Inserted 602000 records\n", + "Inserted 616000 records\n", + "Inserted 632000 records\n", + "Inserted 646000 records\n", + "Inserted 660000 records\n", + "Inserted 676000 records\n", + "Inserted 692000 records\n", + "Inserted 708000 records\n", + "Inserted 722000 records\n", + "Inserted 738000 records\n", + "Inserted 752000 records\n", + "Inserted 768000 records\n", + "Inserted 784000 records\n", + "Inserted 800000 records\n", + "Inserted 816000 records\n", + "Inserted 834000 records\n", + "Inserted 850000 records\n", + "Inserted 864000 records\n", + "Inserted 880000 records\n", + "Inserted 896000 records\n", + "Inserted 912000 records\n", + "Inserted 930000 records\n", + "Inserted 946000 records\n", + "Inserted 962000 records\n", + "Inserted 978000 records\n", + "Inserted 994000 records\n", + "Inserted 1010000 records\n", + "Inserted 1026000 records\n", + "Inserted 1040000 records\n", + "Inserted 1056000 records\n", + "Inserted 1072000 records\n", + "Inserted 1088000 records\n", + "Inserted 1104000 records\n", + "Inserted 1118000 records\n", + "Inserted 1132000 records\n", + "Inserted 1150000 records\n", + 
"Inserted 1166000 records\n", + "Inserted 1182000 records\n", + "Inserted 1198000 records\n", + "Inserted 1212000 records\n", + "Inserted 1228000 records\n", + "Inserted 1244000 records\n", + "Inserted 1256000 records\n", + "Inserted 1272000 records\n", + "Inserted 1288000 records\n", + "Inserted 1292000 records\n", + "Inserted 1306000 records\n", + "Inserted 1322000 records\n", + "Inserted 1338000 records\n", + "Inserted 1354000 records\n", + "Inserted 1370000 records\n", + "Inserted 1386000 records\n", + "Inserted 1398000 records\n", + "Inserted 1412000 records\n", + "Inserted 1426000 records\n", + "Inserted 1442000 records\n", + "Inserted 1458000 records\n", + "Inserted 1474000 records\n", + "Inserted 1490000 records\n", + "Inserted 1506000 records\n", + "Inserted 1522000 records\n", + "Inserted 1538000 records\n", + "Inserted 1554000 records\n", + "Inserted 1570000 records\n", + "Inserted 1584000 records\n", + "Inserted 1600000 records\n", + "Inserted 1616000 records\n", + "Inserted 1632000 records\n", + "Inserted 1648000 records\n", + "Inserted 1664000 records\n", + "Inserted 1682000 records\n", + "Inserted 1700000 records\n", + "Inserted 1714000 records\n", + "Inserted 1726000 records\n", + "Inserted 1740000 records\n", + "Inserted 1756000 records\n", + "Inserted 1772000 records\n", + "Inserted 1788000 records\n", + "Inserted 1804000 records\n", + "Inserted 1820000 records\n", + "Inserted 1836000 records\n", + "Inserted 1850000 records\n", + "Inserted 1866000 records\n", + "Inserted 1882000 records\n", + "Inserted 1898000 records\n", + "Inserted 1910000 records\n", + "Inserted 1926000 records\n", + "Inserted 1942000 records\n", + "Inserted 1958000 records\n", + "Inserted 1974000 records\n", + "Inserted 1986000 records\n", + "Inserted 2000000 records\n", + "Inserted 2016000 records\n", + "Inserted 2032000 records\n", + "Inserted 2048000 records\n", + "Inserted 2064000 records\n", + "Inserted 2080000 records\n", + "Inserted 2096000 records\n", + "Inserted 
2108000 records\n", + "Inserted 2122000 records\n", + "Inserted 2138000 records\n" + ] + }, + { + "ename": "DiskFull", + "evalue": "could not extend file \"base/696428/4298968.7\": No space left on device\nHINT: Check free disk space.\n", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mDiskFull\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[3], line 94\u001b[0m\n\u001b[1;32m 92\u001b[0m values \u001b[38;5;241m=\u001b[39m [v \u001b[38;5;28;01mfor\u001b[39;00m sublist \u001b[38;5;129;01min\u001b[39;00m values \u001b[38;5;28;01mfor\u001b[39;00m v \u001b[38;5;129;01min\u001b[39;00m sublist]\n\u001b[1;32m 93\u001b[0m query \u001b[38;5;241m=\u001b[39m query_base \u001b[38;5;241m+\u001b[39m values_templ\n\u001b[0;32m---> 94\u001b[0m \u001b[43mcur\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mexecute\u001b[49m\u001b[43m(\u001b[49m\u001b[43mquery\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mvalues\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 96\u001b[0m \u001b[38;5;66;03m# executemany strategy\u001b[39;00m\n\u001b[1;32m 97\u001b[0m \u001b[38;5;66;03m# cur.executemany(\u001b[39;00m\n\u001b[1;32m 98\u001b[0m \u001b[38;5;66;03m# \"INSERT INTO scv (id, value) VALUES (%s, %s)\",\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 105\u001b[0m \u001b[38;5;66;03m# (record[\"id\"], json.dumps(record)),\u001b[39;00m\n\u001b[1;32m 106\u001b[0m \u001b[38;5;66;03m# )\u001b[39;00m\n\u001b[1;32m 107\u001b[0m idx \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;28mlen\u001b[39m(records)\n", + "\u001b[0;31mDiskFull\u001b[0m: could not extend file \"base/696428/4298968.7\": No space left on device\nHINT: Check free disk space.\n" + ] + } + ], + "source": [ + "import gzip\n", + "import json\n", + "import psycopg2\n", + "from psycopg2.extensions import ISOLATION_LEVEL_AUTOCOMMIT\n", + "\n", + "scv_file = 
\"combined-scv_output.ndjson.gz\"\n", + "\n", + "# docker volume create clinvar_data\n", + "# docker run --name mypostgres -e POSTGRES_PASSWORD=mysecretpassword -d -p 5432:5432 -v clinvar_data:/var/lib/postgresql/data postgres\n", + "\n", + "# Connect to the default database\n", + "conn = psycopg2.connect(\n", + " host=\"localhost\", dbname=\"postgres\", user=\"postgres\", password=\"mysecretpassword\"\n", + ")\n", + "conn.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT)\n", + "\n", + "# Create a new database\n", + "cur = conn.cursor()\n", + "# cur.execute(\"DROP DATABASE IF EXISTS clinvar\")\n", + "db_name = \"clinvar\"\n", + "cur.execute(\"SELECT 1 FROM pg_catalog.pg_database WHERE datname = %s\", (db_name,))\n", + "db_exists = cur.fetchone()\n", + "if not db_exists:\n", + " cur.execute(\"CREATE DATABASE IF NOT EXISTS clinvar\")\n", + "cur.close()\n", + "conn.close()\n", + "\n", + "# with psycopg2.connect(\n", + "# host=\"localhost\", dbname=\"postgres\", user=\"postgres\", password=\"mysecretpassword\"\n", + "# ) as conn:\n", + "# conn.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT)\n", + "# cur = conn.cursor()\n", + "# cur.execute(\"CREATE DATABASE clinvar\")\n", + "\n", + "\n", + "with psycopg2.connect(\n", + " host=\"localhost\", dbname=\"clinvar\", user=\"postgres\", password=\"mysecretpassword\"\n", + ") as conn:\n", + " cur = conn.cursor()\n", + " cur.execute(\"DROP TABLE IF EXISTS scv\")\n", + " cur.execute(\n", + " \"\"\"\n", + " CREATE TABLE IF NOT EXISTS scv (\n", + " id TEXT PRIMARY KEY,\n", + " value JSONB\n", + " )\n", + " \"\"\"\n", + " )\n", + " cur.execute(\n", + " # Create a GIN index on value.definingContext.id\n", + " \"\"\"\n", + " CREATE INDEX IF NOT EXISTS scv_definingContext_id_index ON scv ((value->'definingContext'->>'id'))\n", + " \"\"\"\n", + " )\n", + "\n", + "import time\n", + "\n", + "start = time.time()\n", + "last_printed = start\n", + "\n", + "\n", + "def file_batch_lines(fileio, batch_size=10000):\n", + " \"\"\"\n", + " Generator 
function to read a file and yield batches of lines.\n", + "\n", + " :param file_path: Path to the file to be read.\n", + " :param batch_size: Number of lines per batch.\n", + " \"\"\"\n", + " batch = []\n", + " for line in fileio:\n", + " batch.append(line)\n", + " if len(batch) == batch_size:\n", + " yield batch\n", + " batch = []\n", + " if batch:\n", + " yield batch\n", + "\n", + "\n", + "with psycopg2.connect(\n", + " host=\"localhost\", dbname=\"clinvar\", user=\"postgres\", password=\"mysecretpassword\"\n", + ") as conn:\n", + " cur = conn.cursor()\n", + " with gzip.open(scv_file, \"rt\") as f:\n", + " idx = 0\n", + " for line_batch in file_batch_lines(f, batch_size=2000):\n", + " # mega SQL string strategy\n", + " records = [json.loads(line) for line in line_batch]\n", + " query_base = \"INSERT INTO scv (id, value) VALUES \"\n", + " values_templ = \",\".join([\"(%s, %s)\"] * len(records))\n", + " values = [(r[\"id\"], line) for r, line in zip(records, line_batch)]\n", + " # flatten values by 1 level\n", + " values = [v for sublist in values for v in sublist]\n", + " query = query_base + values_templ\n", + " cur.execute(query, values)\n", + "\n", + " # executemany strategy\n", + " # cur.executemany(\n", + " # \"INSERT INTO scv (id, value) VALUES (%s, %s)\",\n", + " # [(r[\"id\"], line) for r, line in zip(records, line_batch)],\n", + " # )\n", + "\n", + " # record = json.loads(line)\n", + " # cur.execute(\n", + " # \"INSERT INTO scv (id, value) VALUES (%s, %s)\",\n", + " # (record[\"id\"], json.dumps(record)),\n", + " # )\n", + " idx += len(records)\n", + " if time.time() - last_printed > 10:\n", + " print(f\"Inserted {idx} records\")\n", + " last_printed = time.time()\n", + " cur.execute(\"COMMIT\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "venv", + "language": "python", + "name": "python3" + }, + "language_info": { + 
"codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.9" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From 1359b18d00f86c3d20acf29db74aa3544a54690c Mon Sep 17 00:00:00 2001 From: Kyle Ferriter Date: Sun, 14 Jul 2024 16:38:24 -0400 Subject: [PATCH 2/5] Add a list_blobs function and fix some pyright warnings in gcs.py --- clinvar_gk_pilot/gcs.py | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/clinvar_gk_pilot/gcs.py b/clinvar_gk_pilot/gcs.py index 727a92c..2496587 100644 --- a/clinvar_gk_pilot/gcs.py +++ b/clinvar_gk_pilot/gcs.py @@ -98,7 +98,7 @@ def blob_writer( def blob_reader( - blob_uri: str, client: storage.Client = None, binary=True + blob_uri: str, client: storage.Client | None = None, binary=True ) -> storage.Blob: """ Returns a file-like object that can be used to read from the blob at `blob_uri` @@ -109,7 +109,7 @@ def blob_reader( return blob.open("rb" if binary else "r") -def blob_size(blob_uri: str, client: storage.Client = None) -> int: +def blob_size(blob_uri: str, client: storage.Client | None = None) -> int: """ Returns the size of the blob in bytes if it exists. Raises an error if it does not. 
""" @@ -117,9 +117,24 @@ def blob_size(blob_uri: str, client: storage.Client = None) -> int: client = _get_gcs_client() blob = parse_blob_uri(blob_uri, client=client) blob.reload() # Refreshes local Blob object properties from the remote object + assert blob.exists() + assert blob.size is not None return blob.size +def list_blobs(bucket_name: str, prefix: str, client: storage.Client | None = None): + if client is None: + client = _get_gcs_client() + + bucket = client.get_bucket(bucket_name) + blobs = bucket.list_blobs(prefix=prefix) + + # Generate the list of blob URIs + blob_uris = [f"gs://{bucket_name}/{blob.name}" for blob in blobs] + + return blob_uris + + def http_download_requests( http_uri: str, local_path: PurePath, From 264235ef9a167bfd71362b9af18f2fca40251bbf Mon Sep 17 00:00:00 2001 From: Kyle Ferriter Date: Mon, 15 Jul 2024 00:50:39 -0400 Subject: [PATCH 3/5] Make log configuration work from notebooks that are not in the project root folder --- clinvar_gk_pilot/logger.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/clinvar_gk_pilot/logger.py b/clinvar_gk_pilot/logger.py index 30c8be5..e7010cd 100644 --- a/clinvar_gk_pilot/logger.py +++ b/clinvar_gk_pilot/logger.py @@ -1,7 +1,10 @@ import json import logging.config +from pathlib import Path -with open("log_conf.json", "r") as f: +PROJECT_ROOT = Path(__file__).resolve().parents[1] + +with open(PROJECT_ROOT / "log_conf.json", "r", encoding="utf-8") as f: conf = json.load(f) logging.config.dictConfig(conf) From d1195b3be05cdfa4c2b11eb3c236df774b3c3d08 Mon Sep 17 00:00:00 2001 From: Kyle Ferriter Date: Mon, 15 Jul 2024 02:44:42 -0400 Subject: [PATCH 4/5] Trim down and improve layout of ashg-clinvar-gks-python notebook --- notebooks/ASHG-clinvar-GKS-python.ipynb | 470 ++++++------------------ 1 file changed, 113 insertions(+), 357 deletions(-) diff --git a/notebooks/ASHG-clinvar-GKS-python.ipynb b/notebooks/ASHG-clinvar-GKS-python.ipynb index 03f6eb2..0496a2d 100644 --- 
a/notebooks/ASHG-clinvar-GKS-python.ipynb +++ b/notebooks/ASHG-clinvar-GKS-python.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 3, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -12,45 +12,63 @@ " already_downloaded,\n", ")\n", "\n", - "# variation_blob_uri = \"gs://clingen-public/clinvar-gk-pilot/2024-04-07/dev/clinvar-variation-20240407.json.gz\"\n", - "# scv_blob_uri = (\n", - "# \"gs://clingen-public/clinvar-gk-pilot/2024-04-07/dev/clinvar-scvs-20240407.json.gz\"\n", - "# )\n", + "# Change the cwd from `notebooks` to the parent (the project root) so we download the files there\n", + "import os\n", + "\n", + "os.chdir(os.path.dirname(os.getcwd()))\n", "\n", "catvar_blob_uri = (\n", " \"gs://clinvar-gk-pilot/2024-04-07/dev/combined-catvar_output.ndjson.gz\"\n", ")\n", "scv_blob_uri = \"gs://clinvar-gk-pilot/2024-04-07/dev/combined-scv_output.ndjson.gz\"\n", "\n", - "catvar_file = \"combined-catvar_output.ndjson.gz\"\n", - "\n", "\n", "variation_local_file_path = _local_file_path_for(catvar_blob_uri)\n", "if not already_downloaded(catvar_blob_uri):\n", " print(f\"Downloading {catvar_blob_uri} to {variation_local_file_path}\")\n", " dl_variation_local_file_path = download_to_local_file(catvar_blob_uri)\n", - " assert dl_variation_local_file_path == variation_local_file_path\n", "\n", "scv_local_file_path = _local_file_path_for(scv_blob_uri)\n", "if not already_downloaded(scv_blob_uri):\n", " print(f\"Downloading {scv_blob_uri} to {scv_local_file_path}\")\n", " dl_scv_local_file_path = download_to_local_file(scv_blob_uri)\n", - " assert dl_scv_local_file_path == scv_local_file_path\n", + "\n", + "# catvar_file = \"combined-catvar_output.ndjson.gz\"\n", + "# scv_file = \"combined-scv_output.ndjson.gz\"\n", "\n", "catvar_file = variation_local_file_path\n", "scv_file = scv_local_file_path" ] }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "def 
print_scvs(scvs):\n", + " \"\"\"\n", + " Reused function to print some core fields from a GKS-modeled ClinVar SCV record.\n", + " \"\"\"\n", + " for scv in scvs:\n", + " classification = scv[\"classification\"][\"label\"]\n", + " condition = scv[\"condition\"][\"label\"]\n", + " print(f\"SCV: {scv['id']} \")\n", + " print(f\" Classification: {classification}\")\n", + " print(f\" Condition: {condition}\")\n", + " print()" + ] + }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Our ClinVar datasets are available as NDJSON files. There is a variation file and an SCV file. The records of the variation file are CategoricalVariation objects, and the records of the SCV file are `VariationPathogenicity` (sub-class of `Statement`)" + "Our ClinVar datasets are available as both id-keyed JSON files, and NDJSON files. For each format there is a variation file and an SCV file. The demos in this notebook use the NDJSON formatted files. The records of the variation file are `CategoricalVariation` objects, and the records of the SCV file are `VariationPathogenicity` (sub-class of `Statement`)" ] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -78,30 +96,28 @@ "################################\n", "# A CanonicalAllele\n", "## For searching based on the GKS Categorical Variation (CatVrs) ID\n", - "clinvar_id = \"563765\"\n", + "clinvar_id_canonicalallele = \"563765\"\n", + "catvar_id_canonicalallele = f\"clinvar:{clinvar_id_canonicalallele}\"\n", "## For searching based on the GA4GH VRS Variation ID\n", - "vrs_id = \"ga4gh:VA.hf_va4AnlG99NuOjtaXJzh_XvszWWOO9\"\n", + "vrs_id_canonicalallele = \"ga4gh:VA.hf_va4AnlG99NuOjtaXJzh_XvszWWOO9\"\n", "\n", "\n", "################################\n", "# A CategoricalCnv\n", "## For searching based on the GKS Categorical Variation (CatVrs) ID\n", - "clinvar_id = \"599353\"\n", + "clinvar_id_categoricalcnv = \"599353\"\n", + "catvar_id_categoricalcnv = 
f\"clinvar:{clinvar_id_categoricalcnv}\"\n", "## For searching based on the GA4GH VRS Variation ID\n", - "vrs_id = \"ga4gh:CX.5iqyOA4L5njh5FpymTPcwQ8oHTilQFmo\" # GRCh38 member\n", - "\n", + "vrs_id_categoricalcnv = \"ga4gh:CX.5iqyOA4L5njh5FpymTPcwQ8oHTilQFmo\" # GRCh38 member\n", "\n", - "catvar_file = \"combined-catvar_output.ndjson.gz\"\n", - "scv_file = \"combined-scv_output.ndjson.gz\"\n", "################################\n", "assert os.path.exists(catvar_file)\n", - "assert os.path.exists(scv_file)\n", - "catvar_id = f\"clinvar:{clinvar_id}\"" + "assert os.path.exists(scv_file)" ] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ @@ -110,10 +126,10 @@ "################################\n", "\n", "\n", - "def query_scvs_by_vrs_id(vrs_id: str, scv_file: str):\n", + "def query_scvs_by_vrs_id(vrs_id: str, scv_file_name: str):\n", " scvs = []\n", - " catvars = []\n", - " with gzip.open(scv_file, \"rt\") as f:\n", + " # catvars = []\n", + " with gzip.open(scv_file_name, \"rt\") as f:\n", " for line in f:\n", " record = json.loads(line)\n", " variation = record[\"variation\"]\n", @@ -153,389 +169,129 @@ " continue\n", " # raise ValueError(f\"DescribedVariation not yet implemented: {line}\")\n", " case _:\n", - " raise ValueError(f\"Unexpected variation type: {variation['type']}\")\n", - " return scvs\n", - "\n", - "\n", - "scvs = query_scvs_by_vrs_id(vrs_id, scv_file)" + " raise ValueError(\n", + " f\"Unexpected variation type ({variation['type']}): {line}\"\n", + " )\n", + " return scvs" ] }, { "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "SCV: SCV000864190.1 \n", - " Classification: Likely pathogenic\n", - " Condition: Squalene synthase deficiency\n", - "\n" - ] - } - ], - "source": [ - "for scv in scvs:\n", - " classification = scv[\"classification\"][\"label\"]\n", - " condition = 
scv[\"condition\"][\"label\"]\n", - " print(f\"SCV: {scv['id']} \")\n", - " print(f\" Classification: {classification}\")\n", - " print(f\" Condition: {condition}\")\n", - " print()" - ] - }, - { - "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "################################\n", - "# load the SCV ndjson into a sqlite database with a few indexes on the scv, catvar, and vrs IDs\n", + "# Query the SCV file for the matching CatVar ID\n", "################################\n", - "# # sqlite schema:\n", - "# # scv_id: str (index)\n", - "# # catvar_id: str (index)\n", - "# # vrs_id: str (index) # Can have multiple records if a record has multiple defining VRS IDs\n", - "# # value: the blob of the SCV record gziped and as bytes\n", - "\n", - "# import sqlite3\n", - "\n", - "# db_name = \"clinvar.db\"\n", - "# conn = sqlite3.connect(db_name)\n", "\n", - "# # Create the schema and the indexes on columns\n", - "# c = conn.cursor()\n", - "# c.execute(\n", - "# \"\"\"\n", - "# CREATE TABLE IF NOT EXISTS scv (\n", - "# scv_id TEXT,\n", - "# catvar_id TEXT,\n", - "# vrs_id TEXT,\n", - "# value BLOB,\n", - "# )\n", - "# \"\"\"\n", - "# )\n", - "# c.execute(\"CREATE INDEX IF NOT EXISTS scv_id_index ON scv (scv_id)\")\n", - "# c.execute(\"CREATE INDEX IF NOT EXISTS catvar_id_index ON scv (catvar_id)\")\n", - "# c.execute(\"CREATE INDEX IF NOT EXISTS vrs_id_index ON scv (vrs_id)\")\n", - "# # Create a uniqueness constraint so these cannot be duplicated\n", - "# c.execute(\n", - "# \"CREATE UNIQUE INDEX IF NOT EXISTS scv_catvar_vrs_id_index ON scv (scv_id, catvar_id, vrs_id)\"\n", - "# )\n", - "# conn.commit()\n", "\n", - "# with gzip.open(scv_file, \"rt\") as f:\n", - "# for line in f:\n", - "# record = json.loads(line)\n", - "\n", - "# variation = record[\"variation\"]\n", - "# processing_errors = [\n", - "# e\n", - "# for e in variation.get(\"extensions\", [])\n", - "# if e[\"name\"] == \"vrs processing 
errors\"\n", - "# ]\n", - "# if len(processing_errors) > 0:\n", - "# # print(\n", - "# # f\"Skipping SCV record with VRS processing errors: {json.dumps(record)}\"\n", - "# # )\n", - "# continue\n", - "\n", - "# match variation[\"type\"]:\n", - "# case \"CategoricalCnv\":\n", - "# if \"members\" not in variation:\n", - "# # Unsupported?\n", - "# # e.g. \"clinvar:1878325\"\n", - "# # \"NC_000018.9:g.(48556994_48573289)_48573471dup\"\n", - "# # raise ValueError(\n", - "# # f\"CategoricalCnv missing members field: {json.dumps(record)}\"\n", - "# # )\n", - "# continue\n", - "# members = variation[\"members\"]\n", - "# member_vrs_ids = [m[\"id\"] for m in members]\n", - "# if vrs_id in member_vrs_ids:\n", - "# scvs.append(record)\n", - "\n", - "# case \"CanonicalAllele\":\n", - "# if \"definingContext\" not in variation:\n", - "# # Unsupported allele type?\n", - "# # e.g. clinvar:215984\n", - "# # \"NM_000251.2(MSH2):c.212-?_366+?dup\"\n", - "# # raise ValueError(\n", - "# # f\"CanonicalAllele missing definingContext field: {json.dumps(record)}\"\n", - "# # )\n", - "# continue\n", - "# if variation[\"definingContext\"][\"id\"] == vrs_id:\n", - "# scvs.append(record)\n", - "# case \"DescribedVariation\":\n", - "# # do not have any VRS IDs\n", - "# continue\n", - "# # raise ValueError(\n", - "# # f\"DescribedVariation not yet implemented: {json.dumps(record)}\"\n", - "# # )\n", - "# case _:\n", - "# raise ValueError(f\"Unexpected variation type: {variation['type']}\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ + "def query_scvs_by_catvar_id(catvar_id: str, scv_file_name: str):\n", + " scvs = []\n", + " # catvars = []\n", + " with gzip.open(scv_file_name, \"rt\") as f:\n", + " for line in f:\n", + " record = json.loads(line)\n", + " variation = record[\"variation\"]\n", + " record_catvar_id = variation[\"id\"]\n", "\n", - "While these 
can be read with vanilla python by iterating lines and parsing each as JSON, there are also libraries which can make querying the files simpler and potentially more performant.\n", + " if record_catvar_id == catvar_id:\n", + " scvs.append(record)\n", "\n", - "One option is DuckDB. DuckDB achieves fast speeds and low memory utilization by memory mapping files and dropping rows from memory that don't match filter criteria. It also has the benefit of being able to query GZIP-compressed NDJSON files directly, so disk storage stays minimal, and presenting a SQL interface to the data, with full support of nested structs so we can access fields from nested JSON objects without manipulating the files. Another benefit is that it gracefully handles heterogeneous record schemas, automatically nulling values that don't exist in particular rows." + " return scvs" ] }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", + "execution_count": 6, "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "SCV: SCV004334569.1 \n", + " Classification: Likely benign\n", + " Condition: BAP1-related tumor predisposition syndrome\n", + "\n" + ] + } + ], "source": [ - "Another option to query the NDJSON files is the Polars library for Python.\n", - "\n", - "This library is similar to Pandas, but it is written in Rust, and it supports lazily querying files without loading all contents into memory.\n", + "scvs_by_vrs_id_canonicalallele = query_scvs_by_vrs_id(vrs_id_canonicalallele, scv_file)\n", "\n", - "To read a whole NDJSON file, use `pandas.read_ndjson`. To read it lazily and only load into a dataframe the fields of rows that match applied filters, use `pandas.scan_ndjson` (as we do below)." 
+ "print_scvs(scvs_by_vrs_id_canonicalallele)" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "SCV: SCV000864190.1 \n", + " Classification: Likely pathogenic\n", + " Condition: Squalene synthase deficiency\n", + "\n" + ] + } + ], "source": [ - "import polars as pl\n", + "scvs_by_vrs_id_categoricalcnv = query_scvs_by_vrs_id(vrs_id_categoricalcnv, scv_file)\n", "\n", - "schema = pl.Schema({\"id\": pl.Utf8, \"definingContext\": pl.Struct({\"id\": pl.Utf8})})\n", - "\n", - "# Load the whole file into memory.\n", - "# NOT RECOMMENDED unless you have a very large amount of memory.\n", - "# Memory requirements include the data itself plus the overhead of the Python and Polars data structures.\n", - "# import gzip\n", - "# catvar_file = \"combined-catvar_output.ndjson.gz\"\n", - "# with gzip.open(catvar_file, \"rb\") as f:\n", - "# df = pl.read_ndjson(f, schema=schema)\n", - "\n", - "# Load file lazily. 
Recommended for large files.\n", - "catvar_file = \"combined-catvar_output.ndjson\"\n", - "catvar_df = pl.scan_ndjson(\n", - " catvar_file,\n", - " schema=schema,\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df.filter(pl.col(\"id\") == \"clinvar:209226\").collect()" + "print_scvs(scvs_by_vrs_id_categoricalcnv)" ] }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "scv_file = \"combined-scv_output.ndjson\"\n", - "# scv_schema = pl.Schema(\n", - "scv_df = pl.scan_ndjson(scv_file)\n", - "scv_df.head().collect()" - ] - }, - { - "cell_type": "code", - "execution_count": 17, + "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "('SCV000244507.1', {'digest': 'iTe7Ew7If4vdPnhfcT2qXQQY-tNu7tcF', 'expressions': [{'syntax': 'spdi', 'value': 'NC_000017.11:43041892:A:T'}, {'syntax': 'hgvs.g', 'value': 'NC_000017.11:g.43041893A>T'}, {'syntax': 'gnomad', 'value': '17-43041893-A-T'}], 'extensions': [{'name': 'clinvar vcf', 'value': '\"17-43041893-A-T\"'}, {'name': 'clinvar hgvs type', 'value': '\"genomic, top-level\"'}], 'id': 'ga4gh:VA.iTe7Ew7If4vdPnhfcT2qXQQY-tNu7tcF', 'label': 'NC_000017.11:43041892:A:T', 'location': {'digest': 'zdH_x64Ewl4jnJSjPQrb9ENIPb8GZOG-', 'end': '43041893', 'id': 'ga4gh:SL.zdH_x64Ewl4jnJSjPQrb9ENIPb8GZOG-', 'sequenceReference': {'extensions': [{'name': 'assembly', 'value': 'GRCh38'}, {'name': 'chromosome', 'value': '17'}], 'id': 'NC_000017.11', 'refgetAccession': 'SQ.dLZ15tNO1Ur0IcGjwc3Sdi_0A6Yf4zm7', 'residueAlphabet': 'na', 'type': 'SequenceReference'}, 'start': '43041892', 'type': 'SequenceLocation'}, 'state': '{\"sequence\":\"T\",\"type\":\"LiteralSequenceExpression\"}', 'type': 'Allele', 'copies': None, 'copyChange': None}, {'id': 'SCV000244507.1', 'member': {'digest': 'iTe7Ew7If4vdPnhfcT2qXQQY-tNu7tcF', 'expressions': [{'syntax': 'spdi', 'value': 
'NC_000017.11:43041892:A:T'}, {'syntax': 'hgvs.g', 'value': 'NC_000017.11:g.43041893A>T'}, {'syntax': 'gnomad', 'value': '17-43041893-A-T'}], 'extensions': [{'name': 'clinvar vcf', 'value': '\"17-43041893-A-T\"'}, {'name': 'clinvar hgvs type', 'value': '\"genomic, top-level\"'}], 'id': 'ga4gh:VA.iTe7Ew7If4vdPnhfcT2qXQQY-tNu7tcF', 'label': 'NC_000017.11:43041892:A:T', 'location': {'digest': 'zdH_x64Ewl4jnJSjPQrb9ENIPb8GZOG-', 'end': '43041893', 'id': 'ga4gh:SL.zdH_x64Ewl4jnJSjPQrb9ENIPb8GZOG-', 'sequenceReference': {'extensions': [{'name': 'assembly', 'value': 'GRCh38'}, {'name': 'chromosome', 'value': '17'}], 'id': 'NC_000017.11', 'refgetAccession': 'SQ.dLZ15tNO1Ur0IcGjwc3Sdi_0A6Yf4zm7', 'residueAlphabet': 'na', 'type': 'SequenceReference'}, 'start': '43041892', 'type': 'SequenceLocation'}, 'state': '{\"sequence\":\"T\",\"type\":\"LiteralSequenceExpression\"}', 'type': 'Allele', 'copies': None, 'copyChange': None}, 'classification': {'code': 'cg000001', 'label': 'Benign', 'system': 'https://dataexchange.clinicalgenome.org/codes/'}, 'condition': {'id': 'clinvarTrait:4711', 'label': 'Breast-ovarian cancer, familial, susceptibility to, 1', 'mappings': [{'coding': {'code': 'C2676676', 'system': 'https://www.ncbi.nlm.nih.gov/medgen/'}, 'relation': 'exactMatch'}], 'type': 'Disease', 'extensions': None, 'traits': None}, 'contributions': [{'activity': {'code': 'CRO_0000105', 'label': 'submitter role', 'system': 'http://purl.obolibrary.org/obo/'}, 'agent': {'id': 'clinvar.submitter:504863', 'label': 'Evidence-based Network for the Interpretation of Germline Mutant Alleles (ENIGMA)', 'type': 'Agent'}, 'date': datetime.date(2015, 9, 29), 'label': 'Last Updated', 'type': 'Contribution'}, {'activity': {'code': 'CRO_0000105', 'label': 'submitter role', 'system': 'http://purl.obolibrary.org/obo/'}, 'agent': {'id': 'clinvar.submitter:504863', 'label': 'Evidence-based Network for the Interpretation of Germline Mutant Alleles (ENIGMA)', 'type': 'Agent'}, 'date': datetime.date(2015, 
9, 29), 'label': 'First in ClinVar', 'type': 'Contribution'}, {'activity': {'code': 'CRO_0000001', 'label': 'author role', 'system': 'http://purl.obolibrary.org/obo/'}, 'agent': {'id': 'clinvar.submitter:504863', 'label': 'Evidence-based Network for the Interpretation of Germline Mutant Alleles (ENIGMA)', 'type': 'Agent'}, 'date': datetime.date(2015, 1, 12), 'label': 'Last Evaluated', 'type': 'Contribution'}], 'direction': 'refutes', 'extensions': [{'name': 'localKey', 'value': 'NM_007294.3:c.*3785T>A|OMIM:604370'}, {'name': 'clinvarMethodCategory', 'value': 'curation'}, {'name': 'submittedClassification', 'value': 'Benign'}, {'name': 'clinvarReviewStatus', 'value': 'reviewed by expert panel'}], 'id_1': 'SCV000244507.1', 'method': {'isReportedIn': {'pmid': None, 'type': 'Document', 'url': 'https://submit.ncbi.nlm.nih.gov/ft/byid/c2ffjci6/enigma_rules_2015-03-26.pdf', 'doi': None}, 'label': 'ENIGMA BRCA1/2 Classification Criteria (2015)', 'type': 'Method'}, 'predicate': 'isCausalFor', 'scv_id': 'SCV000244507', 'scv_ver': 1, 'strength': {'code': 'cg000101', 'label': 'definitive', 'system': 'https://dataexchange.clinicalgenome.org/codes/'}, 'type': 'VariationPathogenicity', 'variation': {'definingContext': {'digest': 'iTe7Ew7If4vdPnhfcT2qXQQY-tNu7tcF', 'expressions': [{'syntax': 'spdi', 'value': 'NC_000017.11:43041892:A:T'}, {'syntax': 'hgvs.g', 'value': 'NC_000017.11:g.43041893A>T'}, {'syntax': 'gnomad', 'value': '17-43041893-A-T'}], 'extensions': [{'name': 'clinvar vcf', 'value': '17-43041893-A-T'}, {'name': 'clinvar hgvs type', 'value': 'genomic, top-level'}], 'id': 'ga4gh:VA.iTe7Ew7If4vdPnhfcT2qXQQY-tNu7tcF', 'label': 'NC_000017.11:43041892:A:T', 'location': {'digest': 'zdH_x64Ewl4jnJSjPQrb9ENIPb8GZOG-', 'end': 43041893, 'id': 'ga4gh:SL.zdH_x64Ewl4jnJSjPQrb9ENIPb8GZOG-', 'sequenceReference': {'extensions': [{'name': 'assembly', 'value': 'GRCh38'}, {'name': 'chromosome', 'value': '17'}], 'id': 'NC_000017.11', 'refgetAccession': 
'SQ.dLZ15tNO1Ur0IcGjwc3Sdi_0A6Yf4zm7', 'residueAlphabet': 'na', 'type': 'SequenceReference'}, 'start': 43041892, 'type': 'SequenceLocation'}, 'state': {'sequence': 'T', 'type': 'LiteralSequenceExpression', 'length': None, 'repeatSubunitLength': None}, 'type': 'Allele'}, 'extensions': [{'name': 'cytogenetic location', 'value': '17q21.31'}, {'name': 'clinvar variation type', 'value': 'single nucleotide variant'}, {'name': 'clinvar subclass type', 'value': 'SimpleAllele'}], 'id': 'clinvar:209226', 'label': 'NM_007294.3(BRCA1):c.*3785T>A', 'members': [{'digest': 'iTe7Ew7If4vdPnhfcT2qXQQY-tNu7tcF', 'expressions': [{'syntax': 'spdi', 'value': 'NC_000017.11:43041892:A:T'}, {'syntax': 'hgvs.g', 'value': 'NC_000017.11:g.43041893A>T'}, {'syntax': 'gnomad', 'value': '17-43041893-A-T'}], 'extensions': [{'name': 'clinvar vcf', 'value': '\"17-43041893-A-T\"'}, {'name': 'clinvar hgvs type', 'value': '\"genomic, top-level\"'}], 'id': 'ga4gh:VA.iTe7Ew7If4vdPnhfcT2qXQQY-tNu7tcF', 'label': 'NC_000017.11:43041892:A:T', 'location': {'digest': 'zdH_x64Ewl4jnJSjPQrb9ENIPb8GZOG-', 'end': '43041893', 'id': 'ga4gh:SL.zdH_x64Ewl4jnJSjPQrb9ENIPb8GZOG-', 'sequenceReference': {'extensions': [{'name': 'assembly', 'value': 'GRCh38'}, {'name': 'chromosome', 'value': '17'}], 'id': 'NC_000017.11', 'refgetAccession': 'SQ.dLZ15tNO1Ur0IcGjwc3Sdi_0A6Yf4zm7', 'residueAlphabet': 'na', 'type': 'SequenceReference'}, 'start': '43041892', 'type': 'SequenceLocation'}, 'state': '{\"sequence\":\"T\",\"type\":\"LiteralSequenceExpression\"}', 'type': 'Allele', 'copies': None, 'copyChange': None}, {'digest': None, 'expressions': [{'syntax': 'hgvs.g', 'value': 'NC_000017.10:g.41193910A>T'}, {'syntax': 'gnomad', 'value': '17-41193910-A-T'}], 'extensions': [{'name': 'clinvar vcf', 'value': '\"17-41193910-A-T\"'}, {'name': 'clinvar hgvs type', 'value': '\"genomic, top-level\"'}], 'id': 'clinvar:209226-NC_000017.10', 'label': 'NC_000017.10:g.41193910A>T', 'location': {'digest': None, 'end': '41193910', 'id': None, 
'sequenceReference': {'extensions': [{'name': 'assembly', 'value': 'GRCh37'}, {'name': 'chromosome', 'value': '17'}], 'id': 'NC_000017.10', 'refgetAccession': None, 'residueAlphabet': 'na', 'type': 'SequenceReference'}, 'start': '41193910', 'type': 'SequenceLocation'}, 'state': None, 'type': 'Allele', 'copies': None, 'copyChange': None}, {'digest': None, 'expressions': [{'syntax': 'hgvs.g', 'value': 'NG_005905.2:g.176091T>A'}], 'extensions': [{'name': 'clinvar hgvs type', 'value': '\"genomic\"'}], 'id': 'clinvar:209226-NG_005905.2', 'label': 'NG_005905.2:g.176091T>A', 'location': {'digest': None, 'end': None, 'id': None, 'sequenceReference': {'extensions': None, 'id': 'NG_005905.2', 'refgetAccession': None, 'residueAlphabet': 'na', 'type': 'SequenceReference'}, 'start': None, 'type': 'SequenceLocation'}, 'state': None, 'type': 'Allele', 'copies': None, 'copyChange': None}, {'digest': None, 'expressions': [{'syntax': 'hgvs.g', 'value': 'LRG_292:g.176091T>A'}], 'extensions': [{'name': 'clinvar hgvs type', 'value': '\"genomic\"'}, {'name': 'pre-processing vrs issue', 'value': '\"sequence for accession not supported by vrs-python release\"'}], 'id': 'clinvar:209226-LRG_292', 'label': 'LRG_292:g.176091T>A', 'location': {'digest': None, 'end': None, 'id': None, 'sequenceReference': {'extensions': None, 'id': 'LRG_292', 'refgetAccession': None, 'residueAlphabet': 'na', 'type': 'SequenceReference'}, 'start': None, 'type': 'SequenceLocation'}, 'state': None, 'type': 'Allele', 'copies': None, 'copyChange': None}, {'digest': None, 'expressions': [{'syntax': 'hgvs.c', 'value': 'LRG_292t1:c.*3785T>A'}], 'extensions': [{'name': 'pre-processing vrs issue', 'value': '\"sequence for accession not supported by vrs-python release\"'}, {'name': 'clinvar hgvs type', 'value': '\"coding\"'}], 'id': 'clinvar:209226-LRG_292t1', 'label': 'LRG_292t1:c.*3785T>A', 'location': {'digest': None, 'end': None, 'id': None, 'sequenceReference': {'extensions': None, 'id': 'LRG_292t1', 
'refgetAccession': None, 'residueAlphabet': 'na', 'type': 'SequenceReference'}, 'start': None, 'type': 'SequenceLocation'}, 'state': None, 'type': 'Allele', 'copies': None, 'copyChange': None}], 'type': 'CanonicalAllele', 'mappings': None, 'copies': None, 'location': None, 'copyChange': None}, 'description': 'Class 1 not pathogenic based on frequency >1% in an outbred sampleset. Frequency 0.33 (Asian), 0.87 (African), 0.37 (European), derived from 1000 genomes (2012-04-30).', 'isReportedIn': None, 'qualifiers': None})\n" + "SCV: SCV000810762.1 \n", + " Classification: Pathogenic\n", + " Condition: not provided\n", + "\n" ] } ], "source": [ - "import gzip\n", - "import duckdb\n", - "\n", - "scv_file = \"combined-scv_output.ndjson.gz\"\n", - "vrs_id = \"ga4gh:VA.iTe7Ew7If4vdPnhfcT2qXQQY-tNu7tcF\"\n", - "# d = duckdb.connect()\n", - "query = f\"\"\"\n", - "SELECT id, member, a\n", - "FROM\n", - " (SELECT id, UNNEST(variation.members) AS member, s.*\n", - " FROM read_json('combined-scv_output.ndjson.gz', format='newline_delimited', ignore_errors=true) s\n", - " ) a\n", - "WHERE member.id = '{vrs_id}'\n", - "\n", - "LIMIT 10\n", - "\"\"\"\n", - "# AND a.variation.type = 'CategoricalCnv'\n", - "# WHERE (variation.type = 'CategoricalCnv')\n", - "# LIMIT 10\n", - "# AND member.id = '{vrs_id}'\n", - "\n", - "results = duckdb.query(query)\n", - "\n", - "scv_ids = []\n", - "while batch := results.fetchmany(100):\n", - " for row in batch:\n", - " print(row)\n", - " scv_ids.append(row[0])\n", - "\n", - "scv_ids = [row[0] for row in batch]\n", + "scvs_by_catvar_id_canonicalallele = query_scvs_by_catvar_id(\n", + " catvar_id_canonicalallele, scv_file\n", + ")\n", "\n", - "# with gzip.open(scv_file) as f:\n", - "# for line in f:\n", - "# pass" + "print_scvs(scvs_by_catvar_id_canonicalallele)" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + 
"SCV: SCV000864190.1 \n", + " Classification: Likely pathogenic\n", + " Condition: Squalene synthase deficiency\n", + "\n" + ] + } + ], "source": [ - "# import gzip\n", - "# import json\n", - "# import psycopg2\n", - "# from psycopg2.extensions import ISOLATION_LEVEL_AUTOCOMMIT\n", - "\n", - "# scv_file = \"combined-scv_output.ndjson.gz\"\n", - "\n", - "# # docker volume create clinvar_data\n", - "# # docker run --name mypostgres -e POSTGRES_PASSWORD=mysecretpassword -d -p 5432:5432 -v clinvar_data:/var/lib/postgresql/data postgres\n", - "\n", - "# # Connect to the default database\n", - "# conn = psycopg2.connect(\n", - "# host=\"localhost\", dbname=\"postgres\", user=\"postgres\", password=\"mysecretpassword\"\n", - "# )\n", - "# conn.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT)\n", - "\n", - "# # Create a new database\n", - "# cur = conn.cursor()\n", - "# # cur.execute(\"DROP DATABASE IF EXISTS clinvar\")\n", - "# db_name = \"clinvar\"\n", - "# cur.execute(\"SELECT 1 FROM pg_catalog.pg_database WHERE datname = %s\", (db_name,))\n", - "# db_exists = cur.fetchone()\n", - "# if not db_exists:\n", - "# cur.execute(\"CREATE DATABASE IF NOT EXISTS clinvar\")\n", - "# cur.close()\n", - "# conn.close()\n", - "\n", - "# # with psycopg2.connect(\n", - "# # host=\"localhost\", dbname=\"postgres\", user=\"postgres\", password=\"mysecretpassword\"\n", - "# # ) as conn:\n", - "# # conn.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT)\n", - "# # cur = conn.cursor()\n", - "# # cur.execute(\"CREATE DATABASE clinvar\")\n", - "\n", - "\n", - "# with psycopg2.connect(\n", - "# host=\"localhost\", dbname=\"clinvar\", user=\"postgres\", password=\"mysecretpassword\"\n", - "# ) as conn:\n", - "# cur = conn.cursor()\n", - "# cur.execute(\"DROP TABLE IF EXISTS scv\")\n", - "# cur.execute(\n", - "# \"\"\"\n", - "# CREATE TABLE IF NOT EXISTS scv (\n", - "# id TEXT PRIMARY KEY,\n", - "# value JSONB\n", - "# )\n", - "# \"\"\"\n", - "# )\n", - "# cur.execute(\n", - "# # Create a GIN index 
on value.definingContext.id\n", - "# \"\"\"\n", - "# CREATE INDEX IF NOT EXISTS scv_definingContext_id_index ON scv ((value->'definingContext'->>'id'))\n", - "# \"\"\"\n", - "# )\n", - "\n", - "# import time\n", - "\n", - "# start = time.time()\n", - "# last_printed = start\n", - "\n", - "\n", - "# def file_batch_lines(fileio, batch_size=10000):\n", - "# \"\"\"\n", - "# Generator function to read a file and yield batches of lines.\n", - "\n", - "# :param file_path: Path to the file to be read.\n", - "# :param batch_size: Number of lines per batch.\n", - "# \"\"\"\n", - "# batch = []\n", - "# for line in fileio:\n", - "# batch.append(line)\n", - "# if len(batch) == batch_size:\n", - "# yield batch\n", - "# batch = []\n", - "# if batch:\n", - "# yield batch\n", - "\n", - "\n", - "# with psycopg2.connect(\n", - "# host=\"localhost\", dbname=\"clinvar\", user=\"postgres\", password=\"mysecretpassword\"\n", - "# ) as conn:\n", - "# cur = conn.cursor()\n", - "# with gzip.open(scv_file, \"rt\") as f:\n", - "# idx = 0\n", - "# for line_batch in file_batch_lines(f, batch_size=2000):\n", - "# # mega SQL string strategy\n", - "# records = [json.loads(line) for line in line_batch]\n", - "# query_base = \"INSERT INTO scv (id, value) VALUES \"\n", - "# values_templ = \",\".join([\"(%s, %s)\"] * len(records))\n", - "# values = [(r[\"id\"], line) for r, line in zip(records, line_batch)]\n", - "# # flatten values by 1 level\n", - "# values = [v for sublist in values for v in sublist]\n", - "# query = query_base + values_templ\n", - "# cur.execute(query, values)\n", - "\n", - "# # executemany strategy\n", - "# # cur.executemany(\n", - "# # \"INSERT INTO scv (id, value) VALUES (%s, %s)\",\n", - "# # [(r[\"id\"], line) for r, line in zip(records, line_batch)],\n", - "# # )\n", + "scvs_by_catvar_id_categoricalcnv = query_scvs_by_catvar_id(\n", + " catvar_id_categoricalcnv, scv_file\n", + ")\n", "\n", - "# # record = json.loads(line)\n", - "# # cur.execute(\n", - "# # \"INSERT INTO scv 
(id, value) VALUES (%s, %s)\",\n", - "# # (record[\"id\"], json.dumps(record)),\n", - "# # )\n", - "# idx += len(records)\n", - "# if time.time() - last_printed > 10:\n", - "# print(f\"Inserted {idx} records\")\n", - "# last_printed = time.time()\n", - "# cur.execute(\"COMMIT\")" + "print_scvs(scvs_by_catvar_id_categoricalcnv)" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { From 858b8dd5311a737156f9eea5c45d8dda8e812fa4 Mon Sep 17 00:00:00 2001 From: Kyle Ferriter Date: Mon, 15 Jul 2024 13:33:10 -0400 Subject: [PATCH 5/5] Fix canonicalallele catvar id in notebook --- notebooks/ASHG-clinvar-GKS-python.ipynb | 39 +++++++++++++++++++++---- 1 file changed, 34 insertions(+), 5 deletions(-) diff --git a/notebooks/ASHG-clinvar-GKS-python.ipynb b/notebooks/ASHG-clinvar-GKS-python.ipynb index 0496a2d..17895e0 100644 --- a/notebooks/ASHG-clinvar-GKS-python.ipynb +++ b/notebooks/ASHG-clinvar-GKS-python.ipynb @@ -4,7 +4,36 @@ "cell_type": "code", "execution_count": 1, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2024-07-15 12:28:21 - gcs - INFO - Downloading gs://clinvar-gk-pilot/2024-04-07/dev/combined-catvar_output.ndjson.gz to buckets/clinvar-gk-pilot/2024-04-07/dev/combined-catvar_output.ndjson.gz\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Downloading gs://clinvar-gk-pilot/2024-04-07/dev/combined-catvar_output.ndjson.gz to buckets/clinvar-gk-pilot/2024-04-07/dev/combined-catvar_output.ndjson.gz\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2024-07-15 12:30:27 - gcs - INFO - Downloading gs://clinvar-gk-pilot/2024-04-07/dev/combined-scv_output.ndjson.gz to buckets/clinvar-gk-pilot/2024-04-07/dev/combined-scv_output.ndjson.gz\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Downloading 
gs://clinvar-gk-pilot/2024-04-07/dev/combined-scv_output.ndjson.gz to buckets/clinvar-gk-pilot/2024-04-07/dev/combined-scv_output.ndjson.gz\n" + ] + } + ], "source": [ "from clinvar_gk_pilot.gcs import (\n", " _local_file_path_for,\n", @@ -96,7 +125,7 @@ "################################\n", "# A CanonicalAllele\n", "## For searching based on the GKS Categorical Variation (CatVrs) ID\n", - "clinvar_id_canonicalallele = \"563765\"\n", + "clinvar_id_canonicalallele = \"2769522\"\n", "catvar_id_canonicalallele = f\"clinvar:{clinvar_id_canonicalallele}\"\n", "## For searching based on the GA4GH VRS Variation ID\n", "vrs_id_canonicalallele = \"ga4gh:VA.hf_va4AnlG99NuOjtaXJzh_XvszWWOO9\"\n", @@ -254,9 +283,9 @@ "name": "stdout", "output_type": "stream", "text": [ - "SCV: SCV000810762.1 \n", - " Classification: Pathogenic\n", - " Condition: not provided\n", + "SCV: SCV004334569.1 \n", + " Classification: Likely benign\n", + " Condition: BAP1-related tumor predisposition syndrome\n", "\n" ] }