Merge pull request #18 from nrminor/wf_wrapper
Abandoning Flyte and using Nextflow as orchestrator
nrminor authored Feb 29, 2024
2 parents 8c49cd6 + 96749c1 commit 95abf28
Showing 8 changed files with 458 additions and 56 deletions.
7 changes: 6 additions & 1 deletion .gitignore
@@ -13,7 +13,6 @@ __pycache__/
.Python
.venv/
env/
bin/
build/
develop-eggs/
dist/
@@ -96,6 +95,12 @@ scratch/
# llm files
.llm*

# nextflow files
.nextflow/
.nextflow*
work/
results/
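# (work/ is Nextflow's task working directory; results/ is assumed to be
# this pipeline's published output directory)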

scratch*
sandbox*
data/
91 changes: 91 additions & 0 deletions bin/check_duplicate.py
@@ -0,0 +1,91 @@
#!/usr/bin/env python3

import sys
import json


def deduplicate_in_clusters(gene: str) -> str:
    """
    Identify variants whose condition names are groupable: for each
    variant, count how many of its condition texts fall inside a
    disease-name cluster, and collect every record of any variant with
    two or more such texts. Write the qualifying records to
    <gene>_qualifying_records.json and return that path.
    """

    # Load the extracted variant records, e.g. LDLR_variants_extracted.json
    with open(gene + "_variants_extracted.json", "r", encoding="utf8") as file:
        variant_records = json.load(file)

    # Load the disease-name clusters, e.g. LDLR_clusters.json
    with open(gene + "_clusters.json", "r", encoding="utf8") as file:
        cluster_data = json.load(file)

    # Step 2: Organize the cluster data for efficient searching.
    # Exclude clusters with id "-1" and map each member's lowercased
    # content to its cluster id.

    content_to_cluster_id = {}
    for cluster in cluster_data:
        if cluster["id"] != "-1":
            for item in cluster["items"]:
                content_to_cluster_id[item["content"].lower()] = cluster["id"]

    # Step 3: Process the variant records to find those meeting the criteria.
    # For each variant_id, check whether 2 or more of its condition texts
    # appear among the clusters' content fields.

    # Organize the records by variant_id and collect their condition texts
    variant_id_to_texts = {}
    for record in variant_records:
        variant_id = record["variant_id"]
        text = record["condition"]["text"].lower()
        variant_id_to_texts.setdefault(variant_id, []).append(text)

    # Find qualifying records
    qualifying_records = []
    total_reduced_records = 0
    # Count of unique variant_ids with similar (groupable) disease names
    counter_unique_variant_id_duplicate = 0
    for variant_id, texts in variant_id_to_texts.items():
        # Count this variant's texts that appear in any cluster's content
        found_texts_count = sum(1 for text in texts if text in content_to_cluster_id)

        # If 2 or more texts were found, keep all records with this variant_id
        if found_texts_count >= 2:
            qualifying_records.extend(
                record
                for record in variant_records
                if record["variant_id"] == variant_id
            )
            total_reduced_records += found_texts_count - 1
            counter_unique_variant_id_duplicate += 1

    print(
        "\nNumber of RCV records that can potentially be reduced: ",
        total_reduced_records,
    )
    print("Total number of RCV records: ", len(variant_records))

    print(
        "\nNumber of variants with groupable disease names: ",
        counter_unique_variant_id_duplicate,
    )
    # i.e., the number of unique variant_ids
    print("Number of unique variants: ", len(variant_id_to_texts))
    print()

# Step 4: Save the qualifying records into a new JSON file

output_file_path = gene + "_qualifying_records.json"
with open(output_file_path, "w", encoding="utf8") as outfile:
json.dump(qualifying_records, outfile, indent=4)

return output_file_path


if __name__ == "__main__":
GENE = sys.argv[1]
deduplicate_in_clusters(GENE)
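
For reference, a minimal sketch of the input shapes this script assumes; the field names come from the code above, while the concrete values are purely hypothetical:

# <gene>_variants_extracted.json: one record per RCV entry
variant_records = [
    {"variant_id": "12345", "condition": {"text": "Familial hypercholesterolemia"}},
    {"variant_id": "12345", "condition": {"text": "Hypercholesterolemia, familial"}},
]

# <gene>_clusters.json: clusters of similar disease names; id "-1" is excluded
cluster_data = [
    {
        "id": "0",
        "items": [
            {"content": "Familial hypercholesterolemia"},
            {"content": "Hypercholesterolemia, familial"},
        ],
    },
]

Since both condition texts for variant "12345" fall in cluster "0", both of its records would be written to <gene>_qualifying_records.json.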
63 changes: 56 additions & 7 deletions ...-landrum-song/pull_and_extract_clinvar.py → bin/pull_and_extract_clinvar.py
100644 → 100755
@@ -25,26 +25,34 @@
import urllib
import xml.etree.ElementTree as ET
from pathlib import Path
from typing import Dict, List, Tuple
from typing import Dict, List, Optional, Tuple
from xml.etree.ElementTree import ElementTree

from Bio import Entrez
from rich import print as rprint


def parse_command_line_args() -> Tuple[Path, str]:
def parse_command_line_args() -> Tuple[Path, Optional[str], str]:
"""
    Parse command line arguments, returning a file that lists genes of
    interest (or a single gene name) and an email for NCBI tracking
    purposes.
"""
parser = argparse.ArgumentParser()
parser.add_argument(
"--genelist",
"-g",
"-G",
type=Path,
required=True,
required=False,
help="Text file with one, headerless column of one gene name per line.",
)
parser.add_argument(
"--gene",
"-g",
type=str,
required=False,
default=None,
help="Single gene to query ClinVar for.",
)
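    # Note: neither --genelist nor --gene is required on its own; main()
    # uses --gene when it is provided and otherwise reads the --genelist file.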
parser.add_argument(
"--email",
"-e",
@@ -54,7 +62,7 @@ def parse_command_line_args() -> Tuple[Path, str]:
)
args = parser.parse_args()

return args.genelist, args.email
return args.genelist, args.gene, args.email


async def pull_clinvar_data(
@@ -188,6 +196,39 @@ async def save_variant_json(gene: str, data) -> str:
return json_filename


async def save_unique_conditions(gene: str, data) -> str:
"""
Save the unique set of conditions for the given gene as a JSON file
to be used downstream.
"""

# Extract unique disease names
unique_disease_names = {item["condition"]["text"] for item in data}

# Save unique disease names to JSON
unique_disease_json_filename = gene + "_unique_diseases.json"

with open(unique_disease_json_filename, "w", encoding="utf-8") as json_file:
# Convert the set to a list for JSON serialization
json.dump(list(unique_disease_names), json_file, ensure_ascii=False, indent=4)
print(f"Saved unique disease names to {unique_disease_json_filename}")

    # Assign an ID to each unique condition and format the data
    # (note: set iteration order is arbitrary, so cid values are not
    # guaranteed to be stable across runs)
formatted_conditions = [
{"cid": idx + 1, "condition": condition_text}
for idx, condition_text in enumerate(unique_disease_names)
]

    # Save the formatted unique condition texts to a JSON file
json_filename = gene + "_formatted_unique_conditions.json"
with open(json_filename, "w", encoding="utf-8") as json_file:
json.dump(formatted_conditions, json_file, ensure_ascii=False, indent=4)

print(f"Saved formatted unique condition texts to {json_filename}")

return json_filename
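
# Illustrative only: for a hypothetical gene "LDLR", the function above would
# write LDLR_formatted_unique_conditions.json with entries shaped like
#     [{"cid": 1, "condition": "Familial hypercholesterolemia"}, ...]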


async def main() -> None:
"""
Coordinate the flow of data through the above functions within an
@@ -196,7 +237,14 @@ async def main() -> None:
"""

# Example usage parameters
gene_file, email = parse_command_line_args()
gene_file, gene, email = parse_command_line_args()

if gene is not None:
extracted_data = await pull_and_extract_data(gene, email)
# _ = await save_variant_xml(gene, extracted_data)
_ = await save_variant_json(gene, extracted_data)
_ = await save_unique_conditions(gene, extracted_data)
return

# collect the list of genes
with open(gene_file, "r", encoding="utf8") as input_handle:
@@ -205,8 +253,9 @@
# process each gene asynchronously
for gene in genes:
extracted_data = await pull_and_extract_data(gene, email)
_ = await save_variant_xml(gene, extracted_data)
# _ = await save_variant_xml(gene, extracted_data)
_ = await save_variant_json(gene, extracted_data)
_ = await save_unique_conditions(gene, extracted_data)
rprint(f"Data retrieval, extraction, and writing complete for {gene}")


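With the single-gene mode in place, an orchestrator such as Nextflow can invoke this script once per gene. Below is a minimal, hypothetical sketch of that per-gene call path using the functions shown above; the gene name, email address, and runner function are placeholders:

# Hypothetical per-gene run, equivalent to invoking
#     pull_and_extract_clinvar.py --gene LDLR --email you@example.com
import asyncio

async def run_single_gene() -> None:
    extracted_data = await pull_and_extract_data("LDLR", "you@example.com")
    _ = await save_variant_json("LDLR", extracted_data)
    _ = await save_unique_conditions("LDLR", extracted_data)

asyncio.run(run_single_gene())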
