Merge pull request #18 from nrminor/wf_wrapper
Abandoning Flyte and using Nextflow as orchestrator
nrminor authored Feb 29, 2024
2 parents 8c49cd6 + 96749c1 commit 95abf28
Showing 8 changed files with 458 additions and 56 deletions.
7 changes: 6 additions & 1 deletion .gitignore
@@ -13,7 +13,6 @@ __pycache__/
.Python
.venv/
env/
bin/
build/
develop-eggs/
dist/
@@ -96,6 +95,12 @@ scratch/
# llm files
.llm*

# nextflow files
.nextflow/
.nextflow*
work/
results/
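# (work/ is Nextflow's task working directory; results/ is assumed to be
# this pipeline's published output directory)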

scratch*
sandbox*
data/
91 changes: 91 additions & 0 deletions bin/check_duplicate.py
@@ -0,0 +1,91 @@
#!/usr/bin/env python3

import sys
import json


def deduplicate_in_clusters(gene: str) -> str:
    """
    Identify variants whose condition names are groupable: for each
    variant, count how many of its condition texts fall inside a
    disease-name cluster, and collect every record of any variant with
    two or more such texts. Write the qualifying records to
    <gene>_qualifying_records.json and return that path.
    """

    # Load the extracted variant records, e.g. LDLR_variants_extracted.json
    with open(gene + "_variants_extracted.json", "r", encoding="utf8") as file:
        variant_records = json.load(file)

    # Load the disease-name clusters, e.g. LDLR_clusters.json
    with open(gene + "_clusters.json", "r", encoding="utf8") as file:
        cluster_data = json.load(file)

    # Step 2: Organize the cluster data for efficient searching.
    # Exclude clusters with id "-1" and map each member's lowercased
    # content to its cluster id.

    content_to_cluster_id = {}
    for cluster in cluster_data:
        if cluster["id"] != "-1":
            for item in cluster["items"]:
                content_to_cluster_id[item["content"].lower()] = cluster["id"]

    # Step 3: Process the variant records to find those meeting the criteria.
    # For each variant_id, check whether 2 or more of its condition texts
    # appear among the clusters' content fields.

    # Organize the records by variant_id and collect their condition texts
    variant_id_to_texts = {}
    for record in variant_records:
        variant_id = record["variant_id"]
        text = record["condition"]["text"].lower()
        variant_id_to_texts.setdefault(variant_id, []).append(text)

    # Find qualifying records
    qualifying_records = []
    total_reduced_records = 0
    # Count of unique variant_ids with similar (groupable) disease names
    counter_unique_variant_id_duplicate = 0
    for variant_id, texts in variant_id_to_texts.items():
        # Count this variant's texts that appear in any cluster's content
        found_texts_count = sum(1 for text in texts if text in content_to_cluster_id)

        # If 2 or more texts were found, keep all records with this variant_id
        if found_texts_count >= 2:
            qualifying_records.extend(
                record
                for record in variant_records
                if record["variant_id"] == variant_id
            )
            total_reduced_records += found_texts_count - 1
            counter_unique_variant_id_duplicate += 1

    print(
        "\nNumber of RCV records that can potentially be reduced: ",
        total_reduced_records,
    )
    print("Total number of RCV records: ", len(variant_records))

    print(
        "\nNumber of variants with groupable disease names: ",
        counter_unique_variant_id_duplicate,
    )
    # i.e., the number of unique variant_ids
    print("Number of unique variants: ", len(variant_id_to_texts))
    print()

# Step 4: Save the qualifying records into a new JSON file

output_file_path = gene + "_qualifying_records.json"
with open(output_file_path, "w", encoding="utf8") as outfile:
json.dump(qualifying_records, outfile, indent=4)

return output_file_path


if __name__ == "__main__":
GENE = sys.argv[1]
deduplicate_in_clusters(GENE)
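
For reference, a minimal sketch of the input shapes this script assumes; the field names come from the code above, while the concrete values are purely hypothetical:

# <gene>_variants_extracted.json: one record per RCV entry
variant_records = [
    {"variant_id": "12345", "condition": {"text": "Familial hypercholesterolemia"}},
    {"variant_id": "12345", "condition": {"text": "Hypercholesterolemia, familial"}},
]

# <gene>_clusters.json: clusters of similar disease names; id "-1" is excluded
cluster_data = [
    {
        "id": "0",
        "items": [
            {"content": "Familial hypercholesterolemia"},
            {"content": "Hypercholesterolemia, familial"},
        ],
    },
]

Since both condition texts for variant "12345" fall in cluster "0", both of its records would be written to <gene>_qualifying_records.json.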
63 changes: 56 additions & 7 deletions ...-landrum-song/pull_and_extract_clinvar.py → bin/pull_and_extract_clinvar.py
100644 → 100755
@@ -25,26 +25,34 @@
import urllib
import xml.etree.ElementTree as ET
from pathlib import Path
from typing import Dict, List, Tuple
from typing import Dict, List, Optional, Tuple
from xml.etree.ElementTree import ElementTree

from Bio import Entrez
from rich import print as rprint


def parse_command_line_args() -> Tuple[Path, str]:
def parse_command_line_args() -> Tuple[Path, Optional[str], str]:
"""
    Parse command line arguments, returning a file that lists genes of
    interest (or a single gene name) and an email for NCBI tracking
    purposes.
"""
parser = argparse.ArgumentParser()
parser.add_argument(
"--genelist",
"-g",
"-G",
type=Path,
required=True,
required=False,
help="Text file with one, headerless column of one gene name per line.",
)
parser.add_argument(
"--gene",
"-g",
type=str,
required=False,
default=None,
help="Single gene to query ClinVar for.",
)
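    # Note: neither --genelist nor --gene is required on its own; main()
    # uses --gene when it is provided and otherwise reads the --genelist file.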
parser.add_argument(
"--email",
"-e",
@@ -54,7 +62,7 @@ def parse_command_line_args() -> Tuple[Path, str]:
)
args = parser.parse_args()

return args.genelist, args.email
return args.genelist, args.gene, args.email


async def pull_clinvar_data(
@@ -188,6 +196,39 @@ async def save_variant_json(gene: str, data) -> str:
return json_filename


async def save_unique_conditions(gene: str, data) -> str:
"""
Save the unique set of conditions for the given gene as a JSON file
to be used downstream.
"""

# Extract unique disease names
unique_disease_names = {item["condition"]["text"] for item in data}

# Save unique disease names to JSON
unique_disease_json_filename = gene + "_unique_diseases.json"

with open(unique_disease_json_filename, "w", encoding="utf-8") as json_file:
# Convert the set to a list for JSON serialization
json.dump(list(unique_disease_names), json_file, ensure_ascii=False, indent=4)
print(f"Saved unique disease names to {unique_disease_json_filename}")

    # Assign an ID to each unique condition and format the data
    # (note: set iteration order is arbitrary, so cid values are not
    # guaranteed to be stable across runs)
formatted_conditions = [
{"cid": idx + 1, "condition": condition_text}
for idx, condition_text in enumerate(unique_disease_names)
]

    # Save the formatted unique condition texts to a JSON file
json_filename = gene + "_formatted_unique_conditions.json"
with open(json_filename, "w", encoding="utf-8") as json_file:
json.dump(formatted_conditions, json_file, ensure_ascii=False, indent=4)

print(f"Saved formatted unique condition texts to {json_filename}")

return json_filename
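
# Illustrative only: for a hypothetical gene "LDLR", the function above would
# write LDLR_formatted_unique_conditions.json with entries shaped like
#     [{"cid": 1, "condition": "Familial hypercholesterolemia"}, ...]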


async def main() -> None:
"""
Coordinate the flow of data through the above functions within an
@@ -196,7 +237,14 @@ async def main() -> None:
"""

# Example usage parameters
gene_file, email = parse_command_line_args()
gene_file, gene, email = parse_command_line_args()

if gene is not None:
extracted_data = await pull_and_extract_data(gene, email)
# _ = await save_variant_xml(gene, extracted_data)
_ = await save_variant_json(gene, extracted_data)
_ = await save_unique_conditions(gene, extracted_data)
return

# collect the list of genes
with open(gene_file, "r", encoding="utf8") as input_handle:
@@ -205,8 +253,9 @@
# process each gene asynchronously
for gene in genes:
extracted_data = await pull_and_extract_data(gene, email)
_ = await save_variant_xml(gene, extracted_data)
# _ = await save_variant_xml(gene, extracted_data)
_ = await save_variant_json(gene, extracted_data)
_ = await save_unique_conditions(gene, extracted_data)
rprint(f"Data retrieval, extraction, and writing complete for {gene}")


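With the single-gene mode in place, an orchestrator such as Nextflow can invoke this script once per gene. Below is a minimal, hypothetical sketch of that per-gene call path using the functions shown above; the gene name, email address, and runner function are placeholders:

# Hypothetical per-gene run, equivalent to invoking
#     pull_and_extract_clinvar.py --gene LDLR --email you@example.com
import asyncio

async def run_single_gene() -> None:
    extracted_data = await pull_and_extract_data("LDLR", "you@example.com")
    _ = await save_variant_json("LDLR", extracted_data)
    _ = await save_unique_conditions("LDLR", extracted_data)

asyncio.run(run_single_gene())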
