From ef6b2f7dcb60a52e929011820c056cfa28ca0d76 Mon Sep 17 00:00:00 2001 From: Yan Wong Date: Thu, 7 Nov 2024 23:39:32 +0000 Subject: [PATCH] Remove lineage imputation components Now in the sc2ts-paper repo --- notebooks/lineage-imputation.ipynb | 494 ----------------------------- sc2ts/lineages.py | 393 ----------------------- sc2ts/utils.py | 57 ---- 3 files changed, 944 deletions(-) delete mode 100644 notebooks/lineage-imputation.ipynb diff --git a/notebooks/lineage-imputation.ipynb b/notebooks/lineage-imputation.ipynb deleted file mode 100644 index 4520e51..0000000 --- a/notebooks/lineage-imputation.ipynb +++ /dev/null @@ -1,494 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "498996a6", - "metadata": {}, - "source": [ - "# Imputing lineages for reconstructed internal nodes" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "06dc368e", - "metadata": {}, - "outputs": [], - "source": [ - "import tskit\n", - "import tszip\n", - "import pandas as pd\n", - "import tqdm\n", - "\n", - "import sys\n", - "sys.path.append(\"../\")\n", - "import sc2ts.utils\n", - "import sc2ts.lineages" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "26f1cf95", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Counting descendants : 100%|███████████████████████████████████████| 783231/783231 [00:00<00:00, 3460493.66it/s]\n", - "Indexing metadata : 100%|█████████████████████████████████████████| 783231/783231 [00:08<00:00, 93027.58it/s]\n", - "Classifying mutations: 100%|██████████████████████████████████████| 1062072/1062072 [00:07<00:00, 142610.22it/s]\n", - "Counting descendants : 100%|█████████████████████████████████████| 1453347/1453347 [00:00<00:00, 3336969.57it/s]\n", - "Indexing metadata : 100%|███████████████████████████████████████| 1453347/1453347 [00:16<00:00, 87669.72it/s]\n", - "Classifying mutations: 100%|██████████████████████████████████████| 1213193/1213193 [00:08<00:00, 141461.46it/s]\n" - ] - } - ], - "source": [ - "ts_long_path = \"../../sc2ts_ts/upgma-mds-1000-md-30-mm-3-2022-06-30-recinfo\"\n", - "ts_wide_path = \"../../sc2ts_ts/upgma-full-md-30-mm-3-2021-06-30-recinfo\"\n", - "ts_long = tszip.decompress(ts_long_path + \"-il.ts.tsz\")\n", - "ts_wide = tszip.decompress(ts_wide_path + \"-il.ts.tsz\")\n", - "ti_long = sc2ts.utils.TreeInfo(ts_long)\n", - "ti_wide = sc2ts.utils.TreeInfo(ts_wide)\n", - "mutations_json_filepath = \"../../sc2ts_ts/consensus_mutations.json\"\n", - "gisaid_metadata_filepath = \"../../sc2ts_ts/metadata_tsv_2023_03_09/metadata.tsv\"" - ] - }, - { - "cell_type": "markdown", - "id": "4367b112", - "metadata": {}, - "source": [ - "# GISAID vs Nextclade lineage comparison" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "77546779", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/var/folders/6m/05k8jk1s03q36gn2syqp87m80000gs/T/ipykernel_2688/3177304272.py:1: DtypeWarning: Columns (18) have mixed types. Specify dtype option on import or set low_memory=False.\n", - " md = pd.read_table(gisaid_metadata_filepath)\n" - ] - } - ], - "source": [ - "md = pd.read_table(gisaid_metadata_filepath)" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "c1c93bfc", - "metadata": {}, - "outputs": [], - "source": [ - "gisaid_data = [(x,y) for x, y in zip(md['Accession ID'], md['Pango lineage'])]" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "d3004341", - "metadata": {}, - "outputs": [], - "source": [ - "linmuts_dict = sc2ts.lineages.read_in_mutations(mutations_json_filepath)" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "cb061202", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|███████████████████████████████████████████████████████████| 15115274/15115274 [00:15<00:00, 982823.61it/s]\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "ts number of samples: 657239\n", - "number matched to gisaid data: 657168\n", - "number of differences: 46311\n", - "proportion: 0.0704705646044847\n", - "Filling in missing GISAID lineages with Nextclade lineages: 185\n" - ] - } - ], - "source": [ - "ts_long_gisaid = sc2ts.utils.check_lineages(\n", - " ts_long,\n", - " ti_long,\n", - " gisaid_data,\n", - " linmuts_dict,\n", - " diff_filehandle='../../sc2ts_ts/lineage_disagreement_long',\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "1f9f99dc", - "metadata": { - "scrolled": true - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|███████████████████████████████████████████████████████████| 15115274/15115274 [00:21<00:00, 715844.46it/s]\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "ts number of samples: 1265685\n", - "number matched to gisaid data: 1265683\n", - "number of differences: 65677\n", - "proportion: 0.05189056027457112\n", - "Filling in missing GISAID lineages with Nextclade lineages: 0\n" - ] - } - ], - "source": [ - "ts_wide_gisaid = sc2ts.utils.check_lineages(\n", - " ts_wide,\n", - " ti_wide,\n", - " gisaid_data,\n", - " linmuts_dict,\n", - " diff_filehandle='../../sc2ts_ts/lineage_disagreement_wide',\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "bd69bb4f", - "metadata": {}, - "source": [ - "# ts lineage imputation" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "3930d1da", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Recording relevant mutations for each node...\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "cee29a41d6a04870a7e1c5c35e153b6a", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - " 0%| | 0/1062072 [00:00 0: - X = X.iloc[0:ind] - X_index = X_index[0:ind] - y = clf_tree.predict(ohe_encoder.transform(X)) - inferred_lineages.add_imputed_values(X_index, y) - pbar.update(inferred_lineages.change) - - -def add_lineages_to_ts(il, ts): - """ - Adds imputed lineages to ts metadata. - """ - imputed_lineages = il.get_results() - tables = ts.tables - new_metadata = [] - for node in ts.nodes(): - md = node.metadata - if "Imputed_lineage" in md: - md.pop("Imputed_lineage") - md["Imputed_" + il.true_lineage] = imputed_lineages[node.id] - new_metadata.append(md) - validated_metadata = [ - tables.nodes.metadata_schema.validate_and_encode_row(row) - for row in new_metadata - ] - tables.nodes.packset_metadata(validated_metadata) - edited_ts = tables.tree_sequence() - return edited_ts diff --git a/sc2ts/utils.py b/sc2ts/utils.py index 353da7a..1eee1c4 100644 --- a/sc2ts/utils.py +++ b/sc2ts/utils.py @@ -14,7 +14,6 @@ import numpy as np import pandas as pd -import sklearn import tqdm import matplotlib.pyplot as plt from matplotlib import colors @@ -754,62 +753,6 @@ def sample_subgraph(sample_node, ts, ti=None, **kwargs): return plot_subgraph(nodes, ts, ti, **kwargs) -def imputation_setup(filepath, verbose=False): - """ - Reads in JSON of lineage-defining mutations and constructs decision tree classifier - JSON can be downloaded from covidcg.org -> 'Compare AA mutations' -> Download -> 'Consensus mutations' - (setting mutation type to 'NT' and consensus threshold to 0.9) - """ - linmuts_dict = lineages.read_in_mutations(filepath) - df, df_ohe, ohe = lineages.read_in_mutations_json(filepath) - - # Get decision tree - y = df_ohe.index # lineage labels - clf = sklearn.tree.DecisionTreeClassifier() - clf = clf.fit(df_ohe, y) - - if verbose: - # Check tree works and that lineages-defining mutations are unique for each lineage - y_pred = clf.predict(df_ohe) - correct = incorrect = lineage_definition_issue = 0 - for yy, yy_pred in zip(y, y_pred): - if yy == yy_pred: - correct += 1 - else: - incorrect += 1 - if linmuts_dict.get_mutations(yy) == linmuts_dict.get_mutations( - yy_pred - ): - lineage_definition_issue += 1 - print(yy_pred, "same mutations as", yy) - print( - "Correct:", - correct, - "incorrect:", - incorrect, - "of which due to lineage definition ambiguity:", - lineage_definition_issue, - ) - - return linmuts_dict, df, df_ohe, ohe, clf - - -def lineage_imputation(filepath, ts, ti, internal_only=False, verbose=False): - """ - Runs lineage imputation on input ts - """ - linmuts_dict, df, df_ohe, ohe, clf = imputation_setup(filepath, verbose) - print("Recording relevant mutations for each node...") - node_to_mut_dict = lineages.get_node_to_mut_dict(ts, ti, linmuts_dict) - edited_ts = lineages.impute_lineages( - ts, ti, node_to_mut_dict, df, ohe, clf, "Nextclade_pango", internal_only - ) - edited_ts = lineages.impute_lineages( - edited_ts, ti, node_to_mut_dict, df, ohe, clf, "GISAID_lineage", internal_only - ) - return edited_ts - - def add_gisaid_lineages_to_ts(ts, node_gisaid_lineages, linmuts_dict): """ Adds lineages from GISAID to ts metadata (as 'GISAID_lineage').