From ef6b2f7dcb60a52e929011820c056cfa28ca0d76 Mon Sep 17 00:00:00 2001
From: Yan Wong <yan.wong@bdi.ox.ac.uk>
Date: Thu, 7 Nov 2024 23:39:32 +0000
Subject: [PATCH] Remove lineage imputation components

The lineage imputation code now lives in the sc2ts-paper repo.
---
 notebooks/lineage-imputation.ipynb | 494 -----------------------------
 sc2ts/lineages.py                  | 393 -----------------------
 sc2ts/utils.py                     |  57 ----
 3 files changed, 944 deletions(-)
 delete mode 100644 notebooks/lineage-imputation.ipynb

diff --git a/notebooks/lineage-imputation.ipynb b/notebooks/lineage-imputation.ipynb
deleted file mode 100644
index 4520e51..0000000
--- a/notebooks/lineage-imputation.ipynb
+++ /dev/null
@@ -1,494 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "markdown",
-   "id": "498996a6",
-   "metadata": {},
-   "source": [
-    "# Imputing lineages for reconstructed internal nodes"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 1,
-   "id": "06dc368e",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import tskit\n",
-    "import tszip\n",
-    "import pandas as pd\n",
-    "import tqdm\n",
-    "\n",
-    "import sys\n",
-    "sys.path.append(\"../\")\n",
-    "import sc2ts.utils\n",
-    "import sc2ts.lineages"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 2,
-   "id": "26f1cf95",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "Counting descendants : 100%|███████████████████████████████████████| 783231/783231 [00:00<00:00, 3460493.66it/s]\n",
-      "Indexing metadata    : 100%|█████████████████████████████████████████| 783231/783231 [00:08<00:00, 93027.58it/s]\n",
-      "Classifying mutations: 100%|██████████████████████████████████████| 1062072/1062072 [00:07<00:00, 142610.22it/s]\n",
-      "Counting descendants : 100%|█████████████████████████████████████| 1453347/1453347 [00:00<00:00, 3336969.57it/s]\n",
-      "Indexing metadata    : 100%|███████████████████████████████████████| 1453347/1453347 [00:16<00:00, 87669.72it/s]\n",
-      "Classifying mutations: 100%|██████████████████████████████████████| 1213193/1213193 [00:08<00:00, 141461.46it/s]\n"
-     ]
-    }
-   ],
-   "source": [
-    "ts_long_path = \"../../sc2ts_ts/upgma-mds-1000-md-30-mm-3-2022-06-30-recinfo\"\n",
-    "ts_wide_path = \"../../sc2ts_ts/upgma-full-md-30-mm-3-2021-06-30-recinfo\"\n",
-    "ts_long = tszip.decompress(ts_long_path + \"-il.ts.tsz\")\n",
-    "ts_wide = tszip.decompress(ts_wide_path + \"-il.ts.tsz\")\n",
-    "ti_long = sc2ts.utils.TreeInfo(ts_long)\n",
-    "ti_wide = sc2ts.utils.TreeInfo(ts_wide)\n",
-    "mutations_json_filepath = \"../../sc2ts_ts/consensus_mutations.json\"\n",
-    "gisaid_metadata_filepath = \"../../sc2ts_ts/metadata_tsv_2023_03_09/metadata.tsv\""
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "4367b112",
-   "metadata": {},
-   "source": [
-    "# GISAID vs Nextclade lineage comparison"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 3,
-   "id": "77546779",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "/var/folders/6m/05k8jk1s03q36gn2syqp87m80000gs/T/ipykernel_2688/3177304272.py:1: DtypeWarning: Columns (18) have mixed types. Specify dtype option on import or set low_memory=False.\n",
-      "  md = pd.read_table(gisaid_metadata_filepath)\n"
-     ]
-    }
-   ],
-   "source": [
-    "md = pd.read_table(gisaid_metadata_filepath)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 4,
-   "id": "c1c93bfc",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "gisaid_data = [(x,y) for x, y in zip(md['Accession ID'], md['Pango lineage'])]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 5,
-   "id": "d3004341",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "linmuts_dict = sc2ts.lineages.read_in_mutations(mutations_json_filepath)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 6,
-   "id": "cb061202",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "100%|███████████████████████████████████████████████████████████| 15115274/15115274 [00:15<00:00, 982823.61it/s]\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "ts number of samples: 657239\n",
-      "number matched to gisaid data: 657168\n",
-      "number of differences: 46311\n",
-      "proportion: 0.0704705646044847\n",
-      "Filling in missing GISAID lineages with Nextclade lineages: 185\n"
-     ]
-    }
-   ],
-   "source": [
-    "ts_long_gisaid = sc2ts.utils.check_lineages(\n",
-    "    ts_long,\n",
-    "    ti_long,\n",
-    "    gisaid_data,\n",
-    "    linmuts_dict,\n",
-    "    diff_filehandle='../../sc2ts_ts/lineage_disagreement_long',\n",
-    ")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 7,
-   "id": "1f9f99dc",
-   "metadata": {
-    "scrolled": true
-   },
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "100%|███████████████████████████████████████████████████████████| 15115274/15115274 [00:21<00:00, 715844.46it/s]\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "ts number of samples: 1265685\n",
-      "number matched to gisaid data: 1265683\n",
-      "number of differences: 65677\n",
-      "proportion: 0.05189056027457112\n",
-      "Filling in missing GISAID lineages with Nextclade lineages: 0\n"
-     ]
-    }
-   ],
-   "source": [
-    "ts_wide_gisaid = sc2ts.utils.check_lineages(\n",
-    "    ts_wide,\n",
-    "    ti_wide,\n",
-    "    gisaid_data,\n",
-    "    linmuts_dict,\n",
-    "    diff_filehandle='../../sc2ts_ts/lineage_disagreement_wide',\n",
-    ")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "bd69bb4f",
-   "metadata": {},
-   "source": [
-    "# ts lineage imputation"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 8,
-   "id": "3930d1da",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Recording relevant mutations for each node...\n"
-     ]
-    },
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "cee29a41d6a04870a7e1c5c35e153b6a",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "  0%|          | 0/1062072 [00:00<?, ?it/s]"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Inferring lineages...\n"
-     ]
-    },
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "f323804040a74309b9fd1418b5701e94",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "  0%|          | 0/781152 [00:00<?, ?it/s]"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "------------------------------\n",
-      "Sample nodes imputed: 657239 out of possible 657239\n",
-      "Internal nodes imputed: 123914 out of possible 123914\n",
-      "Total imputed: 781153 out of possible 781153\n",
-      "Number of recombinants (not imputed): 2078\n",
-      "------------------------------\n",
-      "Correctly imputed samples: 639658 ( 97.789 % )\n",
-      "Incorrectly imputed samples: 14460 ( 2.211 % )\n",
-      "Imputed using inheritance: 518270 ( 66.347 % ) decision tree: 262883 ( 33.653 % )\n",
-      "------------------------------\n",
-      "Time: 328.4449107646942\n",
-      "Inferring lineages...\n"
-     ]
-    },
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "4b29f820270c4dcbb550d2b94bced052",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "  0%|          | 0/781152 [00:00<?, ?it/s]"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "------------------------------\n",
-      "Sample nodes imputed: 657205 out of possible 657239\n",
-      "Internal nodes imputed: 123948 out of possible 123914\n",
-      "Total imputed: 781153 out of possible 781153\n",
-      "Number of recombinants (not imputed): 2078\n",
-      "------------------------------\n",
-      "Correctly imputed samples: 634978 ( 97.084 % )\n",
-      "Incorrectly imputed samples: 19070 ( 2.916 % )\n",
-      "Imputed using inheritance: 518268 ( 66.347 % ) decision tree: 262885 ( 33.653 % )\n",
-      "------------------------------\n",
-      "Time: 355.47603726387024\n"
-     ]
-    }
-   ],
-   "source": [
-    "edited_ts_long = sc2ts.utils.lineage_imputation(\n",
-    "    mutations_json_filepath,\n",
-    "    ts_long_gisaid, \n",
-    "    ti_long,\n",
-    "    internal_only=False,\n",
-    "    verbose=False\n",
-    ")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 9,
-   "id": "23b6e9a7",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "edited_ts_long.dump(ts_long_path + \"-gisaid-il.ts\")\n",
-    "tszip.compress(edited_ts_long, ts_long_path + \"-gisaid-il.ts.tsz\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 10,
-   "id": "f3369f8a",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "0.9398057019493301\n"
-     ]
-    }
-   ],
-   "source": [
-    "correct = total = 0\n",
-    "for node in edited_ts_long.nodes():\n",
-    "    if 'GISAID_lineage' not in node.metadata and 'Imputed_GISAID_lineage' in node.metadata and 'Nextclade_pango' not in node.metadata and 'Imputed_Nextclade_pango' in node.metadata:\n",
-    "        if node.metadata['Imputed_GISAID_lineage'] == node.metadata['Imputed_Nextclade_pango']:\n",
-    "            correct += 1\n",
-    "        total += 1\n",
-    "print(correct/total)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 11,
-   "id": "3907aca2",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Recording relevant mutations for each node...\n"
-     ]
-    },
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "0a4f818c20eb491aadebb2c7fdedaad3",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "  0%|          | 0/1213193 [00:00<?, ?it/s]"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Inferring lineages...\n"
-     ]
-    },
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "3b953a4c4f0d4a3b9204898f7b0e1cfc",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "  0%|          | 0/1449223 [00:00<?, ?it/s]"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "------------------------------\n",
-      "Sample nodes imputed: 1265685 out of possible 1265685\n",
-      "Internal nodes imputed: 183539 out of possible 183539\n",
-      "Total imputed: 1449224 out of possible 1449224\n",
-      "Number of recombinants (not imputed): 4123\n",
-      "------------------------------\n",
-      "Correctly imputed samples: 1250162 ( 99.203 % )\n",
-      "Incorrectly imputed samples: 10045 ( 0.797 % )\n",
-      "Imputed using inheritance: 1160067 ( 80.047 % ) decision tree: 289157 ( 19.953 % )\n",
-      "------------------------------\n",
-      "Time: 545.9626221656799\n",
-      "Inferring lineages...\n"
-     ]
-    },
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "970bd399175146098828e2ebeeeda576",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "  0%|          | 0/1449223 [00:00<?, ?it/s]"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "------------------------------\n",
-      "Sample nodes imputed: 1265685 out of possible 1265685\n",
-      "Internal nodes imputed: 183539 out of possible 183539\n",
-      "Total imputed: 1449224 out of possible 1449224\n",
-      "Number of recombinants (not imputed): 4123\n",
-      "------------------------------\n",
-      "Correctly imputed samples: 1244789 ( 98.777 % )\n",
-      "Incorrectly imputed samples: 15416 ( 1.223 % )\n",
-      "Imputed using inheritance: 1160067 ( 80.047 % ) decision tree: 289157 ( 19.953 % )\n",
-      "------------------------------\n",
-      "Time: 561.4245040416718\n"
-     ]
-    }
-   ],
-   "source": [
-    "edited_ts_wide = sc2ts.utils.lineage_imputation(\n",
-    "    mutations_json_filepath,\n",
-    "    ts_wide_gisaid, \n",
-    "    ti_wide,\n",
-    "    internal_only=False,\n",
-    "    verbose=False\n",
-    ")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 12,
-   "id": "ac0fe662",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "edited_ts_wide.dump(ts_wide_path + \"-gisaid-il.ts\")\n",
-    "tszip.compress(edited_ts_wide, ts_wide_path + \"-gisaid-il.ts.tsz\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 13,
-   "id": "d1ab97f9",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "0.94598267097228\n"
-     ]
-    }
-   ],
-   "source": [
-    "correct = total = 0\n",
-    "for node in edited_ts_wide.nodes():\n",
-    "    if 'GISAID_lineage' not in node.metadata and 'Imputed_GISAID_lineage' in node.metadata and 'Nextclade_pango' not in node.metadata and 'Imputed_Nextclade_pango' in node.metadata:\n",
-    "        if node.metadata['Imputed_GISAID_lineage'] == node.metadata['Imputed_Nextclade_pango']:\n",
-    "            correct += 1\n",
-    "        total += 1\n",
-    "print(correct/total)"
-   ]
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3 (ipykernel)",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.10.5"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}
diff --git a/sc2ts/lineages.py b/sc2ts/lineages.py
index bd27bd0..e56cd59 100644
--- a/sc2ts/lineages.py
+++ b/sc2ts/lineages.py
@@ -1,9 +1,6 @@
 import json
-import numpy as np
 from collections import defaultdict
-from tqdm.notebook import tqdm
 import pandas as pd
-import time
 
 
 class MutationContainer:
@@ -117,393 +114,3 @@ def read_in_mutations_json(json_filepath):
     return df, df_ohe, ohe
 
 
-def get_node_to_mut_dict(ts, ti, linmuts_dict):
-    """
-    Create dictionary of {node : [(pos, alt) of all mutations just above this node]}
-    """
-    node_to_mut_dict = MutationContainer()
-    with tqdm(total=ts.num_mutations) as pbar:
-        for m in ts.mutations():
-            pos = int(ts.site(m.site).position)
-            if pos in linmuts_dict.all_positions:
-                alt = ti.mutations_inherited_state[m.id]
-                node_to_mut_dict.add_item(m.node, pos, alt)
-            pbar.update(1)
-    return node_to_mut_dict
-
-
-class InferLineage:
-    def __init__(self, num_nodes, true_lineage):
-        self.lineages_true = [None] * num_nodes
-        self.lineages_pred = [None] * num_nodes
-        self.num_nodes = num_nodes
-        self.lineages_type = [
-            0
-        ] * num_nodes  # 0 if can't infer, 1 if inherited, 2 if imputed
-        self.num_sample_imputed = 0
-        self.num_intern_imputed = (
-            1  # This is the root node which I'm taking to be lineage B
-        )
-        self.lineages_pred[0] = "B"
-        self.lineages_type[0] = 1
-        self.change = 1
-        self.current_node = None
-        self.linfound = False
-        self.true_lineage = true_lineage
-        self.recombinants = None
-
-    def reset(self):
-        self.change = 0
-
-    def total_inferred(self, ti):
-        return self.num_sample_imputed + self.num_intern_imputed
-
-    def set_node(self, node):
-        self.current_node = node
-        self.linfound = False
-
-    def add_imputed_values(self, X_index, y):
-        for ind, pred in zip(X_index, y):
-            self.lineages_pred[ind] = pred
-            self.lineages_type[ind] = 2
-            if self.lineages_true[ind] is not None:
-                self.num_sample_imputed += 1
-            else:
-                self.num_intern_imputed += 1
-            self.change += 1
-
-    def record_recombinants(self, ts, ti):
-        for r in ti.recombinants:
-            r_node = ts.node(r)
-            if self.true_lineage not in r_node.metadata:
-                # Just recording that this is a recombinant lineage for which we don't have a Pango name
-                self.lineages_pred[r] = "Unknown"
-        self.recombinants = ti.recombinants
-
-    def record_true_lineage(self, node):
-        if self.true_lineage in node.metadata and self.lineages_true[node.id] is None:
-            self.lineages_true[node.id] = node.metadata[self.true_lineage]
-
-    def inherit_from_node(self, node, is_child=False):
-        if self.true_lineage in node.metadata:
-            self.lineages_pred[self.current_node.id] = node.metadata[self.true_lineage]
-            self.lineages_type[self.current_node.id] = 1
-            self.linfound = True
-        elif is_child and not (self.lineages_pred[node.id] in [None, "Unknown"]):
-            self.lineages_pred[self.current_node.id] = self.lineages_pred[node.id]
-            self.lineages_type[self.current_node.id] = 1
-            self.linfound = True
-        elif not is_child and self.lineages_pred[node.id] is not None:
-            self.lineages_pred[self.current_node.id] = self.lineages_pred[node.id]
-            self.lineages_type[self.current_node.id] = 1
-            self.linfound = True
-
-    def inherit_from_children(self, ts, t, mut_dict):
-        if not self.linfound:
-            for child_node_ind in t.children(self.current_node.id):
-                if child_node_ind not in mut_dict.names:
-                    child_node = ts.node(child_node_ind)
-                    self.inherit_from_node(child_node, is_child=True)
-                    if self.linfound:
-                        break
-
-    def inherit_from_parent(self, ts, t, mut_dict):
-        if not self.linfound:
-            if self.current_node.id not in mut_dict.names:
-                parent_node_ind = t.parent(self.current_node.id)
-                if parent_node_ind != -1:
-                    self.inherit_from_node(ts.node(parent_node_ind), is_child=False)
-
-    def update(self):
-        if self.linfound:
-            if self.current_node.is_sample():
-                self.num_sample_imputed += 1
-            else:
-                self.num_intern_imputed += 1
-            self.change += 1
-
-    def check_node(self, node, ti):
-        self.set_node(node)
-        if (
-            self.current_node.id not in ti.recombinants
-            and self.lineages_pred[self.current_node.id] is None
-        ):
-            return True
-        else:
-            return False
-
-    def print_info(self, ts, ti, internal_only, target):
-        print("-" * 30)
-        if internal_only:
-            target_samples = 0
-        else:
-            target_samples = ts.num_samples
-        print(
-            "Sample nodes imputed:",
-            self.num_sample_imputed,
-            "out of possible",
-            target_samples,
-        )
-        print(
-            "Internal nodes imputed:",
-            self.num_intern_imputed,
-            "out of possible",
-            target - target_samples,
-        )
-        print(
-            "Total imputed:",
-            self.num_sample_imputed + self.num_intern_imputed,
-            "out of possible",
-            target,
-        )
-        print("Number of recombinants (not imputed):", len(ti.recombinants))
-
-        print("-" * 30)
-        correct = incorrect = 0
-        type1 = type2 = 0
-        for lt, lp, ltype in zip(
-            self.lineages_true, self.lineages_pred, self.lineages_type
-        ):
-            if ltype == 1:
-                type1 += 1
-            elif ltype == 2:
-                type2 += 1
-            if not internal_only:
-                if lt is not None and lp != "Unknown":
-                    if lt == lp:
-                        correct += 1
-                    else:
-                        incorrect += 1
-        if not internal_only:
-            print(
-                "Correctly imputed samples:",
-                correct,
-                "(",
-                round(100 * correct / (correct + incorrect), 3),
-                "% )",
-            )
-            print(
-                "Incorrectly imputed samples:",
-                incorrect,
-                "(",
-                round(100 * incorrect / (correct + incorrect), 3),
-                "% )",
-            )
-        print(
-            "Imputed using inheritance:",
-            type1,
-            "(",
-            round(100 * type1 / (self.total_inferred(ti)), 3),
-            "% )",
-            "decision tree:",
-            type2,
-            "(",
-            round(100 * type2 / (self.total_inferred(ti)), 3),
-            "% )",
-        )
-        print("-" * 30)
-
-    def get_results(self):
-        all_lineages = [None] * self.num_nodes
-        for i, (lt, lp) in enumerate(zip(self.lineages_true, self.lineages_pred)):
-            if lt is not None:
-                all_lineages[i] = lt
-            elif i in self.recombinants:
-                all_lineages[i] = "Unknown (R)"
-            else:
-                all_lineages[i] = lp
-        return all_lineages
-
-
-def impute_lineages(
-    ts,
-    ti,
-    node_to_mut_dict,
-    df,
-    ohe_encoder,
-    clf_tree,
-    true_lineage="Nextclade_pango",
-    internal_only=False,
-):
-    """
-    Impute lineages for reconstructed internal nodes (if internal_only == True), or for
-    all nodes including the samples (if internal_only == False, can then calculate accuracy)
-    """
-
-    tic = time.time()
-
-    inferred_lineages = InferLineage(ts.num_nodes, true_lineage)
-    t = ts.first()
-
-    # Assigning "Unknown" as the lineage for recombinant nodes that don't have a Pango designation
-    inferred_lineages.record_recombinants(ts, ti)
-
-    for n in ts.nodes():
-        inferred_lineages.record_true_lineage(n)
-
-    if internal_only:
-        target = len(
-            [n for n in ts.nodes() if n.id not in ti.recombinants and not n.is_sample()]
-        )
-    else:
-        target = ts.num_nodes - len(ti.recombinants)
-
-    print("Inferring lineages...")
-    with tqdm(total=target - 1) as pbar:
-        while inferred_lineages.total_inferred(ti) < target:
-            impute_lineages_inheritance(
-                inferred_lineages,
-                ts,
-                t,
-                ti,
-                node_to_mut_dict,
-                internal_only,
-                pbar,
-            )
-            impute_lineages_decisiontree(
-                inferred_lineages,
-                ts,
-                t,
-                ti,
-                node_to_mut_dict,
-                df,
-                ohe_encoder,
-                clf_tree,
-                internal_only,
-                target,
-                pbar,
-            )
-            # print("Imputed so far:", inferred_lineages.num_sample_imputed + inferred_lineages.num_intern_imputed, "out of possible", target)
-    inferred_lineages.print_info(ts, ti, internal_only, target)
-
-    edited_ts = add_lineages_to_ts(inferred_lineages, ts)
-
-    print("Time:", time.time() - tic)
-
-    return edited_ts
-
-
-def impute_lineages_inheritance(
-    inferred_lineages,
-    ts,
-    t,
-    ti,
-    node_to_mut_dict,
-    internal_only,
-    pbar,
-):
-    """
-    For each node for which a lineage has not yet been assigned, try and copy the lineage of the parent or
-    one of the children (if there are no lineage-defining mutations on the connecting edge).
-    This is run iteratively on the nodes until no further assignment is possible.
-    """
-
-    # print("Inheriting lineages...", end="")
-    # Need to loop through until all known lineages have been copied where possible
-    while inferred_lineages.change:
-        inferred_lineages.reset()
-        for n_ in t.nodes(order="timedesc"):
-            n = ts.node(n_)
-            if not internal_only or (internal_only and not n.is_sample()):
-                if inferred_lineages.check_node(n, ti):
-                    # Try to inherit lineage from parent or children, if there is at least one edge
-                    # without a mutation
-                    inferred_lineages.inherit_from_children(ts, t, node_to_mut_dict)
-                    inferred_lineages.inherit_from_parent(ts, t, node_to_mut_dict)
-                    inferred_lineages.update()
-        # print(inferred_lineages.change, end="...")
-        pbar.update(inferred_lineages.change)
-    # print("done")
-
-
-def impute_lineages_decisiontree(
-    inferred_lineages,
-    ts,
-    t,
-    ti,
-    node_to_mut_dict,
-    df,
-    ohe_encoder,
-    clf_tree,
-    internal_only,
-    target,
-    pbar,
-):
-    """
-    For each node, impute a lineage based on that of the parent node (if known or already imputed) plus
-    the lineage-defining mutations on the connecting edge. This uses the decision tree constructed using
-    COVIDCG lineage-defining mutations data.
-    """
-
-    # Impute lineages for the rest of the nodes where possible (one pass)
-    X = pd.DataFrame(
-        index=range(target - inferred_lineages.total_inferred(ti)), columns=df.columns
-    )
-    X_index = np.zeros(target - inferred_lineages.total_inferred(ti), dtype=int)
-    ind = 0
-    # print("Imputing lineages...", end = "")
-    inferred_lineages.reset()
-    for n_ in t.nodes(order="timedesc"):
-        n = ts.node(n_)
-        if not internal_only or (internal_only and not n.is_sample()):
-            if inferred_lineages.check_node(n, ti):
-                parent_node_ind = t.parent(inferred_lineages.current_node.id)
-                if parent_node_ind != -1:
-                    parent_node_md = ts.node(parent_node_ind).metadata
-                    if (
-                        inferred_lineages.true_lineage in parent_node_md
-                        or inferred_lineages.lineages_pred[parent_node_ind] is not None
-                    ):
-                        # Check if we can now copy the parent's lineage
-                        if n_ not in node_to_mut_dict.names or (
-                            inferred_lineages.true_lineage not in parent_node_md
-                            and inferred_lineages.lineages_pred[parent_node_ind]
-                            == "Unknown"
-                        ):
-                            inferred_lineages.inherit_from_node(
-                                ts.node(parent_node_ind)
-                            )
-                            inferred_lineages.update()
-                        # If not, then add to dataframe for imputation
-                        else:
-                            if inferred_lineages.true_lineage in parent_node_md:
-                                parent_lineage = parent_node_md[
-                                    inferred_lineages.true_lineage
-                                ]
-                            else:
-                                parent_lineage = inferred_lineages.lineages_pred[
-                                    parent_node_ind
-                                ]
-                            X_index[ind] = n_
-                            X.loc[ind] = df.loc[parent_lineage]
-                            positions, alts = node_to_mut_dict.get_mutations(n_)
-                            X.loc[ind][positions] = alts
-                            ind += 1
-    if ind > 0:
-        X = X.iloc[0:ind]
-        X_index = X_index[0:ind]
-        y = clf_tree.predict(ohe_encoder.transform(X))
-        inferred_lineages.add_imputed_values(X_index, y)
-    pbar.update(inferred_lineages.change)
-
-
-def add_lineages_to_ts(il, ts):
-    """
-    Adds imputed lineages to ts metadata.
-    """
-    imputed_lineages = il.get_results()
-    tables = ts.tables
-    new_metadata = []
-    for node in ts.nodes():
-        md = node.metadata
-        if "Imputed_lineage" in md:
-            md.pop("Imputed_lineage")
-        md["Imputed_" + il.true_lineage] = imputed_lineages[node.id]
-        new_metadata.append(md)
-    validated_metadata = [
-        tables.nodes.metadata_schema.validate_and_encode_row(row)
-        for row in new_metadata
-    ]
-    tables.nodes.packset_metadata(validated_metadata)
-    edited_ts = tables.tree_sequence()
-    return edited_ts
diff --git a/sc2ts/utils.py b/sc2ts/utils.py
index 353da7a..1eee1c4 100644
--- a/sc2ts/utils.py
+++ b/sc2ts/utils.py
@@ -14,7 +14,6 @@
 import numpy as np
 import pandas as pd
 
-import sklearn
 import tqdm
 import matplotlib.pyplot as plt
 from matplotlib import colors
@@ -754,62 +753,6 @@ def sample_subgraph(sample_node, ts, ti=None, **kwargs):
     return plot_subgraph(nodes, ts, ti, **kwargs)
 
 
-def imputation_setup(filepath, verbose=False):
-    """
-    Reads in JSON of lineage-defining mutations and constructs decision tree classifier
-    JSON can be downloaded from covidcg.org -> 'Compare AA mutations' -> Download -> 'Consensus mutations'
-    (setting mutation type to 'NT' and consensus threshold to 0.9)
-    """
-    linmuts_dict = lineages.read_in_mutations(filepath)
-    df, df_ohe, ohe = lineages.read_in_mutations_json(filepath)
-
-    # Get decision tree
-    y = df_ohe.index  # lineage labels
-    clf = sklearn.tree.DecisionTreeClassifier()
-    clf = clf.fit(df_ohe, y)
-
-    if verbose:
-        # Check tree works and that lineages-defining mutations are unique for each lineage
-        y_pred = clf.predict(df_ohe)
-        correct = incorrect = lineage_definition_issue = 0
-        for yy, yy_pred in zip(y, y_pred):
-            if yy == yy_pred:
-                correct += 1
-            else:
-                incorrect += 1
-                if linmuts_dict.get_mutations(yy) == linmuts_dict.get_mutations(
-                    yy_pred
-                ):
-                    lineage_definition_issue += 1
-                    print(yy_pred, "same mutations as", yy)
-        print(
-            "Correct:",
-            correct,
-            "incorrect:",
-            incorrect,
-            "of which due to lineage definition ambiguity:",
-            lineage_definition_issue,
-        )
-
-    return linmuts_dict, df, df_ohe, ohe, clf
-
-
-def lineage_imputation(filepath, ts, ti, internal_only=False, verbose=False):
-    """
-    Runs lineage imputation on input ts
-    """
-    linmuts_dict, df, df_ohe, ohe, clf = imputation_setup(filepath, verbose)
-    print("Recording relevant mutations for each node...")
-    node_to_mut_dict = lineages.get_node_to_mut_dict(ts, ti, linmuts_dict)
-    edited_ts = lineages.impute_lineages(
-        ts, ti, node_to_mut_dict, df, ohe, clf, "Nextclade_pango", internal_only
-    )
-    edited_ts = lineages.impute_lineages(
-        edited_ts, ti, node_to_mut_dict, df, ohe, clf, "GISAID_lineage", internal_only
-    )
-    return edited_ts
-
-
 def add_gisaid_lineages_to_ts(ts, node_gisaid_lineages, linmuts_dict):
     """
     Adds lineages from GISAID to ts metadata (as 'GISAID_lineage').