Support alternative column for names

- Add a parameter to set which column contains the names of the entries in the tally table (for use when there is no "mutations" column, e.g., when not using SNVs). - The bootstrapping method has a hard requirement on such names.
cbg-ethz · Oct 9, 2024 · 6607fab · 6607fab
1 parent 7d77300
commit 6607fab
Show file tree

Hide file tree

Showing 3 changed files with 33 additions and 9 deletions.
diff --git a/lollipop/cli/deconvolute.py b/lollipop/cli/deconvolute.py
@@ -112,6 +112,15 @@
     type=int,
     help="Seed the random generator",
 )
+@click.option(
+    "--namefield",
+    "-nf",
+    metavar="COLUMN",
+    required=False,
+    default="mutations",
+    type=str,
+    help="column to use as 'names' for the entries in tally table. By default, if 'pos' and 'base' exist a column 'mutations' will be created and used as name.",
+)
 @click.argument("tally_data", metavar="TALLY_TSV", nargs=1)
 def deconvolute(
     variants_config,
@@ -124,6 +133,7 @@ def deconvolute(
     fmt_columns,
     out_json,
     tally_data,
+    namefield,
 ):
     # load data
     yaml = ruamel.yaml.YAML(typ="rt")
@@ -157,7 +167,10 @@ def deconvolute(
     # data
     try:
         df_tally = pd.read_csv(
-            tally_data, sep="\t", parse_dates=["date"], dtype={"location_code": "str"}
+            tally_data,
+            sep="\t",
+            parse_dates=["date"],
+            dtype={"location_code": "str", namefield: "str"},
         )
     except ValueError:
         df_tally = pd.read_csv(tally_data, sep="\t", dtype={"location_code": "str"})
@@ -305,6 +318,7 @@ def deconvolute(
         end_date=end_date,
         no_date=no_date,
         remove_deletions=remove_deletions,
+        namefield=namefield,
     )
     preproc = preproc.filter_mutations(filters=filters)
 
@@ -365,7 +379,12 @@ def deconvolute(
         ):
             if bootstrap > 1:
                 # resample if we're doing bootstrapping
-                temp_dfb = ll.resample_mutations(loc_df, loc_df.mutations.unique())[0]
+                assert (
+                    namefield in loc_df.columns
+                ), f"bootstrapping needs a column with names for the entries of the tally table, but no column '{namefield}' found. Use option '--namefield' to specify"
+                temp_dfb = ll.resample_mutations(
+                    loc_df, loc_df[namefield].unique(), namefield
+                )[0]
             else:
                 # just run one on everything
                 temp_dfb = loc_df

diff --git a/lollipop/confints.py b/lollipop/confints.py
@@ -149,7 +149,7 @@ def confint(self, X, coefs, y=None, kvals=None):
             }
 
 
-def resample_mutations(df_city1, mutations):
+def resample_mutations(df_city1, mutations, namefield="mutations"):
     """
     Function to resample mutations by replacement (preserving mutation-complement pairs).
     Returns a copy of the DataFrame with <resample_value> column indicating how many times the mutation was in the resample.
@@ -167,6 +167,6 @@ def resample_mutations(df_city1, mutations):
     )
     # make a column with coefficients for how many times a row should be accounted for according to the resample
     df_sampled = df_city1.copy()
-    df_sampled.loc[:, "resample_value"] = df_sampled.mutations.map(resample_coeff_dict)
+    df_sampled.loc[:, "resample_value"] = df_sampled[namefield].map(resample_coeff_dict)
 
     return df_sampled, rand_idcs
diff --git a/lollipop/preprocessors.py b/lollipop/preprocessors.py
@@ -11,13 +11,14 @@ class DataPreprocesser:
     General class to preprocess tallymut data before deconvolution.
     """
 
-    def __init__(self, df_tally):
+    def __init__(self, df_tally, namefield="mutations"):
         self.df_tally = df_tally
 
-    def make_complement(self, df_tally, variants_list):
+    def make_complement(self, df_tally, variants_list, namefield):
         """return a dataframe with the complement of mutations signatures and mutations fracs"""
         t_data = df_tally.copy()
-        t_data["mutations"] = "-" + t_data["mutations"]
+        if namefield in t_data.columns:
+            t_data[namefield] = "-" + t_data[namefield]
         t_data["frac"] = 1 - t_data["frac"]
         t_data[variants_list] = 1 - t_data[variants_list]
         t_data["undetermined"] = 1
@@ -35,6 +36,7 @@ def general_preprocess(
         no_date=False,
         remove_deletions=True,
         make_complement=True,
+        namefield="mutations",
     ):
         """General preprocessing steps"""
         # rename columns
@@ -52,7 +54,7 @@ def general_preprocess(
         )
         # create column with mutation signature
         if ("base" in self.df_tally.columns) and ("pos" in self.df_tally.columns):
-            # NOTE if cojac-based instead of SNV-bsed deconvolution: there is no single mutation
+            # NOTE if cojac-based instead of SNV-based deconvolution: there is no single mutation; cojac tabmut has an option to generate mutation strings
             self.df_tally["mutations"] = (
                 self.df_tally["pos"].astype(str) + self.df_tally["base"]
             )
@@ -114,7 +116,10 @@ def general_preprocess(
         self.df_tally.insert(self.df_tally.columns.size - 1, "undetermined", 0)
         if make_complement:
             self.df_tally = pd.concat(
-                [self.df_tally, self.make_complement(self.df_tally, variants_columns)]
+                [
+                    self.df_tally,
+                    self.make_complement(self.df_tally, variants_columns, namefield),
+                ]
             )
 
         return self