diff --git a/README.md b/README.md index d03bb96..1c50635 100644 --- a/README.md +++ b/README.md @@ -113,7 +113,7 @@ Options: ``` ```console -$ lollipop deconvolute --help +$ lollipop deconvolute --help Usage: lollipop deconvolute [OPTIONS] TALLY_TSV Deconvolution for Wastewater Genomics @@ -134,12 +134,16 @@ Options: -k, --deconv-config, --dec YAML Configuration of parameters for kernel deconvolution [required] - --filters YAML List of filters for removing problematic - mutations from tally -l, --loc, --location, --wwtp, --catchment NAME Name(s) of location/wastewater treatment plant/catchment area to process + -fl, --filters YAML List of filters for removing problematic + mutations from tally -s, --seed SEED Seed the random generator + -nf, --namefield COLUMN column to use as 'names' for the entries in + tally table. By default, if 'pos' and 'base' + exist a column 'mutations' will be created + and used as name. -h, --help Show this message and exit. ``` diff --git a/lollipop/cli/deconvolute.py b/lollipop/cli/deconvolute.py index 5c94fe4..638522f 100755 --- a/lollipop/cli/deconvolute.py +++ b/lollipop/cli/deconvolute.py @@ -112,6 +112,15 @@ type=int, help="Seed the random generator", ) +@click.option( + "--namefield", + "-nf", + metavar="COLUMN", + required=False, + default="mutations", + type=str, + help="column to use as 'names' for the entries in tally table. By default, if 'pos' and 'base' exist a column 'mutations' will be created and used as name.", +) @click.argument("tally_data", metavar="TALLY_TSV", nargs=1) def deconvolute( variants_config, @@ -124,6 +133,7 @@ def deconvolute( fmt_columns, out_json, tally_data, + namefield, ): # load data yaml = ruamel.yaml.YAML(typ="rt") @@ -157,7 +167,10 @@ def deconvolute( # data try: df_tally = pd.read_csv( - tally_data, sep="\t", parse_dates=["date"], dtype={"location_code": "str"} + tally_data, + sep="\t", + parse_dates=["date"], + dtype={"location_code": "str", namefield: "str"}, ) except ValueError: df_tally = pd.read_csv(tally_data, sep="\t", dtype={"location_code": "str"}) @@ -305,6 +318,7 @@ def deconvolute( end_date=end_date, no_date=no_date, remove_deletions=remove_deletions, + namefield=namefield, ) preproc = preproc.filter_mutations(filters=filters) @@ -365,7 +379,12 @@ def deconvolute( ): if bootstrap > 1: # resample if we're doing bootstrapping - temp_dfb = ll.resample_mutations(loc_df, loc_df.mutations.unique())[0] + assert ( + namefield in loc_df.columns + ), f"bootstrapping needs a column with names for the entries of the tally table, but no column '{namefield}' found. Use option '--namefield' to specify" + temp_dfb = ll.resample_mutations( + loc_df, loc_df[namefield].unique(), namefield + )[0] else: # just run one on everything temp_dfb = loc_df diff --git a/lollipop/confints.py b/lollipop/confints.py index a43f82e..4e2b979 100644 --- a/lollipop/confints.py +++ b/lollipop/confints.py @@ -149,7 +149,7 @@ def confint(self, X, coefs, y=None, kvals=None): } -def resample_mutations(df_city1, mutations): +def resample_mutations(df_city1, mutations, namefield="mutations"): """ Function to resample mutations by replacement (preserving mutation-complement pairs). Returns a copy of the DataFrame with column indicating how many times the mutation was in the resample. @@ -167,6 +167,6 @@ def resample_mutations(df_city1, mutations): ) # make a column with coefficients for how many times a row should be accounted for according to the resample df_sampled = df_city1.copy() - df_sampled.loc[:, "resample_value"] = df_sampled.mutations.map(resample_coeff_dict) + df_sampled.loc[:, "resample_value"] = df_sampled[namefield].map(resample_coeff_dict) return df_sampled, rand_idcs diff --git a/lollipop/preprocessors.py b/lollipop/preprocessors.py index 4a3de7d..2fa7620 100644 --- a/lollipop/preprocessors.py +++ b/lollipop/preprocessors.py @@ -11,13 +11,14 @@ class DataPreprocesser: General class to preprocess tallymut data before deconvolution. """ - def __init__(self, df_tally): + def __init__(self, df_tally, namefield="mutations"): self.df_tally = df_tally - def make_complement(self, df_tally, variants_list): + def make_complement(self, df_tally, variants_list, namefield): """return a dataframe with the complement of mutations signatures and mutations fracs""" t_data = df_tally.copy() - t_data["mutations"] = "-" + t_data["mutations"] + if namefield in t_data.columns: + t_data[namefield] = "-" + t_data[namefield] t_data["frac"] = 1 - t_data["frac"] t_data[variants_list] = 1 - t_data[variants_list] t_data["undetermined"] = 1 @@ -35,6 +36,7 @@ def general_preprocess( no_date=False, remove_deletions=True, make_complement=True, + namefield="mutations", ): """General preprocessing steps""" # rename columns @@ -52,7 +54,7 @@ def general_preprocess( ) # create column with mutation signature if ("base" in self.df_tally.columns) and ("pos" in self.df_tally.columns): - # NOTE if cojac-based instead of SNV-bsed deconvolution: there is no single mutation + # NOTE if cojac-based instead of SNV-bsed deconvolution: there is no single mutation cojac tabmut has an option to generate mutations strings self.df_tally["mutations"] = ( self.df_tally["pos"].astype(str) + self.df_tally["base"] ) @@ -114,7 +116,10 @@ def general_preprocess( self.df_tally.insert(self.df_tally.columns.size - 1, "undetermined", 0) if make_complement: self.df_tally = pd.concat( - [self.df_tally, self.make_complement(self.df_tally, variants_columns)] + [ + self.df_tally, + self.make_complement(self.df_tally, variants_columns, namefield), + ] ) return self