Skip to content

Commit

Permalink
Support alternative column for names
Browse files Browse the repository at this point in the history
- parameter to set which column contains the name of entries in the
  tally table (if no "mutations", e.g., when not using SNVs)
- Bootstrapping method has a hard requirement on such names
  • Loading branch information
DrYak committed Oct 9, 2024
1 parent 7d77300 commit 6607fab
Show file tree
Hide file tree
Showing 3 changed files with 33 additions and 9 deletions.
23 changes: 21 additions & 2 deletions lollipop/cli/deconvolute.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,15 @@
type=int,
help="Seed the random generator",
)
@click.option(
"--namefield",
"-nf",
metavar="COLUMN",
required=False,
default="mutations",
type=str,
help="column to use as 'names' for the entries in tally table. By default, if 'pos' and 'base' exist a column 'mutations' will be created and used as name.",
)
@click.argument("tally_data", metavar="TALLY_TSV", nargs=1)
def deconvolute(
variants_config,
Expand All @@ -124,6 +133,7 @@ def deconvolute(
fmt_columns,
out_json,
tally_data,
namefield,
):
# load data
yaml = ruamel.yaml.YAML(typ="rt")
Expand Down Expand Up @@ -157,7 +167,10 @@ def deconvolute(
# data
try:
df_tally = pd.read_csv(
tally_data, sep="\t", parse_dates=["date"], dtype={"location_code": "str"}
tally_data,
sep="\t",
parse_dates=["date"],
dtype={"location_code": "str", namefield: "str"},
)
except ValueError:
df_tally = pd.read_csv(tally_data, sep="\t", dtype={"location_code": "str"})
Expand Down Expand Up @@ -305,6 +318,7 @@ def deconvolute(
end_date=end_date,
no_date=no_date,
remove_deletions=remove_deletions,
namefield=namefield,
)
preproc = preproc.filter_mutations(filters=filters)

Expand Down Expand Up @@ -365,7 +379,12 @@ def deconvolute(
):
if bootstrap > 1:
# resample if we're doing bootstrapping
temp_dfb = ll.resample_mutations(loc_df, loc_df.mutations.unique())[0]
assert (
namefield in loc_df.columns
), f"bootstrapping needs a column with names for the entries of the tally table, but no column '{namefield}' found. Use option '--namefield' to specify"
temp_dfb = ll.resample_mutations(
loc_df, loc_df[namefield].unique(), namefield
)[0]
else:
# just run one on everything
temp_dfb = loc_df
Expand Down
4 changes: 2 additions & 2 deletions lollipop/confints.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,7 +149,7 @@ def confint(self, X, coefs, y=None, kvals=None):
}


def resample_mutations(df_city1, mutations):
def resample_mutations(df_city1, mutations, namefield="mutations"):
"""
Function to resample mutations with replacement (preserving mutation-complement pairs).
Returns a copy of the DataFrame with <resample_value> column indicating how many times the mutation was in the resample.
Expand All @@ -167,6 +167,6 @@ def resample_mutations(df_city1, mutations):
)
# make a column with coefficients for how many times a row should be accounted for according to the resample
df_sampled = df_city1.copy()
df_sampled.loc[:, "resample_value"] = df_sampled.mutations.map(resample_coeff_dict)
df_sampled.loc[:, "resample_value"] = df_sampled[namefield].map(resample_coeff_dict)

return df_sampled, rand_idcs
15 changes: 10 additions & 5 deletions lollipop/preprocessors.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,13 +11,14 @@ class DataPreprocesser:
General class to preprocess tallymut data before deconvolution.
"""

def __init__(self, df_tally):
def __init__(self, df_tally, namefield="mutations"):
self.df_tally = df_tally

def make_complement(self, df_tally, variants_list):
def make_complement(self, df_tally, variants_list, namefield):
"""return a dataframe with the complement of mutations signatures and mutations fracs"""
t_data = df_tally.copy()
t_data["mutations"] = "-" + t_data["mutations"]
if namefield in t_data.columns:
t_data[namefield] = "-" + t_data[namefield]
t_data["frac"] = 1 - t_data["frac"]
t_data[variants_list] = 1 - t_data[variants_list]
t_data["undetermined"] = 1
Expand All @@ -35,6 +36,7 @@ def general_preprocess(
no_date=False,
remove_deletions=True,
make_complement=True,
namefield="mutations",
):
"""General preprocessing steps"""
# rename columns
Expand All @@ -52,7 +54,7 @@ def general_preprocess(
)
# create column with mutation signature
if ("base" in self.df_tally.columns) and ("pos" in self.df_tally.columns):
# NOTE if cojac-based instead of SNV-bsed deconvolution: there is no single mutation
# NOTE if cojac-based instead of SNV-based deconvolution: there is no single mutation; cojac tabmut has an option to generate mutation strings
self.df_tally["mutations"] = (
self.df_tally["pos"].astype(str) + self.df_tally["base"]
)
Expand Down Expand Up @@ -114,7 +116,10 @@ def general_preprocess(
self.df_tally.insert(self.df_tally.columns.size - 1, "undetermined", 0)
if make_complement:
self.df_tally = pd.concat(
[self.df_tally, self.make_complement(self.df_tally, variants_columns)]
[
self.df_tally,
self.make_complement(self.df_tally, variants_columns, namefield),
]
)

return self
Expand Down

0 comments on commit 6607fab

Please sign in to comment.