From 920988d23b97a2e34100e3dc457baf787e9cc2e2 Mon Sep 17 00:00:00 2001
From: Ivan Blagoev Topolsky
Date: Fri, 5 Jul 2024 04:36:28 +0200
Subject: [PATCH] [feature] filters

- filtering problematic mutations isn't hard-coded anymore
- uses a new filter mini-format
- old filters converted into YAML
- tests to compare with the hard-coded version
- document syntax in README.md
---
 README.md                   |  45 +++++++++++
 filters_preprint.yaml       |  53 +++++++++++++
 lollipop/cli/deconvolute.py |  19 ++++-
 lollipop/preprocessors.py   |  90 ++++++++++++++++++++++
 tests/test_filters.py       | 150 ++++++++++++++++++++++++++++++++++++
 tests/test_integration.py   |   1 +
 6 files changed, 357 insertions(+), 1 deletion(-)
 create mode 100644 filters_preprint.yaml
 create mode 100644 tests/test_filters.py

diff --git a/README.md b/README.md
index f5f8250..2f53c98 100644
--- a/README.md
+++ b/README.md
@@ -134,6 +134,8 @@ Options:
   -k, --deconv-config, --dec YAML
                                   Configuration of parameters for kernel
                                   deconvolution  [required]
+  --filters YAML                  List of filters for removing problematic
+                                  mutations from tally
   -l, --loc, --location, --wwtp, --catchment NAME
                                   Name(s) of location/wastewater treatment
                                   plant/catchment area to process
@@ -288,6 +290,49 @@ var_dates:
 ```
 see [variants_dates_example.yaml](variants_dates_example.yaml).
 
+#### Filters (optional)
+
+Some mutations might be problematic and need to be taken out --- e.g.,
+due to drop-outs in the multiplex PCR amplification they do not show up in the data,
+and this could be misinterpreted by LolliPop as proof of absence of a variant.
+This optional file contains a collection of filters; each filter holds a list of statements
+selecting entries based on the values found in columns.
+The general syntax of a statement is:
+```text
+- <column> <op> <value>
+```
+Valid values of _op_ are:
+- `==` on that line, the value in column _<column>_ is exactly _<value>_
+  - for simple strings this can be omitted: `- proto v3` is synonymous with `- proto == v3`
+- `<=` the value is less than or equal to _<value>_
+- `>=` the value is greater than or equal to _<value>_
+- `<` the value is less than _<value>_
+- `>` the value is greater than _<value>_
+- `!=` the value is **not** _<value>_
+- `in` the value is found in the list specified in _<value>_
+- `~` the value matches the regular expression in _<value>_
+  - the regex can be quoted using `/` or `@`
+- `!~` the value does **not** match the regular expression in _<value>_
+
+Any arbitrary column found in the input file can be used.
+
+All statements are combined with a logical `and`, and matching lines are removed from the tally table.
+
+For example:
+```yaml
+# filter to remove test samples
+remove_test:
+  - sample ~ /^Test/
+
+# filter to remove an amplicon that has drop-outs
+amplicon75:
+  - proto v3
+  - date > 2021-11-20
+  - pos >= 22428
+  - pos <= 22785
+```
+see [example in filters_preprint.yaml](filters_preprint.yaml).
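+
+For illustration, the `amplicon75` filter above is roughly equivalent to the
+following pandas mask (a sketch assuming the tally table used elsewhere in this
+repository, with its `proto`, `date` and `pos` columns):
+```python
+import pandas as pd
+import numpy as np
+
+df_tally = pd.read_csv("preprint/data/tallymut_line_full.tsv.zst", sep="\t")
+
+# statements of a filter are and-ed together; matching rows are dropped from the tally
+drop = (
+    (df_tally["proto"] == "v3")
+    & (pd.to_datetime(df_tally["date"]) > np.datetime64("2021-11-20"))
+    & (df_tally["pos"] >= 22428)
+    & (df_tally["pos"] <= 22785)
+)
+df_tally = df_tally[~drop]
+```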
+
 #### Running it
 
 ```bash
diff --git a/filters_preprint.yaml b/filters_preprint.yaml
new file mode 100644
index 0000000..743abdc
--- /dev/null
+++ b/filters_preprint.yaml
@@ -0,0 +1,53 @@
+bad_mutations:
+  - proto v3
+  - mutations in [ 28461G, 11201G, 26801C, -28461G, -11201G, -26801C ]
+
+amplicon75:
+  - proto v3
+  - date > 2021-11-20
+  - pos >= 22428
+  - pos <= 22785
+
+amplicon76:
+  - proto v3
+  - date > 2021-11-20
+  - pos >= 22677
+  - pos <= 23028
+
+amplicon77:
+  - proto v3
+  - date > 2021-11-20
+  - pos >= 22974
+  - pos <= 23327
+
+amplicon88:
+  - proto v3
+  - date > 2021-11-20
+  - pos >= 26277
+  - pos <= 26635
+
+amplicon90:
+  - proto v3
+  - date > 2021-11-20
+  - pos >= 26895
+  - pos <= 27256
+
+other_0:
+  - proto v3
+  - date > 2021-11-20
+  - pos == 26709
+
+other_1:
+  - proto v3
+  - date > 2021-11-20
+  - pos == 27807
+
+other_2:
+  - proto v3
+  - date > 2021-11-20
+  - pos == 2832
+
+other_3:
+  - proto v3
+  - date > 2021-11-20
+  - pos == 10449
diff --git a/lollipop/cli/deconvolute.py b/lollipop/cli/deconvolute.py
index 93c0ac2..47bce76 100755
--- a/lollipop/cli/deconvolute.py
+++ b/lollipop/cli/deconvolute.py
@@ -94,6 +94,15 @@
     default=None,
     help="Name(s) of location/wastewater treatment plant/catchment area to process",
 )
+@click.option(
+    "--filters",
+    "-fl",
+    metavar="YAML",
+    required=False,
+    default=None,
+    type=str,
+    help="List of filters for removing problematic mutations from tally",
+)
 @click.option(
     "--seed",
     "-s",
@@ -109,6 +118,7 @@ def deconvolute(
     variants_dates,
     deconv_config,
     loc,
+    filters,
     seed,
     output,
     fmt_columns,
@@ -135,6 +145,13 @@ def deconvolute(
     with open(deconv_config, "r") as file:
         deconv = yaml.load(file)
 
+    # problematic mutation filters
+    if filters:
+        with open(filters, "r") as file:
+            filters = yaml.load(file)
+
+        print(f"{len(filters)} filter{ '' if len(filters) == 1 else 's' } loaded")
+
     # data
     try:
         df_tally = pd.read_csv(
@@ -287,7 +304,7 @@ def deconvolute(
         no_date=no_date,
         remove_deletions=remove_deletions,
     )
-    preproc = preproc.filter_mutations()
+    preproc = preproc.filter_mutations(filters=filters)
 
     print("deconvolve all")
     np.random.seed(seed)
diff --git a/lollipop/preprocessors.py b/lollipop/preprocessors.py
index 46710f2..3702140 100644
--- a/lollipop/preprocessors.py
+++ b/lollipop/preprocessors.py
@@ -1,6 +1,9 @@
 import pandas as pd
 import numpy as np
+from functools import reduce
+import re
 import sys
+from pandas.api.types import is_numeric_dtype
 
 
 class DataPreprocesser:
@@ -118,6 +121,93 @@ def general_preprocess(
 
     def filter_mutations(self, filters=None):
         """filter out hardcoded problematic mutations"""
+        if filters is None:
+            return self
+
+        types = self.df_tally.dtypes
+
+        rxprser = re.compile(
+            r"^ *(?:(?P<col>"
+            + r"|".join(self.df_tally.columns)
+            + r")|(?P<bad>\w+)) *(?P<op>in|[<>=~!]*) *(?P<qv>['\"]?)(?P<val>.+)(?P=qv) *$"
+        )
+
+        def apply_filter_statement(name, fs):
+            """parse a single statement from a filter and apply it, returning a boolean series"""
+            m = rxprser.search(fs)
+            assert m, f"Cannot parse statement <{fs}> in filter {name}"
+            m = m.groupdict()
+
+            assert m[
+                "col"
+            ], f"bad column name {m['bad']}, not in list: {self.df_tally.columns}, while parsing statement <{fs}> in filter {name}"
+
+            # HACK handle 'date' column differently, to force datatypes
+            col = (
+                pd.to_datetime(self.df_tally["date"])
+                if "date" == m["col"]
+                else self.df_tally[m["col"]]
+            )
+            val = (
+                np.datetime64(m["val"])
+                if "date" == m["col"]
+                else (
+                    pd.to_numeric(m["val"])
+                    if is_numeric_dtype(types[m["col"]])
+                    else m["val"]
+                )
+            )
+
+            # apply operator
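+            # m["op"] is one of: "" (implicit equality), ==, !=, <, <=, >, >=,
+            # in, ~ or !~ -- mirroring the operator list documented in README.md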
+            match m["op"]:
+                case "=" | "==" | "" as e:
+                    if e == "":
+                        assert (
+                            " " not in m["val"]
+                        ), f"Do not use values with spaces <{m['val']}> when using no operator (implicit 'equals'). (while parsing statement <{fs}> in filter {name})"
+                    return col == val
+                case "!=" | "!":
+                    return col != val
+                case "<":
+                    return col < val
+                case "<=" | "=<":
+                    return col <= val
+                case ">=" | "=>":
+                    return col >= val
+                case ">":
+                    return col > val
+                case "in":
+                    # unpack list
+                    return col.isin(
+                        [
+                            v.strip("\"' ")
+                            for v in val.lstrip("[ ").rstrip(" ]").split(",")
+                        ]
+                    )
+                case "~" | "=~" | "~=":
+                    return col.str.contains(
+                        val[1:-1] if val[0] == val[-1] in "/@" else val
+                    )
+                case "!~" | "~!":
+                    return ~(
+                        col.str.contains(
+                            val[1:-1] if val[0] == val[-1] in "/@" else val
+                        )
+                    )
+                case _ as o:
+                    raise ValueError(
+                        f"unknown operator {o}, while parsing statement <{fs}> in filter {name}"
+                    )
+
+        for name, fl in filters.items():
+            print(f"filter {name}")
+
+            self.df_tally = self.df_tally[
+                ~reduce(
+                    (lambda x, y: x & y),
+                    [apply_filter_statement(name, fstatmt) for fstatmt in fl],
+                )
+            ]
 
         # HACK completely disable filters
         return self
diff --git a/tests/test_filters.py b/tests/test_filters.py
new file mode 100644
index 0000000..f5c6980
--- /dev/null
+++ b/tests/test_filters.py
@@ -0,0 +1,150 @@
+import pandas as pd
+import numpy as np
+import lollipop as ll
+import ruamel.yaml
+from pandas.testing import assert_frame_equal
+
+yaml = ruamel.yaml.YAML(typ="rt")
+
+
+def assert_frame_NOT_equal(*args, **kwargs):
+    try:
+        assert_frame_equal(*args, **kwargs)
+    except AssertionError:
+        # frames are not equal
+        pass
+    else:
+        # frames are equal
+        raise AssertionError
+
+
+def hardcoded_filter(df_tally):
+    """this is the old hard-coded filter"""
+    df_tally = df_tally[
+        ~df_tally["mutations"].isin(
+            ["28461G", "11201G", "26801C"] + ["-28461G", "-11201G", "-26801C"]
+        )
+    ]
+
+    df_tally = df_tally[
+        ~(
+            (pd.to_datetime(df_tally.date) > np.datetime64("2021-11-20"))
+            & (df_tally.pos >= 22428)
+            & (df_tally.pos <= 22785)
+        )
+    ]  # amplicon75
+    df_tally = df_tally[
+        ~(
+            (pd.to_datetime(df_tally.date) > np.datetime64("2021-11-20"))
+            & (df_tally.pos >= 22677)
+            & (df_tally.pos <= 23028)
+        )
+    ]  # amplicon76
+    df_tally = df_tally[
+        ~(
+            (pd.to_datetime(df_tally.date) > np.datetime64("2021-11-20"))
+            & (df_tally.pos >= 22974)
+            & (df_tally.pos <= 23327)
+        )
+    ]  # amplicon77
+    df_tally = df_tally[
+        ~(
+            (pd.to_datetime(df_tally.date) > np.datetime64("2021-11-20"))
+            & (df_tally.pos >= 26277)
+            & (df_tally.pos <= 26635)
+        )
+    ]  # amplicon88
+    df_tally = df_tally[
+        ~(
+            (pd.to_datetime(df_tally.date) > np.datetime64("2021-11-20"))
+            & (df_tally.pos >= 26895)
+            & (df_tally.pos <= 27256)
+        )
+    ]  # amplicon90
+    df_tally = df_tally[
+        ~(
+            (pd.to_datetime(df_tally.date) > np.datetime64("2021-11-20"))
+            & (df_tally.pos == 26709)
+        )
+    ]  # other
+    df_tally = df_tally[
+        ~(
+            (pd.to_datetime(df_tally.date) > np.datetime64("2021-11-20"))
+            & (df_tally.pos == 27807)
+        )
+    ]  # other
+    df_tally = df_tally[
+        ~(
+            (pd.to_datetime(df_tally.date) > np.datetime64("2021-11-20"))
+            & (df_tally.pos == 2832)
+        )
+    ]  # other
+    df_tally = df_tally[
+        ~(
+            (pd.to_datetime(df_tally.date) > np.datetime64("2021-11-20"))
+            & (df_tally.pos == 10449)
+        )
+    ]  # other
+
+    return df_tally
+
+
+def test_filters():
+    tally_file = "preprint/data/tallymut_line_full.tsv.zst"
+    varconf_file = "config_preprint.yaml"
+    filter_file = "filters_preprint.yaml"
+
+    # filter to test
+    with open(filter_file, "r") as f:
+        filters = yaml.load(f)
+
+    # load config
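+    # (the same preprocessing parameters the deconvolute CLI passes to general_preprocess)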
+    with open(varconf_file, "r") as f:
+        conf_yaml = yaml.load(f)
+    variants_list = conf_yaml["variants_list"]
+    variants_pangolin = conf_yaml["variants_pangolin"]
+    variants_not_reported = conf_yaml.get("variants_not_reported", [])
+    to_drop = conf_yaml.get("to_drop", [])
+    start_date = conf_yaml.get("start_date", None)
+    end_date = conf_yaml.get("end_date", None)
+    no_date = conf_yaml.get("no_date", False)
+    remove_deletions = conf_yaml.get("remove_deletions", True)
+
+    # load data
+    df_tally = pd.read_csv(
+        tally_file, sep="\t", parse_dates=["date"], dtype={"location_code": "str"}
+    )
+
+    # pre-process data
+    preproc = ll.DataPreprocesser(df_tally)
+    preproc = preproc.general_preprocess(
+        variants_list=variants_list,
+        variants_pangolin=variants_pangolin,
+        variants_not_reported=variants_not_reported,
+        to_drop=to_drop,
+        start_date=start_date,
+        end_date=end_date,
+        no_date=no_date,
+        remove_deletions=remove_deletions,
+    )
+
+    # keep copy of original
+    df_unfil = preproc.df_tally
+
+    # old filtering
+    df_hc = hardcoded_filter(df_unfil)
+
+    assert_frame_NOT_equal(df_unfil, df_hc)
+
+    # modern filtering
+    preproc = preproc.filter_mutations(filters=filters)
+
+    df_filt = preproc.df_tally
+
+    assert_frame_NOT_equal(df_unfil, df_filt)
+
+    # compare
+    assert_frame_equal(df_filt[df_filt["proto"] == "v3"], df_hc[df_hc["proto"] == "v3"])
+    assert_frame_equal(
+        df_filt[df_filt["proto"] != "v3"], df_unfil[df_unfil["proto"] != "v3"]
+    )
diff --git a/tests/test_integration.py b/tests/test_integration.py
index 99a7efc..9bbdd99 100644
--- a/tests/test_integration.py
+++ b/tests/test_integration.py
@@ -17,6 +17,7 @@ def test_workflow():
             "--fmt-columns",
             "--variants-config=config_preprint.yaml",
             "--deconv-config=presets/deconv_linear.yaml",
+            "--filters=filters_preprint.yaml",
             "--location=Zürich (ZH)",
             "--seed=42",
             "preprint/data/tallymut_line_full.tsv.zst",
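Note on the statement mini-format: the parser introduced in `filter_mutations` can be exercised on its own to check how a filter line is split into column, operator and value. A minimal sketch (the column list below is assumed for the demonstration; the real code derives it from the tally table's columns):

```python
import re

# assumed column names, for demonstration only
columns = ["sample", "proto", "date", "pos", "mutations"]

rxprser = re.compile(
    r"^ *(?:(?P<col>"
    + r"|".join(columns)
    + r")|(?P<bad>\w+)) *(?P<op>in|[<>=~!]*) *(?P<qv>['\"]?)(?P<val>.+)(?P=qv) *$"
)

for statement in ["proto v3", "date > 2021-11-20", "pos >= 22428", "sample ~ /^Test/"]:
    m = rxprser.search(statement).groupdict()
    # an empty operator means implicit equality
    print(m["col"], m["op"] or "==", m["val"])
```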