[feature] filters

- filtering problematic mutations isn't hard-coded anymore - uses a new filter mini-format - old filters converted into YAML - tests to compare with hard-coded version - document syntax in README.md
cbg-ethz · Jul 5, 2024 · 920988d · 920988d
1 parent 051ea4c
commit 920988d
Show file tree

Hide file tree

Showing 6 changed files with 357 additions and 1 deletion.
diff --git a/README.md b/README.md
@@ -134,6 +134,8 @@ Options:
   -k, --deconv-config, --dec YAML
                                   Configuration of parameters for kernel
                                   deconvolution  [required]
+  --filters YAML                  List of filters for removing problematic
+                                  mutations from tally
   -l, --loc, --location, --wwtp, --catchment NAME
                                   Name(s) of location/wastewater treatment
                                   plant/catchment area to process
@@ -288,6 +290,49 @@ var_dates:
 ```
 see [variants_dates_example.yaml](variants_dates_example.yaml).
 
+#### Filters (optional)
+
+Some mutations might be problematic and need to be taken out --- e.g. 
+due to drop-outs in the multiplex PCR amplification, they do not show up in the data
+and this could be misinterpreted by LolliPop as proof of absence of a variant.
+This optional file contains a collection of filters; each filter has a list of statements
+selecting entries based on the values found in columns.
+The general syntax of statements is:
+```text
+- <column> <op> <value>
+```
+Valid _op_ are:
+- `==` on that line, the value in column _<column>_ is exactly _<value>_
+  - for simple strings this can be omitted: `- proto v3` is synonymous with `- proto == v3`
+- `<=` the value is less than or equal to _<value>_
+- `>=` the value is greater than or equal to _<value>_
+- `<` the value is less than _<value>_
+- `>` the value is greater than _<value>_
+- `!=` the value is **not** _<value>_
+- `in` the value is found in the list specified in _<value>_
+- `~` the value matches the regular expression in _<value>_
+  - regex can be quoted using `/` or `@`
+- `!~` the value does **not** match the regular expression in _<value>_
+
+Any arbitrary column found in the input file can be used.
+
+All statements are combined with a logical `and` and matching lines are removed from the tally table.
+
+For example:
+```yaml
+# filter to remove test samples
+remove_test:
+- sample ~ /^Test/
+
+# filter to remove an amplicon that has drop-outs
+amplicon75:
+  - proto v3
+  - date > 2021-11-20
+  - pos >= 22428
+  - pos <= 22785
+```
+see [example in filters_preprint.yaml](filters_preprint.yaml).
+
 #### Running it
 
 ```bash

diff --git a/filters_preprint.yaml b/filters_preprint.yaml
@@ -0,0 +1,53 @@
+bad_mutations: 
+  - proto v3
+  - mutations in [ 28461G, 11201G, 26801C, -28461G, -11201G, -26801C ]
+
+amplicon75:
+  - proto v3
+  - date > 2021-11-20
+  - pos >= 22428
+  - pos <= 22785
+
+amplicon76:
+  - proto v3
+  - date > 2021-11-20
+  - pos >= 22677
+  - pos <= 23028
+
+amplicon77:
+  - proto v3
+  - date > 2021-11-20
+  - pos >= 22974
+  - pos <= 23327
+
+amplicon88:
+  - proto v3
+  - date > 2021-11-20
+  - pos >= 26277
+  - pos <= 26635
+
+amplicon90:
+  - proto v3
+  - date > 2021-11-20
+  - pos >= 26895
+  - pos <= 27256
+
+other_0:
+  - proto v3
+  - date > 2021-11-20
+  - pos == 26709
+
+other_1:
+  - proto v3
+  - date > 2021-11-20
+  - pos == 27807
+
+other_2:
+  - proto v3
+  - date > 2021-11-20
+  - pos == 2832
+
+other_3:
+  - proto v3
+  - date > 2021-11-20
+  - pos == 10449
diff --git a/lollipop/cli/deconvolute.py b/lollipop/cli/deconvolute.py
@@ -94,6 +94,15 @@
     default=None,
     help="Name(s) of location/wastewater treatment plant/catchment area to process",
 )
+@click.option(
+    "--filters",
+    "-fl",
+    metavar="YAML",
+    required=False,
+    default=None,
+    type=str,
+    help="List of filters for removing problematic mutations from tally",
+)
 @click.option(
     "--seed",
     "-s",
@@ -109,6 +118,7 @@ def deconvolute(
     variants_dates,
     deconv_config,
     loc,
+    filters,
     seed,
     output,
     fmt_columns,
@@ -135,6 +145,13 @@ def deconvolute(
     with open(deconv_config, "r") as file:
         deconv = yaml.load(file)
 
+    # problematic mutation filters
+    if filters:
+        with open(filters, "r") as file:
+            filters = yaml.load(file)
+
+        print(f"{len(filters)} filter{ '' if len(filters) == 1 else 's' } loaded")
+
     # data
     try:
         df_tally = pd.read_csv(
@@ -287,7 +304,7 @@ def deconvolute(
         no_date=no_date,
         remove_deletions=remove_deletions,
     )
-    preproc = preproc.filter_mutations()
+    preproc = preproc.filter_mutations(filters=filters)
 
     print("deconvolve all")
     np.random.seed(seed)

diff --git a/lollipop/preprocessors.py b/lollipop/preprocessors.py
@@ -1,6 +1,9 @@
 import pandas as pd
 import numpy as np
+from functools import reduce
+import re
 import sys
+from pandas.api.types import is_numeric_dtype
 
 
 class DataPreprocesser:
@@ -118,6 +121,93 @@ def general_preprocess(
 
     def filter_mutations(self, filters=None):
         """filter out hardcoded problematic mutations"""
+        if filters is None:
+            return self
+
+        types = self.df_tally.dtypes
+
+        rxprser = re.compile(
+            r"^ *(?:(?P<col>"
+            + r"|".join(self.df_tally.columns)
+            + r")|(?P<bad>\w+)) *(?P<op>in|[<>=~!]*) *(?P<qv>['\"]?)(?P<val>.+)(?P=qv) *$"
+        )
+
+        def apply_filter_statement(name, fs):
+            """parse a single statement from a filter and apply it, returning a boolean series"""
+            m = rxprser.search(fs)
+            assert m, f"Cannot parse statement <{fs}> in filter {name}"
+            m = m.groupdict()
+
+            assert m[
+                "col"
+            ], f"bad column name {m['bad']}, not in list: {self.df_tally.columns}, while parsing statement <{fs}> in filter {name}"
+
+            # HACK handle 'date' column differently, to force datatypes
+            col = (
+                pd.to_datetime(self.df_tally["date"])
+                if "date" == m["col"]
+                else self.df_tally[m["col"]]
+            )
+            val = (
+                np.datetime64(m["val"])
+                if "date" == m["col"]
+                else (
+                    pd.to_numeric(m["val"])
+                    if is_numeric_dtype(types[m["col"]])
+                    else m["val"]
+                )
+            )
+
+            # apply operator
+            match m["op"]:
+                case "=" | "==" | "" as e:
+                    if e == "":
+                        assert (
+                            " " not in val
+                        ), "Do not use values with space <{val}> when using no operator (implicit 'equals'). (while parsing statement <{fs}> in filter {name})"
+                    return col == val
+                case "!=" | "!":
+                    return col != val
+                case "<":
+                    return col < val
+                case "<=" | "=<":
+                    return col <= val
+                case ">=" | ">=":
+                    return col >= val
+                case ">":
+                    return col > val
+                case "in":
+                    # unpack list
+                    return col.isin(
+                        [
+                            v.strip("\"' ")
+                            for v in val.lstrip("[ ").rstrip(" ]").split(",")
+                        ]
+                    )
+                case "~" | "=~" | "~=":
+                    return col.str.contains(
+                        val[1, -2] if val[0] == val[-1] in "/@" else val
+                    )
+                case "!~" | "~!":
+                    return ~(
+                        col.str.contains(
+                            val[1, -2] if val[0] == val[-1] in "/@" else val
+                        )
+                    )
+                case _ as o:
+                    raise ValueError(
+                        f"unknown operator {o}, while parsing statement <{fs}> in filter {name}"
+                    )
+
+        for name, fl in filters.items():
+            print(f"filter {name}")
+
+            self.df_tally = self.df_tally[
+                ~reduce(
+                    (lambda x, y: x & y),
+                    [apply_filter_statement(name, fstatmt) for fstatmt in fl],
+                )
+            ]
 
         # HACK completely disable filters
         return self