From ee2da328ee8d40b0270306015b8814ccaddd73e1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Karel=20B=C5=99inda?= <karel.brinda@gmail.com>
Date: Wed, 29 May 2019 17:53:35 -0400
Subject: [PATCH 1/6] Sync header

---
 pipeline/tests/predict.tsv | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pipeline/tests/predict.tsv b/pipeline/tests/predict.tsv
index de5d22d..f66d9ef 100644
--- a/pipeline/tests/predict.tsv
+++ b/pipeline/tests/predict.tsv
@@ -1,4 +1,4 @@
-datetime	reads	kbps	kkmers	kmers_prop	pgs	pgs_pass	pg	alt_pg	bm	bm_ST	bm_serotype	pen_sus	pen_pred	pen_bm	cro_sus	cro_pred	cro_bm	tmp_sus	tmp_pred	tmp_bm	ery_sus	ery_pred	ery_bm	tet_sus	tet_pred	tet_bm	flags
+datetime	reads	kbps	kkmers	kmers_prop	pgs	pgs_pass	pg	alt_pg	bm	bm_ST	bm_serotype	pen_ssc	pen_pred	pen_bm	cro_ssc	cro_pred	cro_bm	tmp_ssc	tmp_pred	tmp_bm	ery_ssc	ery_pred	ery_bm	tet_ssc	tet_pred	tet_bm	flags
 2017-05-26 17:29:52	0	0	0	0.000	0	0	NA	NA	NA	NA	NA	0.000	R	NA (NA)	0.000	R	NA (NA)	0.000	R	NA (NA)	0.000	R	NA (NA)	0.000	R	NA (NA)	[]
 2017-05-26 17:30:52	137	319	19	0.060	0.608	1	3	16	8QTW4	63	15A	0.141	R	R (0.25)	0.813	S	S (0.094)	0.659	S!	S (0.19)	0.133	R	R (32)	0.141	R	r (NA)	[D:bm, D:pg, S:bm, S:pg]
 2017-05-26 17:31:52	278	646	39	0.061	0.590	1	3	2	8QTW4	63	15A	0.125	R	R (0.25)	0.844	S	S (0.094)	0.746	S!	S (0.19)	0.312	R!	R (32)	0.312	R!	r (NA)	[]

From 5678e5d616d38575365b7a8203b4b910f62d54e3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Karel=20B=C5=99inda?= <karel.brinda@gmail.com>
Date: Wed, 29 May 2019 17:53:51 -0400
Subject: [PATCH 2/6] Add comments

---
 src/rase/rase_prediction_add_flags.py | 43 ++++++++++++++++++++++-----
 1 file changed, 35 insertions(+), 8 deletions(-)

diff --git a/src/rase/rase_prediction_add_flags.py b/src/rase/rase_prediction_add_flags.py
index 66661f6..8747283 100755
--- a/src/rase/rase_prediction_add_flags.py
+++ b/src/rase/rase_prediction_add_flags.py
@@ -17,13 +17,24 @@
 import re
 
 
+def load_tsv_dl(tsv_fn):
+    """Load TSV as a list of dictionaries (1 dict per 1 line).
+    """
+    with open(tsv_fn) as f:
+        tr = [x for x in csv.DictReader(f, delimiter="\t")]
+    return tr
+
+
 def get_cols(tsv_dl):
+    """Get columns from a list of dictionaries.
+    """
     d = tsv_dl[0]
     return [x for x in d.keys()]
 
 
 def extract_flag_cols(cols):
-    # regular expressions for categories to be flagged
+    """Extract columns for which flags are to be computed.
+    """
     res_flagging = [
         re.compile(r"^bm$"),
         re.compile(r"^serotype$"),
@@ -41,14 +52,11 @@ def extract_flag_cols(cols):
     return flag_cols
 
 
-def load_tsv_dl(tsv_fn):
-    with open(tsv_fn) as f:
-        tr = [x for x in csv.DictReader(f, delimiter="\t")]
-    return tr
-
-
 def get_stabilization_point(tsv_dl, key):
-    tsv_dl_ = [collections.defaultdict(lambda: "")] + tsv_dl
+    """Find points of stabilization.
+    """
+    tsv_dl_ = [collections.defaultdict(lambda: "")
+               ] + tsv_dl  # probably to support empty files?
     final_value = tsv_dl_[-1][key]
     for i in range(len(tsv_dl_) - 1, -1, -1):
         if tsv_dl_[i][key] != final_value:
@@ -56,7 +64,26 @@ def get_stabilization_point(tsv_dl, key):
     return i
 
 
+def add_flags_line():
+    for k in flag_cols:
+        if i == stabilization_points[k]:
+            flags.append('S:{}'.format(k))
+            assert rec[k] != prev_rec[k]
+        if rec[k] != prev_rec[k]:
+            if rec[k] == last_rec[k]:
+                # detected
+                flags.append('D:{}'.format(k))
+            elif prev_rec[k] == last_rec[k]:
+                # lost successfuly detected
+                flags.append('L:{}'.format(k))
+    flags.sort()
+    flags_str = str(flags).replace("'", "")
+    print(*rec.values(), flags_str, sep="\t")
+
+
 def add_flags(tsv_fn):
+    """Read RASE prediction output and add flags.
+    """
     tsv_dl = load_tsv_dl(tsv_fn)
     cols = get_cols(tsv_dl)
     flag_cols = extract_flag_cols(cols)

From 8f08c6e612cb7e547b44f2a0e0d97c7007e52dee Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Karel=20B=C5=99inda?= <karel.brinda@gmail.com>
Date: Wed, 29 May 2019 18:04:25 -0400
Subject: [PATCH 3/6] Make the test more interesting

---
 pipeline/tests/predict.tsv | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pipeline/tests/predict.tsv b/pipeline/tests/predict.tsv
index f66d9ef..7fec93d 100644
--- a/pipeline/tests/predict.tsv
+++ b/pipeline/tests/predict.tsv
@@ -1,5 +1,5 @@
 datetime	reads	kbps	kkmers	kmers_prop	pgs	pgs_pass	pg	alt_pg	bm	bm_ST	bm_serotype	pen_ssc	pen_pred	pen_bm	cro_ssc	cro_pred	cro_bm	tmp_ssc	tmp_pred	tmp_bm	ery_ssc	ery_pred	ery_bm	tet_ssc	tet_pred	tet_bm	flags
-2017-05-26 17:29:52	0	0	0	0.000	0	0	NA	NA	NA	NA	NA	0.000	R	NA (NA)	0.000	R	NA (NA)	0.000	R	NA (NA)	0.000	R	NA (NA)	0.000	R	NA (NA)	[]
+2017-05-26 17:29:52	0	0	0	0.000	0	0	NA	NA	NA	NA	NA	0.000	R	NA (NA)	0.000	R	NA (NA)	0.000	R	NA (NA)	0.000	R	NA (NA)	0.000	S	NA (NA)	[]
 2017-05-26 17:30:52	137	319	19	0.060	0.608	1	3	16	8QTW4	63	15A	0.141	R	R (0.25)	0.813	S	S (0.094)	0.659	S!	S (0.19)	0.133	R	R (32)	0.141	R	r (NA)	[D:bm, D:pg, S:bm, S:pg]
 2017-05-26 17:31:52	278	646	39	0.061	0.590	1	3	2	8QTW4	63	15A	0.125	R	R (0.25)	0.844	S	S (0.094)	0.746	S!	S (0.19)	0.312	R!	R (32)	0.312	R!	r (NA)	[]
 2017-05-26 17:32:53	414	973	59	0.060	0.672	1	3	2	8QTW4	63	15A	0.135	R	R (0.25)	0.824	S	S (0.094)	0.754	S!	S (0.19)	0.276	R!	R (32)	0.276	R!	r (NA)	[]

From 267804ea97e616083be3569b987a7709507a8309 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Karel=20B=C5=99inda?= <karel.brinda@gmail.com>
Date: Wed, 29 May 2019 18:57:16 -0400
Subject: [PATCH 4/6] Add functions for finding detection, stabilization and
 losing points

---
 scripts/rase_plot_timeline.R | 65 ++++++++++++++++++++++++++++++++++++
 1 file changed, 65 insertions(+)

diff --git a/scripts/rase_plot_timeline.R b/scripts/rase_plot_timeline.R
index adfda0a..7100b66 100755
--- a/scripts/rase_plot_timeline.R
+++ b/scripts/rase_plot_timeline.R
@@ -155,6 +155,71 @@ DfToAnts <- function(df) {
 }
 
 
+#' Return list marking final values of resistance predictions.
+#'
+#' @param col Column of resistance predictions
+#'
+#' @return List of T/F.
+#'
+ResPredIsFinal <- function(col) {
+    a <- factor(col, levels = c("S", "S!", "R!", "R"))
+    final.states <- head(unique(rev(a)), 2)
+    is.final <-  a == final.states[1] | a == final.states[2]
+    is.final
+}
+
+
+#' Find stabilization points
+#'
+#' @param col
+#'
+#' @return List of T/F.
+#'
+StabilizationPoint <- function(col) {
+    # detect last block
+    r <- rle(col)
+    r$values[] <- F
+    r$values[length(r$values)] <- T
+    x <- inverse.rle(r)
+    # detect its first value
+    cumsum(x)==1
+}
+
+#' Find detection points
+#'
+#' @param col
+#'
+#' @return List of T/F.
+#'
+DetectionPoints <- function(col){
+    (col-c(0,head(col,-1)))==1
+}
+
+
+#' Find losing points
+#'
+#' @param col
+#'
+#' @return List of T/F.
+#'
+LosingPoints <- function(col){
+    (col-c(0,head(col,-1)))==-1
+}
+
+
+#' Return list marking final values of category predictions.
+#'
+#' @param col Column of category predictions
+#'
+#' @return List of T/F.
+#'
+CatPredIsFinal <- function(col) {
+    final.states <- head(unique(rev(col)), 1)
+    is.final <- col == final.states[1]
+    is.final
+}
+
+
 #' Create a data frame with flag points
 #'
 #' @param df Input dataframe

From eebe120a149eb6057084094dc1698f7e8c794218 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Karel=20B=C5=99inda?= <karel.brinda@gmail.com>
Date: Wed, 17 Jul 2019 16:22:28 -0400
Subject: [PATCH 5/6] Update documentation of prediction output

---
 readme.md | 36 +++++++++++++++++-------------------
 1 file changed, 17 insertions(+), 19 deletions(-)

diff --git a/readme.md b/readme.md
index 16cf0d3..b0d26c0 100644
--- a/readme.md
+++ b/readme.md
@@ -22,11 +22,10 @@
 
 ## Introduction
 
-This repository contains the RASE prediction pipeline. The method uses lineage
-calling to identify antibiotic resistant clones from nanopore reads. In our
-[paper](https://www.biorxiv.org/content/early/2018/08/29/403204), we
-demonstrate on the example of pneumococcus that, using this approach,
-antibiotic resistance can be predicted within minutes from the start of
+This repository contains the RASE prediction pipeline. The method uses
+neighbortyping to infer antibiotic resistance and susceptibility from nanopore
+reads. In our [paper](https://www.biorxiv.org/content/early/2018/08/29/403204),
+we demonstrate on the examples of pneumococcus and gonococcus that using this approach antibiotic resistance can be predicted within minutes from the start of
 sequencing. Please, look at the paper for more information.
 
 
@@ -46,7 +45,7 @@ database are pre-processed: reads are sorted by time of sequencing and the
 database gets uncompressed (i.e., the full internal ProPhyle k-mer index
 restored).  Subsequently, nanopore reads from individual experiments are
 compared to the database(s) using [ProPhyle](http://prophyle.github.io), and
-isolate, phylogroup and resistance to individual antibiotics predicted - all
+isolate, lineage and resistance to individual antibiotics predicted - all
 as a function of time.  Finally, the obtained time characteristics, as well as
 rank plots for selected moments, are visualized using R.
 
@@ -292,27 +291,26 @@ help`.
 
   | column | description |
   | --- | --- |
-  | `datetime` | datetime of sequencing or data processing |
+  | `datetime` | datetime of the read |
   | `reads` | number of processed reads |
-  | `bps` | number of processed basepairs |
-  | `matched bps` | number of matched basepairs (via k-mers) |
-  | `pgs` | phylogroup score |
-  | `pgs_ok` | phylogroup score interpretation, `pass` and `fail` for _passing_ and _failing_, respectively |
-  | `pg1`, `pg2` | predicted and alternative phylogroup, respectively |
-  | `pg1_bm`, `pg2_bm` | best-matching isolate within the predicted and alternative phylogroup, respectively |
+  | `kbps` | number of processed bps (thousands) |
+  | `kkmers` | number of matched k-mers (thousands) |
+  | `kmers_prop` | proportion of matched k-mers |
+  | `lns` | lineage score |
+  | `lns_pass` | lineage score interpretation, 1=_passing_ 0=_failing_ |
+  | `ln`, `alt_ln` | predicted and alternative lineage, respectively |
+  | `bm`, `bm_{prop}` | best-matching isolate (nearest neighbor) and its properties |
   | `pg1_w`, `pg2_w` | their weights |
-  | `{ant}_sus` | susceptibility score of the antibiotic `{ant}` |
-  | `{ant}_pr_cat` | susceptibility score interpretation: `S` , `S!`, and `R` for _susceptible best match_, _susceptible best match but high risk of resistance_, _resistant best match_, respectively |
-  | `{ant}_bm_cat` | category of the best matching isolate: `R`, `S`, `r`, `s` for _resistant_, _susceptible_, _unknown and inferred as resistant_, and _unknown and inferred as susceptible_, respectively |
-  | `{ant}_r_bm`, `{ant}_s_bm` | best-matching resistant and susceptible isolate within the phylogroup, respectively |
-  | `{ant}_r_w`, `{ant}_s_w` | their weights |
+  | `{ant}_ssc` | susceptibility score |
+  | `{ant}_pred` | interpretation: `S`=susceptible, `R`=non-susceptible, `S!` and `R!`=low confidence calls |
+  | `{ant}_bm` | resistance information for the best match, `cat (mic)` |
 
 * **Prediction output (snapshot).** Tab-separated text file with the following columns:
 
   | column | description |
   | --- | --- |
   | `taxid` | taxid of a database isolate, `_unassigned_` for reads without any k-mer matches with the database |
-  | `phylogroup` | phylogroup |
+  | `lineage` | lineage |
   | `weight` | weight (cumulative "number of k-mer best matches divided by the number of matches") |
   | `weight_norm` | normalized `weight` |
   | `ln` | cumulative "read length divided by number of matches" |

From f2530e5903dd7cb4bd709469428238a65b53ab88 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Karel=20B=C5=99inda?= <karel.brinda@gmail.com>
Date: Wed, 17 Jul 2019 16:23:31 -0400
Subject: [PATCH 6/6] Revert "Update documentation of prediction output"

This reverts commit eebe120a149eb6057084094dc1698f7e8c794218.
---
 readme.md | 36 +++++++++++++++++++-----------------
 1 file changed, 19 insertions(+), 17 deletions(-)

diff --git a/readme.md b/readme.md
index b0d26c0..16cf0d3 100644
--- a/readme.md
+++ b/readme.md
@@ -22,10 +22,11 @@
 
 ## Introduction
 
-This repository contains the RASE prediction pipeline. The method uses
-neighbortyping to infer antibiotic resistance and susceptibility from nanopore
-reads. In our [paper](https://www.biorxiv.org/content/early/2018/08/29/403204),
-we demonstrate on the examples of pneumococcus and gonococcus that using this approach antibiotic resistance can be predicted within minutes from the start of
+This repository contains the RASE prediction pipeline. The method uses lineage
+calling to identify antibiotic resistant clones from nanopore reads. In our
+[paper](https://www.biorxiv.org/content/early/2018/08/29/403204), we
+demonstrate on the example of pneumococcus that, using this approach,
+antibiotic resistance can be predicted within minutes from the start of
 sequencing. Please, look at the paper for more information.
 
 
@@ -45,7 +46,7 @@ database are pre-processed: reads are sorted by time of sequencing and the
 database gets uncompressed (i.e., the full internal ProPhyle k-mer index
 restored).  Subsequently, nanopore reads from individual experiments are
 compared to the database(s) using [ProPhyle](http://prophyle.github.io), and
-isolate, lineage and resistance to individual antibiotics predicted - all
+isolate, phylogroup and resistance to individual antibiotics predicted - all
 as a function of time.  Finally, the obtained time characteristics, as well as
 rank plots for selected moments, are visualized using R.
 
@@ -291,26 +292,27 @@ help`.
 
   | column | description |
   | --- | --- |
-  | `datetime` | datetime of the read |
+  | `datetime` | datetime of sequencing or data processing |
   | `reads` | number of processed reads |
-  | `kbps` | number of processed bps (thousands) |
-  | `kkmers` | number of matched k-mers (thousands) |
-  | `kmers_prop` | proportion of matched k-mers |
-  | `lns` | lineage score |
-  | `lns_pass` | lineage score interpretation, 1=_passing_ 0=_failing_ |
-  | `ln`, `alt_ln` | predicted and alternative lineage, respectively |
-  | `bm`, `bm_{prop}` | best-matching isolate (nearest neighbor) and its properties |
+  | `bps` | number of processed basepairs |
+  | `matched bps` | number of matched basepairs (via k-mers) |
+  | `pgs` | phylogroup score |
+  | `pgs_ok` | phylogroup score interpretation, `pass` and `fail` for _passing_ and _failing_, respectively |
+  | `pg1`, `pg2` | predicted and alternative phylogroup, respectively |
+  | `pg1_bm`, `pg2_bm` | best-matching isolate within the predicted and alternative phylogroup, respectively |
   | `pg1_w`, `pg2_w` | their weights |
-  | `{ant}_ssc` | susceptibility score |
-  | `{ant}_pred` | interpretation: `S`=susceptible, `R`=non-susceptible, `S!` and `R!`=low confidence calls |
-  | `{ant}_bm` | resistance information for the best match, `cat (mic)` |
+  | `{ant}_sus` | susceptibility score of the antibiotic `{ant}` |
+  | `{ant}_pr_cat` | susceptibility score interpretation: `S` , `S!`, and `R` for _susceptible best match_, _susceptible best match but high risk of resistance_, _resistant best match_, respectively |
+  | `{ant}_bm_cat` | category of the best matching isolate: `R`, `S`, `r`, `s` for _resistant_, _susceptible_, _unknown and inferred as resistant_, and _unknown and inferred as susceptible_, respectively |
+  | `{ant}_r_bm`, `{ant}_s_bm` | best-matching resistant and susceptible isolate within the phylogroup, respectively |
+  | `{ant}_r_w`, `{ant}_s_w` | their weights |
 
 * **Prediction output (snapshot).** Tab-separated text file with the following columns:
 
   | column | description |
   | --- | --- |
   | `taxid` | taxid of a database isolate, `_unassigned_` for reads without any k-mer matches with the database |
-  | `lineage` | lineage |
+  | `phylogroup` | phylogroup |
   | `weight` | weight (cumulative "number of k-mer best matches divided by the number of matches") |
   | `weight_norm` | normalized `weight` |
   | `ln` | cumulative "read length divided by number of matches" |