From ee2da328ee8d40b0270306015b8814ccaddd73e1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Karel=20B=C5=99inda?= Date: Wed, 29 May 2019 17:53:35 -0400 Subject: [PATCH 1/6] Sync header --- pipeline/tests/predict.tsv | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipeline/tests/predict.tsv b/pipeline/tests/predict.tsv index de5d22d..f66d9ef 100644 --- a/pipeline/tests/predict.tsv +++ b/pipeline/tests/predict.tsv @@ -1,4 +1,4 @@ -datetime reads kbps kkmers kmers_prop pgs pgs_pass pg alt_pg bm bm_ST bm_serotype pen_sus pen_pred pen_bm cro_sus cro_pred cro_bm tmp_sus tmp_pred tmp_bm ery_sus ery_pred ery_bm tet_sus tet_pred tet_bm flags +datetime reads kbps kkmers kmers_prop pgs pgs_pass pg alt_pg bm bm_ST bm_serotype pen_ssc pen_pred pen_bm cro_ssc cro_pred cro_bm tmp_ssc tmp_pred tmp_bm ery_ssc ery_pred ery_bm tet_ssc tet_pred tet_bm flags 2017-05-26 17:29:52 0 0 0 0.000 0 0 NA NA NA NA NA 0.000 R NA (NA) 0.000 R NA (NA) 0.000 R NA (NA) 0.000 R NA (NA) 0.000 R NA (NA) [] 2017-05-26 17:30:52 137 319 19 0.060 0.608 1 3 16 8QTW4 63 15A 0.141 R R (0.25) 0.813 S S (0.094) 0.659 S! S (0.19) 0.133 R R (32) 0.141 R r (NA) [D:bm, D:pg, S:bm, S:pg] 2017-05-26 17:31:52 278 646 39 0.061 0.590 1 3 2 8QTW4 63 15A 0.125 R R (0.25) 0.844 S S (0.094) 0.746 S! S (0.19) 0.312 R! R (32) 0.312 R! r (NA) [] From 5678e5d616d38575365b7a8203b4b910f62d54e3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Karel=20B=C5=99inda?= Date: Wed, 29 May 2019 17:53:51 -0400 Subject: [PATCH 2/6] Add comments --- src/rase/rase_prediction_add_flags.py | 43 ++++++++++++++++++++++----- 1 file changed, 35 insertions(+), 8 deletions(-) diff --git a/src/rase/rase_prediction_add_flags.py b/src/rase/rase_prediction_add_flags.py index 66661f6..8747283 100755 --- a/src/rase/rase_prediction_add_flags.py +++ b/src/rase/rase_prediction_add_flags.py @@ -17,13 +17,24 @@ import re +def load_tsv_dl(tsv_fn): + """Load TSV as a list of dictionaries (1 dict per 1 line). + """ + with open(tsv_fn) as f: + tr = [x for x in csv.DictReader(f, delimiter="\t")] + return tr + + def get_cols(tsv_dl): + """Get columns from a list of dictionaries. + """ d = tsv_dl[0] return [x for x in d.keys()] def extract_flag_cols(cols): - # regular expressions for categories to be flagged + """Extract columns for which flags are to be computed. + """ res_flagging = [ re.compile(r"^bm$"), re.compile(r"^serotype$"), @@ -41,14 +52,11 @@ def extract_flag_cols(cols): return flag_cols -def load_tsv_dl(tsv_fn): - with open(tsv_fn) as f: - tr = [x for x in csv.DictReader(f, delimiter="\t")] - return tr - - def get_stabilization_point(tsv_dl, key): - tsv_dl_ = [collections.defaultdict(lambda: "")] + tsv_dl + """Find points of stabilization. + """ + tsv_dl_ = [collections.defaultdict(lambda: "") + ] + tsv_dl # probably to support empty files? final_value = tsv_dl_[-1][key] for i in range(len(tsv_dl_) - 1, -1, -1): if tsv_dl_[i][key] != final_value: @@ -56,7 +64,26 @@ def get_stabilization_point(tsv_dl, key): return i +def add_flags_line(): + for k in flag_cols: + if i == stabilization_points[k]: + flags.append('S:{}'.format(k)) + assert rec[k] != prev_rec[k] + if rec[k] != prev_rec[k]: + if rec[k] == last_rec[k]: + # detected + flags.append('D:{}'.format(k)) + elif prev_rec[k] == last_rec[k]: + # lost successfuly detected + flags.append('L:{}'.format(k)) + flags.sort() + flags_str = str(flags).replace("'", "") + print(*rec.values(), flags_str, sep="\t") + + def add_flags(tsv_fn): + """Read RASE prediction output and add flags. + """ tsv_dl = load_tsv_dl(tsv_fn) cols = get_cols(tsv_dl) flag_cols = extract_flag_cols(cols) From 8f08c6e612cb7e547b44f2a0e0d97c7007e52dee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Karel=20B=C5=99inda?= Date: Wed, 29 May 2019 18:04:25 -0400 Subject: [PATCH 3/6] Make the test more interesting --- pipeline/tests/predict.tsv | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipeline/tests/predict.tsv b/pipeline/tests/predict.tsv index f66d9ef..7fec93d 100644 --- a/pipeline/tests/predict.tsv +++ b/pipeline/tests/predict.tsv @@ -1,5 +1,5 @@ datetime reads kbps kkmers kmers_prop pgs pgs_pass pg alt_pg bm bm_ST bm_serotype pen_ssc pen_pred pen_bm cro_ssc cro_pred cro_bm tmp_ssc tmp_pred tmp_bm ery_ssc ery_pred ery_bm tet_ssc tet_pred tet_bm flags -2017-05-26 17:29:52 0 0 0 0.000 0 0 NA NA NA NA NA 0.000 R NA (NA) 0.000 R NA (NA) 0.000 R NA (NA) 0.000 R NA (NA) 0.000 R NA (NA) [] +2017-05-26 17:29:52 0 0 0 0.000 0 0 NA NA NA NA NA 0.000 R NA (NA) 0.000 R NA (NA) 0.000 R NA (NA) 0.000 R NA (NA) 0.000 S NA (NA) [] 2017-05-26 17:30:52 137 319 19 0.060 0.608 1 3 16 8QTW4 63 15A 0.141 R R (0.25) 0.813 S S (0.094) 0.659 S! S (0.19) 0.133 R R (32) 0.141 R r (NA) [D:bm, D:pg, S:bm, S:pg] 2017-05-26 17:31:52 278 646 39 0.061 0.590 1 3 2 8QTW4 63 15A 0.125 R R (0.25) 0.844 S S (0.094) 0.746 S! S (0.19) 0.312 R! R (32) 0.312 R! r (NA) [] 2017-05-26 17:32:53 414 973 59 0.060 0.672 1 3 2 8QTW4 63 15A 0.135 R R (0.25) 0.824 S S (0.094) 0.754 S! S (0.19) 0.276 R! R (32) 0.276 R! r (NA) [] From 267804ea97e616083be3569b987a7709507a8309 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Karel=20B=C5=99inda?= Date: Wed, 29 May 2019 18:57:16 -0400 Subject: [PATCH 4/6] Add functions for finding detection, stabilization and losing points --- scripts/rase_plot_timeline.R | 65 ++++++++++++++++++++++++++++++++++++ 1 file changed, 65 insertions(+) diff --git a/scripts/rase_plot_timeline.R b/scripts/rase_plot_timeline.R index adfda0a..7100b66 100755 --- a/scripts/rase_plot_timeline.R +++ b/scripts/rase_plot_timeline.R @@ -155,6 +155,71 @@ DfToAnts <- function(df) { } +#' Return list marking final values of resistance predictions. +#' +#' @param col Column of resistance predictions +#' +#' @return List of T/F. +#' +ResPredIsFinal <- function(col) { + a <- factor(col, levels = c("S", "S!", "R!", "R")) + final.states <- head(unique(rev(a)), 2) + is.final <- a == final.states[1] | a == final.states[2] + is.final +} + + +#' Find stabilization points +#' +#' @param col +#' +#' @return List of T/F. +#' +StabilizationPoint <- function(col) { + # detect last block + r <- rle(col) + r$values[] <- F + r$values[length(r$values)] <- T + x <- inverse.rle(r) + # detect its first value + cumsum(x)==1 +} + +#' Find detection points +#' +#' @param col +#' +#' @return List of T/F. +#' +DetectionPoints <- function(col){ + (col-c(0,head(col,-1)))==1 +} + + +#' Find losing points +#' +#' @param col +#' +#' @return List of T/F. +#' +LosingPoints <- function(col){ + (col-c(0,head(col,-1)))==-1 +} + + +#' Return list marking final values of category predictions. +#' +#' @param col Column of category predictions +#' +#' @return List of T/F. +#' +CatPredIsFinal <- function(col) { + final.states <- head(unique(rev(col)), 1) + is.final <- col == final.states[1] + is.final +} + + #' Create a data frame with flag points #' #' @param df Input dataframe From eebe120a149eb6057084094dc1698f7e8c794218 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Karel=20B=C5=99inda?= Date: Wed, 17 Jul 2019 16:22:28 -0400 Subject: [PATCH 5/6] Update documentation of prediction output --- readme.md | 36 +++++++++++++++++------------------- 1 file changed, 17 insertions(+), 19 deletions(-) diff --git a/readme.md b/readme.md index 16cf0d3..b0d26c0 100644 --- a/readme.md +++ b/readme.md @@ -22,11 +22,10 @@ ## Introduction -This repository contains the RASE prediction pipeline. The method uses lineage -calling to identify antibiotic resistant clones from nanopore reads. In our -[paper](https://www.biorxiv.org/content/early/2018/08/29/403204), we -demonstrate on the example of pneumococcus that, using this approach, -antibiotic resistance can be predicted within minutes from the start of +This repository contains the RASE prediction pipeline. The method uses +neighbortyping to infer antibiotic resistance and susceptibility from nanopore +reads. In our [paper](https://www.biorxiv.org/content/early/2018/08/29/403204), +we demonstrate on the examples of pneumococcus and gonococcus that using this approach antibiotic resistance can be predicted within minutes from the start of sequencing. Please, look at the paper for more information. @@ -46,7 +45,7 @@ database are pre-processed: reads are sorted by time of sequencing and the database gets uncompressed (i.e., the full internal ProPhyle k-mer index restored). Subsequently, nanopore reads from individual experiments are compared to the database(s) using [ProPhyle](http://prophyle.github.io), and -isolate, phylogroup and resistance to individual antibiotics predicted - all +isolate, lineage and resistance to individual antibiotics predicted - all as a function of time. Finally, the obtained time characteristics, as well as rank plots for selected moments, are visualized using R. @@ -292,27 +291,26 @@ help`. | column | description | | --- | --- | - | `datetime` | datetime of sequencing or data processing | + | `datetime` | datetime of the read | | `reads` | number of processed reads | - | `bps` | number of processed basepairs | - | `matched bps` | number of matched basepairs (via k-mers) | - | `pgs` | phylogroup score | - | `pgs_ok` | phylogroup score interpretation, `pass` and `fail` for _passing_ and _failing_, respectively | - | `pg1`, `pg2` | predicted and alternative phylogroup, respectively | - | `pg1_bm`, `pg2_bm` | best-matching isolate within the predicted and alternative phylogroup, respectively | + | `kbps` | number of processed bps (thousands) | + | `kkmers` | number of matched k-mers (thousands) | + | `kmers_prop` | proportion of matched k-mers | + | `lns` | lineage score | + | `lns_pass` | lineage score interpretation, 1=_passing_ 0=_failing_ | + | `ln`, `alt_ln` | predicted and alternative lineage, respectively | + | `bm`, `bm_{prop}` | best-matching isolate (nearest neighbor) and its properties | | `pg1_w`, `pg2_w` | their weights | - | `{ant}_sus` | susceptibility score of the antibiotic `{ant}` | - | `{ant}_pr_cat` | susceptibility score interpretation: `S` , `S!`, and `R` for _susceptible best match_, _susceptible best match but high risk of resistance_, _resistant best match_, respectively | - | `{ant}_bm_cat` | category of the best matching isolate: `R`, `S`, `r`, `s` for _resistant_, _susceptible_, _unknown and inferred as resistant_, and _unknown and inferred as susceptible_, respectively | - | `{ant}_r_bm`, `{ant}_s_bm` | best-matching resistant and susceptible isolate within the phylogroup, respectively | - | `{ant}_r_w`, `{ant}_s_w` | their weights | + | `{ant}_ssc` | susceptibility score | + | `{ant}_pred` | interpretation: `S`=susceptible, `R`=non-susceptible, `S!` and `R!`=low confidence calls | + | `{ant}_bm` | resistance information for the best match, `cat (mic)` | * **Prediction output (snapshot).** Tab-separated text file with the following columns: | column | description | | --- | --- | | `taxid` | taxid of a database isolate, `_unassigned_` for reads without any k-mer matches with the database | - | `phylogroup` | phylogroup | + | `lineage` | lineage | | `weight` | weight (cumulative "number of k-mer best matches divided by the number of matches") | | `weight_norm` | normalized `weight` | | `ln` | cumulative "read length divided by number of matches" | From f2530e5903dd7cb4bd709469428238a65b53ab88 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Karel=20B=C5=99inda?= Date: Wed, 17 Jul 2019 16:23:31 -0400 Subject: [PATCH 6/6] Revert "Update documentation of prediction output" This reverts commit eebe120a149eb6057084094dc1698f7e8c794218. --- readme.md | 36 +++++++++++++++++++----------------- 1 file changed, 19 insertions(+), 17 deletions(-) diff --git a/readme.md b/readme.md index b0d26c0..16cf0d3 100644 --- a/readme.md +++ b/readme.md @@ -22,10 +22,11 @@ ## Introduction -This repository contains the RASE prediction pipeline. The method uses -neighbortyping to infer antibiotic resistance and susceptibility from nanopore -reads. In our [paper](https://www.biorxiv.org/content/early/2018/08/29/403204), -we demonstrate on the examples of pneumococcus and gonococcus that using this approach antibiotic resistance can be predicted within minutes from the start of +This repository contains the RASE prediction pipeline. The method uses lineage +calling to identify antibiotic resistant clones from nanopore reads. In our +[paper](https://www.biorxiv.org/content/early/2018/08/29/403204), we +demonstrate on the example of pneumococcus that, using this approach, +antibiotic resistance can be predicted within minutes from the start of sequencing. Please, look at the paper for more information. @@ -45,7 +46,7 @@ database are pre-processed: reads are sorted by time of sequencing and the database gets uncompressed (i.e., the full internal ProPhyle k-mer index restored). Subsequently, nanopore reads from individual experiments are compared to the database(s) using [ProPhyle](http://prophyle.github.io), and -isolate, lineage and resistance to individual antibiotics predicted - all +isolate, phylogroup and resistance to individual antibiotics predicted - all as a function of time. Finally, the obtained time characteristics, as well as rank plots for selected moments, are visualized using R. @@ -291,26 +292,27 @@ help`. | column | description | | --- | --- | - | `datetime` | datetime of the read | + | `datetime` | datetime of sequencing or data processing | | `reads` | number of processed reads | - | `kbps` | number of processed bps (thousands) | - | `kkmers` | number of matched k-mers (thousands) | - | `kmers_prop` | proportion of matched k-mers | - | `lns` | lineage score | - | `lns_pass` | lineage score interpretation, 1=_passing_ 0=_failing_ | - | `ln`, `alt_ln` | predicted and alternative lineage, respectively | - | `bm`, `bm_{prop}` | best-matching isolate (nearest neighbor) and its properties | + | `bps` | number of processed basepairs | + | `matched bps` | number of matched basepairs (via k-mers) | + | `pgs` | phylogroup score | + | `pgs_ok` | phylogroup score interpretation, `pass` and `fail` for _passing_ and _failing_, respectively | + | `pg1`, `pg2` | predicted and alternative phylogroup, respectively | + | `pg1_bm`, `pg2_bm` | best-matching isolate within the predicted and alternative phylogroup, respectively | | `pg1_w`, `pg2_w` | their weights | - | `{ant}_ssc` | susceptibility score | - | `{ant}_pred` | interpretation: `S`=susceptible, `R`=non-susceptible, `S!` and `R!`=low confidence calls | - | `{ant}_bm` | resistance information for the best match, `cat (mic)` | + | `{ant}_sus` | susceptibility score of the antibiotic `{ant}` | + | `{ant}_pr_cat` | susceptibility score interpretation: `S` , `S!`, and `R` for _susceptible best match_, _susceptible best match but high risk of resistance_, _resistant best match_, respectively | + | `{ant}_bm_cat` | category of the best matching isolate: `R`, `S`, `r`, `s` for _resistant_, _susceptible_, _unknown and inferred as resistant_, and _unknown and inferred as susceptible_, respectively | + | `{ant}_r_bm`, `{ant}_s_bm` | best-matching resistant and susceptible isolate within the phylogroup, respectively | + | `{ant}_r_w`, `{ant}_s_w` | their weights | * **Prediction output (snapshot).** Tab-separated text file with the following columns: | column | description | | --- | --- | | `taxid` | taxid of a database isolate, `_unassigned_` for reads without any k-mer matches with the database | - | `lineage` | lineage | + | `phylogroup` | phylogroup | | `weight` | weight (cumulative "number of k-mer best matches divided by the number of matches") | | `weight_norm` | normalized `weight` | | `ln` | cumulative "read length divided by number of matches" |