From 7dcd9b28e33c66155cff49bdf710ed745391033c Mon Sep 17 00:00:00 2001
From: Loncova <zuzana.loncova@i-med.ac.at>
Date: Wed, 15 Dec 2021 17:44:01 +0100
Subject: [PATCH 1/5] first commit

---
 modules/preprocess.R | 1 +
 1 file changed, 1 insertion(+)
 create mode 100644 modules/preprocess.R

diff --git a/modules/preprocess.R b/modules/preprocess.R
new file mode 100644
index 0000000..3c24513
--- /dev/null
+++ b/modules/preprocess.R
@@ -0,0 +1 @@
+# code for preprocessing of the samplesheets
\ No newline at end of file

From 0cb1444f370043f9faff27097367cb7f1d1c07d3 Mon Sep 17 00:00:00 2001
From: Loncova <zuzana.loncova@i-med.ac.at>
Date: Wed, 15 Dec 2021 18:09:42 +0100
Subject: [PATCH 2/5] first commit trial

---
 modules/preprocess.R | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/modules/preprocess.R b/modules/preprocess.R
index 3c24513..b9e41de 100644
--- a/modules/preprocess.R
+++ b/modules/preprocess.R
@@ -1 +1,3 @@
-# code for preprocessing of the samplesheets
\ No newline at end of file
+# code for preprocessing of the samplesheets
+
+# some lines of code
\ No newline at end of file

From bc46059bb6268f7ba29da5cd5eedb8222c9c1667 Mon Sep 17 00:00:00 2001
From: Loncova <zuzana.loncova@i-med.ac.at>
Date: Fri, 21 Jan 2022 10:44:20 +0100
Subject: [PATCH 3/5] code first version

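Load the example samplesheet and gene-count table, and filter the samplesheet to the two groups named in the contrast.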
---
 modules/preprocess.R | 26 +++++++++++++++++++++++++-
 1 file changed, 25 insertions(+), 1 deletion(-)

diff --git a/modules/preprocess.R b/modules/preprocess.R
index b9e41de..41c6511 100644
--- a/modules/preprocess.R
+++ b/modules/preprocess.R
@@ -1,3 +1,27 @@
 # code for preprocessing of the samplesheets
 
-# some lines of code
\ No newline at end of file
+# At the very beginning of the pipeline, we need to verify that the samplesheet makes sense, and sanitize it.
+# For instance, the samplesheet used by nf-core/rnaseq can contain multiple rows for the same sample 
+# (in case multiple fastq files need to be merged).
+
+# Input: samplesheet as provided by user
+# Output: cleaned samplesheet with one row per sample.
+
+library (dplyr)
+
+# load input files, e.g. test data
+
+sampleAnnotation <- read_csv("https://raw.githubusercontent.com/icbi-lab/nf-deseq2/main/tests/testdata/example_nfcore/rnaseq_samplesheet_nfcore-3.1.csv")
+geneCounts <- read_tsv("https://raw.githubusercontent.com/icbi-lab/nf-deseq2/main/tests/testdata/example_nfcore/salmon.merged.gene_counts.subset.tsv")
+
+### notes: 
+### check gene counts for column names with samples
+### take it and filter samplesheets based on that - i.e. output samplesheet will have only 6 lines in total, only those, that are different
+
+cond_col = "group"
+# sample_col = NULL
+contrast = c("group", "grpA", "grpB")
+
+sampleAnnotation2 <- filter(sampleAnnotation, get(cond_col) %in% contrast[2:3])
+
+#### end end end

From 33de3e2d66e6958d8fb918f4d0b0d04f9bf657b5 Mon Sep 17 00:00:00 2001
From: Loncova <zuzana.loncova@i-med.ac.at>
Date: Fri, 21 Jan 2022 16:43:01 +0100
Subject: [PATCH 4/5] samplesheet preprocessing

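First attempt at cleaning the samplesheet: drop the fastq columns and deduplicate the rows, or alternatively filter the rows by the sample columns of the gene-count table.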
---
 modules/preprocess.R | 22 ++++++++++++----------
 1 file changed, 12 insertions(+), 10 deletions(-)

diff --git a/modules/preprocess.R b/modules/preprocess.R
index 41c6511..db10929 100644
--- a/modules/preprocess.R
+++ b/modules/preprocess.R
@@ -11,17 +11,19 @@ library (dplyr)
 
 # load input files, e.g. test data
 
-sampleAnnotation <- read_csv("https://raw.githubusercontent.com/icbi-lab/nf-deseq2/main/tests/testdata/example_nfcore/rnaseq_samplesheet_nfcore-3.1.csv")
-geneCounts <- read_tsv("https://raw.githubusercontent.com/icbi-lab/nf-deseq2/main/tests/testdata/example_nfcore/salmon.merged.gene_counts.subset.tsv")
+sampleAnnotation <- readr::read_csv("https://raw.githubusercontent.com/icbi-lab/nf-deseq2/main/tests/testdata/example_nfcore/rnaseq_samplesheet_nfcore-3.1.csv", show_col_types = FALSE) # readr:: because only dplyr is attached
+geneCounts <- readr::read_tsv("https://raw.githubusercontent.com/icbi-lab/nf-deseq2/main/tests/testdata/example_nfcore/salmon.merged.gene_counts.subset.tsv", show_col_types = FALSE)
 
-### notes: 
-### check gene counts for column names with samples
-### take it and filter samplesheets based on that - i.e. output samplesheet will have only 6 lines in total, only those, that are different
+# drop fastq_1/fastq_2, then keep each remaining row only once
+sampleAnnotation2 <- select(sampleAnnotation, -fastq_1, -fastq_2) %>%
+  distinct()
 
-cond_col = "group"
-# sample_col = NULL
-contrast = c("group", "grpA", "grpB")
+# another approach: keep the samplesheet rows whose sample has a count column
+geneCounts2 <- select(geneCounts, -gene_id, -gene_name) %>%
+  colnames()
+outputSheet <- filter(sampleAnnotation, sample %in% geneCounts2) # unlike sampleAnnotation2, this keeps fastq_1/fastq_2
 
-sampleAnnotation2 <- filter(sampleAnnotation, get(cond_col) %in% contrast[2:3])
 
-#### end end end
+#output cleaned samplesheet
+print(sampleAnnotation2)
+print(outputSheet)

From 1a0bdd851da5a9cb2b84004647b6a468b8698163 Mon Sep 17 00:00:00 2001
From: Loncova <zuzana.loncova@i-med.ac.at>
Date: Fri, 21 Jan 2022 16:45:36 +0100
Subject: [PATCH 5/5] samplesheet preprocessing

first trial
---
 modules/preprocess.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/preprocess.R b/modules/preprocess.R
index db10929..21afe3a 100644
--- a/modules/preprocess.R
+++ b/modules/preprocess.R
@@ -24,6 +24,6 @@ geneCounts2 <- select(geneCounts, -gene_id, -gene_name) %>%
 outputSheet <- filter(sampleAnnotation, sample %in% geneCounts2) # unlike sampleAnnotation2, this keeps fastq_1/fastq_2
 
 
-#output cleaned samplesheet
+# output cleaned samplesheets
 print(sampleAnnotation2)
 print(outputSheet)