diff --git a/modules/preprocess.R b/modules/preprocess.R new file mode 100644 index 0000000..21afe3a --- /dev/null +++ b/modules/preprocess.R @@ -0,0 +1,29 @@ +# code for preprocessing of the samplesheets + +# At the very beginning of the pipeline, we need to verify that the samplesheet makes sense, and sanitize it. +# For instance, the samplesheet used by nf-core/rnaseq can contain multiple rows for the same sample +# (in case multiple fastq files need to be merged). + +# Input: samplesheet as provided by user +# Output: cleaned samplesheet with one row per sample. + +library (dplyr) + +# load input files, e.g. test data + +sampleAnnotation <- read_csv("https://raw.githubusercontent.com/icbi-lab/nf-deseq2/main/tests/testdata/example_nfcore/rnaseq_samplesheet_nfcore-3.1.csv", show_col_types = FALSE) +geneCounts <- read_tsv("https://raw.githubusercontent.com/icbi-lab/nf-deseq2/main/tests/testdata/example_nfcore/salmon.merged.gene_counts.subset.tsv", show_col_types = F) + +sampleAnnotation2 <- sampleAnnotation %>% + select(-fastq_1, -fastq_2) %>% + distinct() # filter distinct rows + +# another approach +geneCounts2 <- select(geneCounts, -gene_id, -gene_name) %>% + colnames() +outputSheet <- filter(sampleAnnotation, sample==geneCounts2) # this preserves two more columns + + +# output cleaned samplesheets +print(sampleAnnotation2) +print(outputSheet)