Merge pull request #102 from tkchafin/dev

merge bwamem_index and remove unneeded multiqc options
sanger-tol · Aug 12, 2024 · eb95288 · eb95288
2 parents c07a1dd + 4e220da
commit eb95288
Show file tree

Hide file tree

Showing 8 changed files with 62 additions and 93 deletions.
diff --git a/.nf-core.yml b/.nf-core.yml
@@ -2,10 +2,12 @@ nf_core_version: 2.14.1
 repository_type: pipeline
 template:
   name: readmapping
+  prefix: sanger-tol
 lint:
   files_exist:
     - assets/multiqc_config.yml
     - assets/nf-core-readmapping_logo_light.png
+    - assets/methods_description_template.yml
     - conf/igenomes.config
     - docs/images/nf-core-readmapping_logo_dark.png
     - docs/images/nf-core-readmapping_logo_light.png

diff --git a/assets/methods_description_template.yml b/assets/methods_description_template.yml
diff --git a/assets/schema_input.json b/assets/schema_input.json
@@ -20,8 +20,7 @@
                 "meta": ["datatype"]
             },
             "datafile": {
-                "format": "file-path",
-                "exists": true,
+                "format": "string",
                 "pattern": "^\\S+$",
                 "errorMessage": "Data file for reads cannot contain spaces and must have extension 'cram', 'bam', '.fq.gz' or '.fastq.gz'",
                 "meta": ["datafile"]

diff --git a/nextflow.config b/nextflow.config
@@ -32,14 +32,8 @@ params {
     help                       = false
     version                    = false
     validate_params            = true
-    schema_ignore_params       = 'genomes'
+    schema_ignore_params       = ''
 
-    // MultiQC options
-    multiqc_config             = null
-    multiqc_title              = null
-    multiqc_logo               = null
-    max_multiqc_email_size     = '25.MB'
-    multiqc_methods_description = null
 
     // Config options
     config_profile_name        = null
@@ -58,7 +52,7 @@ params {
     // Schema validation default options
     validationFailUnrecognisedParams = false
     validationLenientMode            = false
-    validationSchemaIgnoreParams     = 'genomes,igenomes_base'
+    validationSchemaIgnoreParams     = ''
     validationShowHiddenParams       = false
     validate_params                  = true
 

diff --git a/nextflow_schema.json b/nextflow_schema.json
@@ -42,11 +42,6 @@
                     "fa_icon": "fas fa-envelope",
                     "help_text": "Set this parameter to your e-mail address to get a summary e-mail with details of the run sent to you when the workflow exits. If set in your user config file (`~/.nextflow/config`) then you don't need to specify this on the command line for every run.",
                     "pattern": "^([a-zA-Z0-9_\\-\\.]+)@([a-zA-Z0-9_\\-\\.]+)\\.([a-zA-Z]{2,5})$"
-                },
-                "multiqc_title": {
-                    "type": "string",
-                    "description": "MultiQC report title. Printed as page header, used for filename if not otherwise specified.",
-                    "fa_icon": "fas fa-file-signature"
                 }
             }
         },
@@ -219,14 +214,6 @@
                     "fa_icon": "fas fa-remove-format",
                     "hidden": true
                 },
-                "max_multiqc_email_size": {
-                    "type": "string",
-                    "description": "File size limit when attaching MultiQC reports to summary emails.",
-                    "pattern": "^\\d+(\\.\\d+)?\\.?\\s*(K|M|G|T)?B$",
-                    "default": "25.MB",
-                    "fa_icon": "fas fa-file-upload",
-                    "hidden": true
-                },
                 "monochrome_logs": {
                     "type": "boolean",
                     "description": "Do not use coloured log outputs.",
@@ -244,25 +231,6 @@
                     "help_text": "Incoming hook URL for messaging service. Currently, MS Teams and Slack are supported.",
                     "hidden": true
                 },
-                "multiqc_config": {
-                    "type": "string",
-                    "format": "file-path",
-                    "description": "Custom config file to supply to MultiQC.",
-                    "fa_icon": "fas fa-cog",
-                    "hidden": true
-                },
-                "multiqc_logo": {
-                    "type": "string",
-                    "description": "Directory to keep pipeline Nextflow logs and reports.",
-                    "fa_icon": "fas fa-cogs",
-                    "hidden": true
-                },
-                "multiqc_methods_description": {
-                    "type": "string",
-                    "description": "Custom MultiQC yaml file containing HTML including a methods description.",
-                    "fa_icon": "fas fa-cog",
-                    "hidden": true
-                },
                 "validate_params": {
                     "type": "boolean",
                     "description": "Boolean whether to validate parameters against the schema at runtime",
@@ -293,7 +261,6 @@
                 },
                 "schema_ignore_params": {
                     "type": "string",
-                    "default": "genomes",
                     "hidden": true
                 }
             }

diff --git a/subworkflows/local/prepare_genome.nf b/subworkflows/local/prepare_genome.nf
@@ -12,7 +12,6 @@ workflow PREPARE_GENOME {
     take:
     fasta    // channel: [ meta, /path/to/fasta ]
 
-
     main:
     ch_versions = Channel.empty()
 
@@ -33,24 +32,27 @@ workflow PREPARE_GENOME {
     UNMASK ( ch_fasta )
     ch_versions = ch_versions.mix ( UNMASK.out.versions.first() )
 
-
     // Generate BWA index
-    if ( params.bwamem2_index ) {
-        Channel.fromPath ( params.bwamem2_index )
-        | combine ( ch_fasta )
-        | map { bwa, meta, fa -> [ meta, bwa ] }
-        | set { ch_bwamem }
-
-        if ( params.bwamem2_index.endsWith('.tar.gz') ) {
-            ch_bwamem2_index = UNTAR ( ch_bwamem ).untar
-            ch_versions      = ch_versions.mix ( UNTAR.out.versions.first() )
+    if ( checkShortReads( params.input ) ) {
+        if ( params.bwamem2_index ) {
+            Channel.fromPath ( params.bwamem2_index )
+            | combine ( ch_fasta )
+            | map { bwa, meta, fa -> [ meta, bwa ] }
+            | set { ch_bwamem }
+
+            if ( params.bwamem2_index.endsWith('.tar.gz') ) {
+                ch_bwamem2_index = UNTAR ( ch_bwamem ).untar
+                ch_versions      = ch_versions.mix ( UNTAR.out.versions.first() )
+            } else {
+                ch_bwamem2_index = ch_bwamem
+            }
+
         } else {
-            ch_bwamem2_index = ch_bwamem
+            ch_bwamem2_index = BWAMEM2_INDEX ( UNMASK.out.fasta ).index
+            ch_versions      = ch_versions.mix ( BWAMEM2_INDEX.out.versions.first() )
         }
-
     } else {
-        ch_bwamem2_index = BWAMEM2_INDEX ( UNMASK.out.fasta ).index
-        ch_versions      = ch_versions.mix ( BWAMEM2_INDEX.out.versions.first() )
+        ch_bwamem2_index = Channel.empty()
     }
 
 
@@ -59,3 +61,32 @@ workflow PREPARE_GENOME {
     bwaidx   = ch_bwamem2_index.first()    // channel: [ meta, /path/to/bwamem2/index_dir/ ]
     versions = ch_versions                 // channel: [ versions.yml ]
 }
+
+//
+// Check for short reads in the samplesheet: returns true when any data row holds
+// 'illumina' or 'hic' (case-insensitive, surrounding whitespace ignored) in `columnToCheck`.
+// Calls error() when the header row is missing or does not contain that column.
+// Header-only files, blank lines and rows with too few fields simply do not match.
+//
+def checkShortReads(filePath, columnToCheck="datatype") {
+    // Datatype values that count as short reads
+    def valuesToCheck = ['illumina', 'hic']
+
+    // NOTE(review): java.io.File reads local paths only - confirm remote samplesheets are not expected
+    def csvLines = new File(filePath).readLines()
+
+    // Locate the column in the header row; an empty file has no header and ends in error() below
+    def header = csvLines ? csvLines[0].split(',') : []
+    def columnIndex = header.findIndexOf { it.trim() == columnToCheck }
+    if (columnIndex == -1) {
+        error("Column '${columnToCheck}' not found in the CSV header.")
+    }
+
+    // drop(1) skips the header and, unlike [1..-1], is safe when there are no data rows
+    return csvLines.drop(1).any { line ->
+        def columns = line.split(',')
+        // Guard the index so blank or truncated rows are skipped instead of throwing
+        columnIndex < columns.size() && valuesToCheck.contains(columns[columnIndex].trim().toLowerCase())
+    }
+}
+
diff --git a/subworkflows/local/utils_nfcore_readmapping_pipeline/main.nf b/subworkflows/local/utils_nfcore_readmapping_pipeline/main.nf
@@ -181,11 +181,20 @@ def validateInputParameters() {
 def validateInputSamplesheet(channel) {
     def seen = [:].withDefault { 0 }
     def uniquePairs = new HashSet()
+    def validFormats = [".fq.gz", ".fastq.gz", ".cram", ".bam"]
 
-    // Use map to process each item in the channel
     return channel.map { sample ->
         def (meta, file) = sample
-        def pair = [meta.sample, meta.datafile].toString()
+
+        // Replace spaces with underscores in sample names
+        meta.sample = meta.sample.replace(" ", "_")
+
+        // Validate that the file path is non-empty and has a valid format
+        if (!file || !validFormats.any { file.toString().endsWith(it) }) {
+            error("Data file is required and must have a valid extension: ${file}")
+        }
+
+        def pair = [meta.sample, file.toString()].toString()
 
         if (!uniquePairs.add(pair)) {
             error("The pair of sample name and read file must be unique: ${pair}")

diff --git a/workflows/readmapping.nf b/workflows/readmapping.nf
@@ -73,10 +73,6 @@ workflow READMAPPING {
 
     ch_versions = ch_versions.mix ( INPUT_CHECK.out.versions )
 
-
-    //
-    // SUBWORKFLOW: Uncompress and prepare reference genome files
-    //
     ch_fasta
     | map { [ [ id: it.baseName ], it ] }
     | set { ch_genome }