
Replace BlastN with Minimap2 splice. #104

Closed
wants to merge 13 commits into from
Binary file added assets/vectorDB.fasta.gz
3 changes: 1 addition & 2 deletions bin/pacbio_filter.sh
@@ -3,5 +3,4 @@
input=$1
output=$2

grep -v 'MG551957' $input | awk -v OFS='\t' '{if (($2 ~ /NGB00972/ && $3 >= 97 && $4 >= 44) || ($2 ~ /NGB00973/ && $3 >= 97 && $4 >= 34) || ($2 ~ /^bc/ && $3 >= 99 && $4 >= 16)) print $1}' | sort -u > $output

grep -v 'MG551957' $input | awk -v OFS='\t' '{if (($6 ~ /NGB00972/ && $10/$11 >= 0.97 && $10 >= 44) || ($6 ~ /NGB00973/ && $10/$11 >= 0.97 && $10 >= 34) || ($6 ~ /^bc/ && $10/$11 >= 0.99 && $10 >= 16)) print $1}' | sort -u > $output
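For reference, a minimal sketch (hypothetical read names; the NGB adapter accessions are those in the bundled database) of how the rewritten filter reads PAF columns — query name in $1, target name in $6, residue matches in $10, alignment block length in $11 — instead of the old BLAST outfmt-6 columns:

# Two hypothetical PAF records aligned against the bundled adapters.
printf 'read001\t15000\t10\t55\t+\tNGB00972\t45\t0\t45\t45\t45\t60\n'  > example.paf
printf 'read002\t12000\t5\t39\t+\tNGB00973\t34\t0\t34\t20\t34\t10\n'  >> example.paf

# Same test as pacbio_filter.sh: identity = matches / block length ($10/$11),
# minimum matched bases = $10. read001 passes (45/45 >= 0.97 and 45 >= 44);
# read002 fails (20/34 < 0.97), so the blocklist contains only read001.
awk -v OFS='\t' '{if (($6 ~ /NGB00972/ && $10/$11 >= 0.97 && $10 >= 44) || ($6 ~ /NGB00973/ && $10/$11 >= 0.97 && $10 >= 34) || ($6 ~ /^bc/ && $10/$11 >= 0.99 && $10 >= 16)) print $1}' example.paf | sort -u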
8 changes: 4 additions & 4 deletions conf/base.config
@@ -52,10 +52,10 @@ process {
time = { check_max( 2.h * Math.ceil( meta.read_count / 100000000 ) * task.attempt / log_increase_cpus(2, 6*task.attempt, 1, 2), 'time' ) }
}

withName: BLAST_BLASTN {
time = { check_max( 2.hour * Math.ceil( meta.read_count / 1000000 ) * task.attempt, 'time' ) }
memory = { check_max( 100.MB + 20.MB * Math.ceil( meta.read_count / 1000000 ) * task.attempt, 'memory' ) }
// The tool never seems to use more than 1 core even when given multiple. Sticking to 1 (the default)
withName: 'MINIMAP2_SPLICE' {
cpus = { log_increase_cpus(4, 2*task.attempt, meta.read_count/1000000, 2) }
memory = { check_max( 500.MB + 100.MB * Math.ceil( meta.read_count / 1000000 ) * task.attempt, 'memory' ) }
time = { check_max( 4.hour * Math.ceil( meta.read_count / 1000000 ) * task.attempt, 'time' ) }
}

withName: BWAMEM2_INDEX {
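For context, a worked sketch of what the new memory and time closures request for a hypothetical library of 3 million reads on the first attempt (the cpus closure depends on log_increase_cpus, so it is left out here):

# Math.ceil(read_count / 1e6) = 3 for 3,000,000 reads
reads=3000000
units=$(( (reads + 999999) / 1000000 ))
echo "memory = $(( 500 + 100 * units )) MB"   # 500.MB + 100.MB * 3 * 1 = 800 MB
echo "time   = $(( 4 * units )) h"            # 4.hour * 3 * 1 = 12 h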
9 changes: 5 additions & 4 deletions conf/modules.config
@@ -38,10 +38,6 @@ process {
ext.args = { (params.use_work_dir_as_temp ? "-T." : "") }
}

withName: BLAST_BLASTN {
ext.args = '-task blastn -reward 1 -penalty -5 -gapopen 3 -gapextend 3 -dust yes -soft_masking true -evalue .01 -searchsp 1750000000000 -outfmt 6'
}

withName: SAMTOOLS_CONVERT {
ext.args = "-be '[rq]>=0.99' -x fi -x fp -x ri -x rp --write-index"
}
@@ -63,6 +59,11 @@ process {
ext.args = { "-ax map-ont -R ${meta.read_group} -I" + Math.ceil(meta2.genome_size/1e9) + 'G' }
}

withName: 'MINIMAP2_SPLICE' {
ext.args = '-x splice:hq'
ext.prefix = { "${meta.id}" }
}

withName: '.*:CONVERT_STATS:SAMTOOLS_VIEW' {
ext.prefix = { "${fasta.baseName}.${meta.datatype}.${meta.id}" }
ext.args = '--output-fmt cram --write-index'
2 changes: 1 addition & 1 deletion docs/output.md
@@ -28,7 +28,7 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d

### Filtering

PacBio reads generated using both CLR and CCS technology are filtered using `BLAST_BLASTN` against a database of adapter sequences. The collated FASTQ of the filtered reads is required by the downstream alignment step. The results from the PacBio filtering subworkflow are currently not set to output.
PacBio reads generated using both CLR and CCS technology are filtered using `MINIMAP2_SPLICE` against a database of adapter sequences. Minimap2's splice preset aligns each read against this database so that reads containing adapter sequence can be identified and removed from the dataset. The collated FASTQ of the filtered reads is required by the downstream alignment step. The results from the PacBio filtering subworkflow are currently not set to output.

## Alignment and Mark duplicates

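For illustration, the adapter screen described above amounts to a command along these lines (a sketch with hypothetical file names; the thread count and the splice:hq preset come from conf/base.config and conf/modules.config):

# Align collated reads against the adapter/vector database; the resulting PAF
# is what pacbio_filter.sh turns into a blocklist of adapter-containing reads.
minimap2 -x splice:hq -t 4 assets/vectorDB.fasta.gz sample.collated.fasta > sample.paf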
39 changes: 39 additions & 0 deletions modules/local/minimap2_splice.nf
@@ -0,0 +1,39 @@
process MINIMAP2_SPLICE {
tag "$meta.id"
label 'process_high'

conda "${moduleDir}/environment.yml"
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
'https://depot.galaxyproject.org/singularity/minimap2:2.24--h5bf99c6_0':
'biocontainers/minimap2:2.24--h5bf99c6_0' }"

input:
tuple val(meta) , path(fasta)
path(db)

output:
tuple val(meta), path('*.paf'), emit: paf
path "versions.yml" , emit: versions

when:
task.ext.when == null || task.ext.when

script:
def args = task.ext.args ?: ''
def prefix = task.ext.prefix ?: "${meta.id}"
def is_compressed = fasta.getExtension() == "gz" ? true : false
def fasta_name = is_compressed ? fasta.getBaseName() : fasta

"""
if [ "${is_compressed}" == "true" ]; then
gzip -c -d ${fasta} > ${fasta_name}
fi

minimap2 $args -t $task.cpus ${db} ${fasta_name} > ${prefix}.paf

cat <<-END_VERSIONS > versions.yml
"${task.process}":
minimap2: \$(minimap2 --version 2>&1)
END_VERSIONS
"""
}
4 changes: 2 additions & 2 deletions modules/local/pacbio_filter.nf
@@ -8,7 +8,7 @@ process PACBIO_FILTER {
'biocontainers/gawk:5.1.0' }"

input:
tuple val(meta), path(txt)
tuple val(meta), path(paf)

output:
tuple val(meta), path("*.blocklist"), emit: list
@@ -20,7 +20,7 @@ process PACBIO_FILTER {
script:
def prefix = task.ext.prefix ?: "${meta.id}"
"""
pacbio_filter.sh $txt ${prefix}.blocklist
pacbio_filter.sh $paf ${prefix}.blocklist

cat <<-END_VERSIONS > versions.yml
"${task.process}":
2 changes: 1 addition & 1 deletion nextflow.config
@@ -11,7 +11,7 @@ params {

// Input options
input = null
vector_db = "${projectDir}/assets/vectorDB.tar.gz"
vector_db = "${projectDir}/assets/vectorDB.fasta.gz"
bwamem2_index = null
fasta = null
header = null
4 changes: 2 additions & 2 deletions nextflow_schema.json
@@ -31,9 +31,9 @@
},
"vector_db": {
"type": "string",
"description": "Path to directory or tar.gz archive for pre-built PacBio vector database.",
"description": "Path to PacBio vector reference (.fasta, .fasta.gz, .fa, .fa.gz)",
"format": "path",
"default": "${projectDir}/assets/vectorDB.tar.gz",
"default": "${projectDir}/assets/vectorDB.fasta.gz",
"fa_icon": "fas fa-bezier-curve"
},
"email": {
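As a usage sketch (hypothetical paths; assuming the standard readmapping invocation), --vector_db now points directly at a FASTA rather than a tar.gz archive and can be overridden on the command line:

# Default: the FASTA bundled with the pipeline (assets/vectorDB.fasta.gz).
# Override with a custom adapter/vector FASTA, gzipped or plain:
nextflow run sanger-tol/readmapping -profile docker \
    --input samplesheet.csv \
    --fasta genome.fa.gz \
    --vector_db /path/to/custom_adapters.fa.gz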
12 changes: 6 additions & 6 deletions subworkflows/local/filter_pacbio.nf
@@ -5,7 +5,7 @@

include { SAMTOOLS_VIEW as SAMTOOLS_CONVERT } from '../../modules/nf-core/samtools/view/main'
include { SAMTOOLS_COLLATETOFASTA } from '../../modules/local/samtools_collatetofasta'
include { BLAST_BLASTN } from '../../modules/nf-core/blast/blastn/main'
include { MINIMAP2_ALIGN as MINIMAP2_SPLICE } from '../../modules/nf-core/minimap2/align/main'
include { PACBIO_FILTER } from '../../modules/local/pacbio_filter'
include { SAMTOOLS_FILTERTOFASTQ } from '../../modules/local/samtools_filtertofastq'

@@ -34,13 +34,13 @@ workflow FILTER_PACBIO {
ch_versions = ch_versions.mix ( SAMTOOLS_COLLATETOFASTA.out.versions.first() )


// Nucleotide BLAST
BLAST_BLASTN ( SAMTOOLS_COLLATETOFASTA.out.fasta, db )
ch_versions = ch_versions.mix ( BLAST_BLASTN.out.versions.first() )
// Minimap2 splice
MINIMAP2_SPLICE ( SAMTOOLS_COLLATETOFASTA.out.fasta, [ [], [db] ], false, "bai", false, false )
ch_versions = ch_versions.mix ( MINIMAP2_SPLICE.out.versions.first() )


// Filter BLAST output
PACBIO_FILTER ( BLAST_BLASTN.out.txt )
// Filter MINIMAP2_SPLICE output
PACBIO_FILTER ( MINIMAP2_SPLICE.out.paf )
ch_versions = ch_versions.mix ( PACBIO_FILTER.out.versions.first() )


23 changes: 2 additions & 21 deletions workflows/readmapping.nf
@@ -84,25 +84,6 @@ workflow READMAPPING {
PREPARE_GENOME ( ch_genome )
ch_versions = ch_versions.mix ( PREPARE_GENOME.out.versions )


//
// Create channel for vector DB
//
// ***PacBio condition does not work - needs fixing***
if ( ch_reads.pacbio || ch_reads.clr ) {
if ( params.vector_db.endsWith( '.tar.gz' ) ) {
UNTAR ( [ [:], params.vector_db ] ).untar
| set { ch_vector_db }

ch_versions = ch_versions.mix ( UNTAR.out.versions )

} else {
Channel.fromPath ( params.vector_db )
| set { ch_vector_db }
}
}


//
// SUBWORKFLOW: Align raw reads to genome
//
@@ -112,10 +93,10 @@
ALIGN_ILLUMINA ( PREPARE_GENOME.out.fasta, PREPARE_GENOME.out.bwaidx, ch_reads.illumina )
ch_versions = ch_versions.mix ( ALIGN_ILLUMINA.out.versions )

ALIGN_HIFI ( PREPARE_GENOME.out.fasta, ch_reads.pacbio, ch_vector_db )
ALIGN_HIFI ( PREPARE_GENOME.out.fasta, ch_reads.pacbio, params.vector_db )
ch_versions = ch_versions.mix ( ALIGN_HIFI.out.versions )

ALIGN_CLR ( PREPARE_GENOME.out.fasta, ch_reads.clr, ch_vector_db )
ALIGN_CLR ( PREPARE_GENOME.out.fasta, ch_reads.clr, params.vector_db )
ch_versions = ch_versions.mix ( ALIGN_CLR.out.versions )

ALIGN_ONT ( PREPARE_GENOME.out.fasta, ch_reads.ont )