sanger-tol · tkchafin · Sep 10, 2024 · Sep 10, 2024 · Sep 10, 2024
diff --git a/assets/samplesheet_s3.csv b/assets/samplesheet_s3.csv
@@ -3,5 +3,5 @@ mMelMel1,illumina,https://tolit.cog.sanger.ac.uk/test-data/Meles_meles/genomic_d
 mMelMel2,illumina,https://tolit.cog.sanger.ac.uk/test-data/Meles_meles/genomic_data/mMelMel2/illumina/31231_4%231.subset.fastq.gz,
 mMelMel3,hic,https://tolit.cog.sanger.ac.uk/test-data/Meles_meles/genomic_data/mMelMel3/hic/35528_2%231.subset.cram,
 mMelMel3,ont,https://tolit.cog.sanger.ac.uk/test-data/Meles_meles/genomic_data/mMelMel3/ont/PAE35587_pass_1f1f0707_115.subset.fastq.gz,
-mMelMel3,pacbio,https://tolit.cog.sanger.ac.uk/test-data/Meles_meles/genomic_data/mMelMel3/pacbio/m64094_200910_173211.ccs.bc1022_BAK8B_OA--bc1022_BAK8B_OA.subset.bam,
+mMelMel3,pacbio,https://tolit.cog.sanger.ac.uk/test-data/Meles_meles/genomic_data/mMelMel3/pacbio/m64094_200910_173211.ccs.bc1022_BAK8B_OA--bc1022_BAK8B_OA.subset.bam,uli
 mMelMel3,pacbio,https://tolit.cog.sanger.ac.uk/test-data/Meles_meles/genomic_data/mMelMel3/pacbio/m64094_200911_174739.ccs.bc1022_BAK8B_OA--bc1022_BAK8B_OA.subset.fastq.gz,
diff --git a/conf/base.config b/conf/base.config
@@ -46,6 +46,11 @@ process {
         memory = { check_max( 1.GB  * Math.ceil( meta.read_count / 1000000 ) * task.attempt, 'memory' ) }
     }
 
+    withName: 'PACBIO_PBMARKDUP' {   // resources for the pbmarkdup duplicate-marking step
+        memory = { check_max( 1.GB  * Math.ceil( meta.read_count / 1000000 ) * task.attempt, 'memory' ) }   // 1 GB per started million reads, scaled by attempt
+        time   = { check_max( 4.hour * task.attempt, 'time' ) }   // 4 hours, scaled by attempt
+    }
+
     withName: 'SAMTOOLS_SORMADUP' {
         cpus   = { log_increase_cpus(2, 6*task.attempt, 1, 2) }
         memory = { check_max( 4.GB + 850.MB * log_increase_cpus(2, 6*task.attempt, 1, 2) * task.attempt + 0.6.GB * Math.ceil( meta.read_count / 100000000 ), 'memory' ) }

diff --git a/conf/modules.config b/conf/modules.config
@@ -41,6 +41,10 @@ process {
         ext.args   = { (params.use_work_dir_as_temp ? "-T." : "") }
     }
 
+    withName: 'PACBIO_PBMARKDUP' {
+        ext.args = '--rmdup'   // constant option string passed to pbmarkdup; no closure needed
+    }
+
     withName: BLAST_BLASTN {
         ext.args = '-task blastn -reward 1 -penalty -5 -gapopen 3 -gapextend 3 -dust yes -soft_masking true -evalue .01 -searchsp 1750000000000 -outfmt 6'
     }
@@ -50,11 +54,11 @@ process {
         ext.args = "-be '[rq]>=0.99' -x fi -x fp -x ri -x rp --write-index"
     }
 
-        // minimap2 2.24 can only work with genomes up to 4 Gbp. For larger genomes, add the -I option with the genome size in Gbp.
-        // In fact, we can also use -I to *decrease* the memory requirements for smaller genomes
-        // NOTE: minimap2 uses the decimal system ! 1G = 1,000,000,000 bp
-        // NOTE: Math.ceil returns a double, but fortunately minimap2 accepts floating point values.
-        // NOTE: minimap2 2.25 raises the default to 8G, which means higher memory savings on smaller genomes
+    // minimap2 2.24 can only work with genomes up to 4 Gbp. For larger genomes, add the -I option with the genome size in Gbp.
+    // In fact, we can also use -I to *decrease* the memory requirements for smaller genomes
+    // NOTE: minimap2 uses decimal units: 1G = 1,000,000,000 bp
+    // NOTE: Math.ceil returns a double, but fortunately minimap2 accepts floating point values.
+    // NOTE: minimap2 2.25 raises the default to 8G, which means higher memory savings on smaller genomes
     withName: '.*:.*:ALIGN_HIFI:MINIMAP2_ALIGN' {
         ext.args = { "-ax map-hifi --cs=short -R ${meta.read_group} -I" + Math.ceil(meta2.genome_size/1e9) + 'G' }
     }

diff --git a/modules/local/pbmarkdup.nf b/modules/local/pbmarkdup.nf
@@ -0,0 +1,48 @@
+//
+// Mark or remove duplicate reads in PacBio data with pbmarkdup.
+// Extra command-line options (e.g. --rmdup) are supplied through task.ext.args.
+//
+process PACBIO_PBMARKDUP {
+    tag "$meta.id"
+    label 'process_medium'
+
+    conda "bioconda::pbmarkdup==1.0.3--h9ee0642_0"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/pbmarkdup:1.0.3--h9ee0642_0' :
+        'biocontainers/pbmarkdup:1.0.3--h9ee0642_0' }"
+
+    input:
+    tuple val(meta), path(input)
+
+    output:
+    tuple val(meta), path("${prefix}.${suffix}"), emit: output
+    path "versions.yml"                         , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args = task.ext.args ?: ''
+
+    // prefix and suffix are read by the output block, so they must not be declared with `def`
+    prefix = task.ext.prefix ?: "${meta.id}"
+
+    // Keep the full format extension in the output name: getExtension() on "reads.fastq.gz"
+    // would only return "gz", which no longer says what kind of file it is
+    def name = input.getName()
+    suffix = name.endsWith('.gz') ? name.tokenize('.').takeRight(2).join('.') : input.getExtension()
+
+    // -j restricts pbmarkdup to the CPUs allocated to this task
+    """
+    pbmarkdup \\
+        -j $task.cpus \\
+        $input \\
+        ${prefix}.${suffix} \\
+        $args
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        pbmarkdup: \$(echo \$(pbmarkdup --version 2>&1) | awk 'BEGIN{FS=" "}{print \$2}')
+    END_VERSIONS
+    """
+}
diff --git a/subworkflows/local/align_pacbio.nf b/subworkflows/local/align_pacbio.nf
@@ -2,10 +2,9 @@
 // Align PacBio read files against the genome
 //
 
-include { FILTER_PACBIO  } from '../../subworkflows/local/filter_pacbio'
-include { MINIMAP2_ALIGN } from '../../modules/nf-core/minimap2/align/main'
-include { SAMTOOLS_MERGE } from '../../modules/nf-core/samtools/merge/main'
-
+include { FILTER_PACBIO    } from '../../subworkflows/local/filter_pacbio'
+include { MINIMAP2_ALIGN   } from '../../modules/nf-core/minimap2/align/main'
+include { SAMTOOLS_MERGE   } from '../../modules/nf-core/samtools/merge/main'
 
 workflow ALIGN_PACBIO {
     take:

diff --git a/subworkflows/local/filter_pacbio.nf b/subworkflows/local/filter_pacbio.nf
@@ -7,6 +7,7 @@ include { SAMTOOLS_VIEW as SAMTOOLS_CONVERT } from '../../modules/nf-core/samtoo
 include { SAMTOOLS_COLLATETOFASTA           } from '../../modules/local/samtools_collatetofasta'
 include { BLAST_BLASTN                      } from '../../modules/nf-core/blast/blastn/main'
 include { PACBIO_FILTER                     } from '../../modules/local/pacbio_filter'
+include { PACBIO_PBMARKDUP                  } from '../../modules/local/pbmarkdup'
 include { SAMTOOLS_FILTERTOFASTQ            } from '../../modules/local/samtools_filtertofastq'
 include { SEQKIT_FQ2FA                      } from '../../modules/nf-core/seqkit/fq2fa'
 include { SEQTK_SUBSEQ                      } from '../../modules/nf-core/seqtk/subseq'
@@ -22,8 +23,26 @@ workflow FILTER_PACBIO {
     ch_versions = Channel.empty()
 
 
-    // Check file types and branch
+    // Split off ultra low-input (ULI) libraries (meta.library == "uli"); all other reads go to `other`
 reads
+    | branch {
+        meta, reads ->
+            uli : meta.library == "uli"
+            other : true
+    }
+    | set { ch_reads_branched }
+
+    // Mark/remove duplicates in the ULI reads only, then merge them back with the other reads
+    PACBIO_PBMARKDUP ( ch_reads_branched.uli )
+    ch_versions = ch_versions.mix ( PACBIO_PBMARKDUP.out.versions.first() )
+
+    PACBIO_PBMARKDUP.out.output
+    | mix ( ch_reads_branched.other )
+    | set { ch_reads_all }
+
+    // Check file types and branch. This must start from ch_reads_all rather than
+    // ch_reads_branched.other, otherwise the deduplicated ULI reads never reach the steps below
+    ch_reads_all
     | branch {
         meta, reads ->
             fastq : reads.findAll { it.getName().toLowerCase() =~ /.*f.*\.gz/ }

diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf
@@ -39,6 +39,7 @@ def create_data_channel ( LinkedHashMap row, datafile, stats ) {
     def meta = [:]
     meta.id         = row.sample
     meta.datatype   = row.datatype
+    meta.library    = row.library
 
     if ( meta.datatype == "hic" || meta.datatype == "illumina" ) {
         platform = "ILLUMINA"