
Replace BlastN with Minimap2 splice. #104

Closed
wants to merge 13 commits into from
Binary file added assets/vectorDB.fasta.gz
3 changes: 1 addition & 2 deletions bin/pacbio_filter.sh
@@ -3,5 +3,4 @@
input=$1
output=$2

grep -v 'MG551957' $input | awk -v OFS='\t' '{if (($2 ~ /NGB00972/ && $3 >= 97 && $4 >= 44) || ($2 ~ /NGB00973/ && $3 >= 97 && $4 >= 34) || ($2 ~ /^bc/ && $3 >= 99 && $4 >= 16)) print $1}' | sort -u > $output

grep -v 'MG551957' $input | awk -v OFS='\t' '{if (($6 ~ /NGB00972/ && $10/$11 >= 0.97 && $10 >= 44) || ($6 ~ /NGB00973/ && $10/$11 >= 0.97 && $10 >= 34) || ($6 ~ /^bc/ && $10/$11 >= 0.99 && $10 >= 16)) print $1}' | sort -u > $output
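For reference, a minimal sketch (hypothetical read names; the NGB adapter accessions are those in the bundled database) of how the rewritten filter reads PAF columns — query name in $1, target name in $6, residue matches in $10, alignment block length in $11 — instead of the old BLAST outfmt-6 columns:

# Two hypothetical PAF records aligned against the bundled adapters.
printf 'read001\t15000\t10\t55\t+\tNGB00972\t45\t0\t45\t45\t45\t60\n'  > example.paf
printf 'read002\t12000\t5\t39\t+\tNGB00973\t34\t0\t34\t20\t34\t10\n'  >> example.paf

# Same test as pacbio_filter.sh: identity = matches / block length ($10/$11),
# minimum matched bases = $10. read001 passes (45/45 >= 0.97 and 45 >= 44);
# read002 fails (20/34 < 0.97), so the blocklist contains only read001.
awk -v OFS='\t' '{if (($6 ~ /NGB00972/ && $10/$11 >= 0.97 && $10 >= 44) || ($6 ~ /NGB00973/ && $10/$11 >= 0.97 && $10 >= 34) || ($6 ~ /^bc/ && $10/$11 >= 0.99 && $10 >= 16)) print $1}' example.paf | sort -u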
8 changes: 4 additions & 4 deletions conf/base.config
@@ -52,10 +52,10 @@ process {
time = { check_max( 2.h * Math.ceil( meta.read_count / 100000000 ) * task.attempt / log_increase_cpus(2, 6*task.attempt, 1, 2), 'time' ) }
}

withName: BLAST_BLASTN {
time = { check_max( 2.hour * Math.ceil( meta.read_count / 1000000 ) * task.attempt, 'time' ) }
memory = { check_max( 100.MB + 20.MB * Math.ceil( meta.read_count / 1000000 ) * task.attempt, 'memory' ) }
// The tool never seems to use more than 1 core even when given multiple. Sticking to 1 (the default)
withName: 'MINIMAP2_SPLICE' {
cpus = { log_increase_cpus(4, 2*task.attempt, meta.read_count/1000000, 2) }
memory = { check_max( 500.MB + 100.MB * Math.ceil( meta.read_count / 1000000 ) * task.attempt, 'memory' ) }
time = { check_max( 4.hour * Math.ceil( meta.read_count / 1000000 ) * task.attempt, 'time' ) }
}

withName: BWAMEM2_INDEX {
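For context, a worked sketch of what the new memory and time closures request for a hypothetical library of 3 million reads on the first attempt (the cpus closure depends on log_increase_cpus, so it is left out here):

# Math.ceil(read_count / 1e6) = 3 for 3,000,000 reads
reads=3000000
units=$(( (reads + 999999) / 1000000 ))
echo "memory = $(( 500 + 100 * units )) MB"   # 500.MB + 100.MB * 3 * 1 = 800 MB
echo "time   = $(( 4 * units )) h"            # 4.hour * 3 * 1 = 12 h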
9 changes: 5 additions & 4 deletions conf/modules.config
@@ -38,10 +38,6 @@ process {
ext.args = { (params.use_work_dir_as_temp ? "-T." : "") }
}

withName: BLAST_BLASTN {
ext.args = '-task blastn -reward 1 -penalty -5 -gapopen 3 -gapextend 3 -dust yes -soft_masking true -evalue .01 -searchsp 1750000000000 -outfmt 6'
}

withName: SAMTOOLS_CONVERT {
ext.args = "-be '[rq]>=0.99' -x fi -x fp -x ri -x rp --write-index"
}
@@ -63,6 +59,11 @@ process {
ext.args = { "-ax map-ont -R ${meta.read_group} -I" + Math.ceil(meta2.genome_size/1e9) + 'G' }
}

withName: 'MINIMAP2_SPLICE' {
ext.args = '-x splice:hq'
ext.prefix = { "${meta.id}" }
}

withName: '.*:CONVERT_STATS:SAMTOOLS_VIEW' {
ext.prefix = { "${fasta.baseName}.${meta.datatype}.${meta.id}" }
ext.args = '--output-fmt cram --write-index'
2 changes: 1 addition & 1 deletion docs/output.md
@@ -28,7 +28,7 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d

### Filtering

PacBio reads generated using both CLR and CCS technology are filtered using `BLAST_BLASTN` against a database of adapter sequences. The collated FASTQ of the filtered reads is required by the downstream alignment step. The results from the PacBio filtering subworkflow are currently not set to output.
PacBio reads generated using both CLR and CCS technology are filtered using `MINIMAP2_SPLICE` against a database of adapter sequences. Minimap2's splice preset aligns each read against this database so that reads containing adapter sequence can be identified and removed from the dataset. The collated FASTQ of the filtered reads is required by the downstream alignment step. The results from the PacBio filtering subworkflow are currently not set to output.

## Alignment and Mark duplicates

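For illustration, the adapter screen described above amounts to a command along these lines (a sketch with hypothetical file names; the thread count and the splice:hq preset come from conf/base.config and conf/modules.config):

# Align collated reads against the adapter/vector database; the resulting PAF
# is what pacbio_filter.sh turns into a blocklist of adapter-containing reads.
minimap2 -x splice:hq -t 4 assets/vectorDB.fasta.gz sample.collated.fasta > sample.paf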
39 changes: 39 additions & 0 deletions modules/local/minimap2_splice.nf
@@ -0,0 +1,39 @@
process MINIMAP2_SPLICE {
tag "$meta.id"
label 'process_high'

conda "${moduleDir}/environment.yml"
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
'https://depot.galaxyproject.org/singularity/minimap2:2.24--h5bf99c6_0':
'biocontainers/minimap2:2.24--h5bf99c6_0' }"

input:
tuple val(meta) , path(fasta)
path(db)

output:
tuple val(meta), path('*.paf'), emit: paf
path "versions.yml" , emit: versions

when:
task.ext.when == null || task.ext.when

script:
def args = task.ext.args ?: ''
def prefix = task.ext.prefix ?: "${meta.id}"
def is_compressed = fasta.getExtension() == "gz" ? true : false
def fasta_name = is_compressed ? fasta.getBaseName() : fasta

"""
if [ "${is_compressed}" == "true" ]; then
gzip -c -d ${fasta} > ${fasta_name}
fi

minimap2 $args -t $task.cpus ${db} ${fasta_name} > ${prefix}.paf

cat <<-END_VERSIONS > versions.yml
"${task.process}":
minimap2: \$(minimap2 --version 2>&1)
END_VERSIONS
"""
}
4 changes: 2 additions & 2 deletions modules/local/pacbio_filter.nf
@@ -8,7 +8,7 @@ process PACBIO_FILTER {
'biocontainers/gawk:5.1.0' }"

input:
tuple val(meta), path(txt)
tuple val(meta), path(paf)

output:
tuple val(meta), path("*.blocklist"), emit: list
@@ -20,7 +20,7 @@ process PACBIO_FILTER {
script:
def prefix = task.ext.prefix ?: "${meta.id}"
"""
pacbio_filter.sh $txt ${prefix}.blocklist
pacbio_filter.sh $paf ${prefix}.blocklist

cat <<-END_VERSIONS > versions.yml
"${task.process}":
2 changes: 1 addition & 1 deletion nextflow.config
@@ -11,7 +11,7 @@ params {

// Input options
input = null
vector_db = "${projectDir}/assets/vectorDB.tar.gz"
vector_db = "${projectDir}/assets/vectorDB.fasta.gz"
bwamem2_index = null
fasta = null
header = null
4 changes: 2 additions & 2 deletions nextflow_schema.json
@@ -31,9 +31,9 @@
},
"vector_db": {
"type": "string",
"description": "Path to directory or tar.gz archive for pre-built PacBio vector database.",
"description": "Path to PacBio vector reference (.fasta, .fasta.gz, .fa, .fa.gz)",
"format": "path",
"default": "${projectDir}/assets/vectorDB.tar.gz",
"default": "${projectDir}/assets/vectorDB.fasta.gz",
"fa_icon": "fas fa-bezier-curve"
},
"email": {
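As a usage sketch (hypothetical paths; assuming the standard readmapping invocation), --vector_db now points directly at a FASTA rather than a tar.gz archive and can be overridden on the command line:

# Default: the FASTA bundled with the pipeline (assets/vectorDB.fasta.gz).
# Override with a custom adapter/vector FASTA, gzipped or plain:
nextflow run sanger-tol/readmapping -profile docker \
    --input samplesheet.csv \
    --fasta genome.fa.gz \
    --vector_db /path/to/custom_adapters.fa.gz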
12 changes: 6 additions & 6 deletions subworkflows/local/filter_pacbio.nf
@@ -5,7 +5,7 @@

include { SAMTOOLS_VIEW as SAMTOOLS_CONVERT } from '../../modules/nf-core/samtools/view/main'
include { SAMTOOLS_COLLATETOFASTA } from '../../modules/local/samtools_collatetofasta'
include { BLAST_BLASTN } from '../../modules/nf-core/blast/blastn/main'
include { MINIMAP2_ALIGN as MINIMAP2_SPLICE } from '../../modules/nf-core/minimap2/align/main'
include { PACBIO_FILTER } from '../../modules/local/pacbio_filter'
include { SAMTOOLS_FILTERTOFASTQ } from '../../modules/local/samtools_filtertofastq'

@@ -34,13 +34,13 @@ workflow FILTER_PACBIO {
ch_versions = ch_versions.mix ( SAMTOOLS_COLLATETOFASTA.out.versions.first() )


// Nucleotide BLAST
BLAST_BLASTN ( SAMTOOLS_COLLATETOFASTA.out.fasta, db )
ch_versions = ch_versions.mix ( BLAST_BLASTN.out.versions.first() )
// Minimap2 splice
MINIMAP2_SPLICE ( SAMTOOLS_COLLATETOFASTA.out.fasta, [ [], [db] ], false, "bai", false, false )
ch_versions = ch_versions.mix ( MINIMAP2_SPLICE.out.versions.first() )


// Filter BLAST output
PACBIO_FILTER ( BLAST_BLASTN.out.txt )
// Filter MINIMAP2_SPLICE output
PACBIO_FILTER ( MINIMAP2_SPLICE.out.paf )
ch_versions = ch_versions.mix ( PACBIO_FILTER.out.versions.first() )


23 changes: 2 additions & 21 deletions workflows/readmapping.nf
@@ -84,25 +84,6 @@ workflow READMAPPING {
PREPARE_GENOME ( ch_genome )
ch_versions = ch_versions.mix ( PREPARE_GENOME.out.versions )


//
// Create channel for vector DB
//
// ***PacBio condition does not work - needs fixing***
if ( ch_reads.pacbio || ch_reads.clr ) {
if ( params.vector_db.endsWith( '.tar.gz' ) ) {
UNTAR ( [ [:], params.vector_db ] ).untar
| set { ch_vector_db }

ch_versions = ch_versions.mix ( UNTAR.out.versions )

} else {
Channel.fromPath ( params.vector_db )
| set { ch_vector_db }
}
}


//
// SUBWORKFLOW: Align raw reads to genome
//
@@ -112,10 +93,10 @@
ALIGN_ILLUMINA ( PREPARE_GENOME.out.fasta, PREPARE_GENOME.out.bwaidx, ch_reads.illumina )
ch_versions = ch_versions.mix ( ALIGN_ILLUMINA.out.versions )

ALIGN_HIFI ( PREPARE_GENOME.out.fasta, ch_reads.pacbio, ch_vector_db )
ALIGN_HIFI ( PREPARE_GENOME.out.fasta, ch_reads.pacbio, params.vector_db )
ch_versions = ch_versions.mix ( ALIGN_HIFI.out.versions )

ALIGN_CLR ( PREPARE_GENOME.out.fasta, ch_reads.clr, ch_vector_db )
ALIGN_CLR ( PREPARE_GENOME.out.fasta, ch_reads.clr, params.vector_db )
ch_versions = ch_versions.mix ( ALIGN_CLR.out.versions )

ALIGN_ONT ( PREPARE_GENOME.out.fasta, ch_reads.ont )