diff --git a/modules/nf-core/simpleaf/index/environment.yml b/modules/nf-core/simpleaf/index/environment.yml index 2a6838c2bbd..7e7a1020431 100644 --- a/modules/nf-core/simpleaf/index/environment.yml +++ b/modules/nf-core/simpleaf/index/environment.yml @@ -1,8 +1,9 @@ channels: - - conda-forge - bioconda + - conda-forge dependencies: - - bioconda::alevin-fry=0.8.2 - - bioconda::salmon=1.10.2 - - bioconda::simpleaf=0.15.1 + - bioconda::alevin-fry=0.11.1 + - bioconda::piscem=0.11.0 + - bioconda::salmon=1.10.3 + - bioconda::simpleaf=0.18.4 diff --git a/modules/nf-core/simpleaf/index/main.nf b/modules/nf-core/simpleaf/index/main.nf index 37b7d647738..5959c7ddbcb 100644 --- a/modules/nf-core/simpleaf/index/main.nf +++ b/modules/nf-core/simpleaf/index/main.nf @@ -1,37 +1,40 @@ +// NOTE because the default indexer, piscem, needs to frequently read and write a large number of intermediate files, if your use case involves the situations where the CPU and storage are not physically connected, we recommend setting `--work-dir /path/to/a/local/dir` or in the `ext.args` in nextflow.config, or `scratch = true`, to avoid runtime issues. process SIMPLEAF_INDEX { - tag "$genome_fasta $transcript_fasta" + tag "${meta.id ?: meta2.id}" label 'process_high' conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://depot.galaxyproject.org/singularity/simpleaf:0.15.1--h4ac6f70_0': - 'biocontainers/simpleaf:0.15.1--h4ac6f70_0' }" + 'https://depot.galaxyproject.org/singularity/simpleaf:0.18.4--ha6fb395_1': + 'biocontainers/simpleaf:0.18.4--ha6fb395_1' }" input: - tuple val(meta), path(genome_fasta) - tuple val(meta2), path(genome_gtf) - tuple val(meta3), path(transcript_fasta) + tuple val(meta), path(genome_fasta), path(genome_gtf) + tuple val(meta2), path(transcript_fasta) output: - tuple val(meta), path("${prefix}/index") , emit: index - tuple val(meta), path("${prefix}/ref/t2g_3col.tsv") , emit: transcript_tsv, optional: true - tuple val(meta), path("${prefix}") , emit: salmon - path "versions.yml" , emit: versions + tuple val(meta), path("${prefix}/index") , emit: index + tuple val(meta), path("${prefix}/ref") , emit: ref, optional: true + tuple val(meta), path("${prefix}/ref/{t2g,t2g_3col}.tsv") , emit: t2g, optional: true + path "versions.yml" , emit: versions when: task.ext.when == null || task.ext.when script: def args = task.ext.args ?: '' - def seq_inputs = (transcript_fasta) ? "--refseq $transcript_fasta" : "--gtf $genome_gtf --fasta $genome_fasta" + def seq_inputs = input_args(genome_fasta, genome_gtf, transcript_fasta)//, probes_csv, features_csv) // Output meta needs to correspond to the input used - meta = (transcript_fasta) ? meta3 : meta + meta = (transcript_fasta) ? meta2 : meta prefix = task.ext.prefix ?: "${meta.id}" """ # export required var export ALEVIN_FRY_HOME=. 
+ # set maximum number of file descriptors for temp files + ulimit -n 2048 + # prep simpleaf simpleaf set-paths @@ -45,26 +48,49 @@ process SIMPLEAF_INDEX { cat <<-END_VERSIONS > versions.yml "${task.process}": - simpleaf: \$(simpleaf -V | tr -d '\\n' | cut -d ' ' -f 2) + alevin-fry: \$(alevin-fry --version | sed -e "s/alevin-fry //g") + piscem: \$(piscem --version | sed -e "s/piscem //g") salmon: \$(salmon --version | sed -e "s/salmon //g") + simpleaf: \$(simpleaf --version | sed -e "s/simpleaf //g") END_VERSIONS """ stub: def args = task.ext.args ?: '' - prefix = task.ext.prefix ?: (meta.id ? "${meta.id}" : "${meta3.id}") + prefix = task.ext.prefix ?: (meta.id ? "${meta.id}" : "${meta2.id}") + """ mkdir -p ${prefix}/index mkdir -p ${prefix}/ref - touch ${prefix}/index/ctg_offsets.bin - touch ${prefix}/index/duplicate_clusters.tsv - touch ${prefix}/index/mphf.bin + touch ${prefix}/index/piscem_idx_cfish.json + touch ${prefix}/index/piscem_idx.ectab + touch ${prefix}/index/piscem_idx.sshash touch ${prefix}/ref/t2g_3col.tsv + touch ${prefix}/ref/roers_ref.fa cat <<-END_VERSIONS > versions.yml "${task.process}": - simpleaf: \$(simpleaf -V | tr -d '\\n' | cut -d ' ' -f 2) + alevin-fry: \$(alevin-fry --version | sed -e "s/alevin-fry //g") + piscem: \$(piscem --version | sed -e "s/piscem //g") salmon: \$(salmon --version | sed -e "s/salmon //g") + simpleaf: \$(simpleaf --version | sed -e "s/simpleaf //g") END_VERSIONS """ } + +def input_args(genome_fasta, genome_gtf, transcript_fasta) { //, probes_csv, features_csv) { + // if (probe_csv) { + // args = "--probe_csv ${probe_csv}" + // } else if (feature_csv) { + // args = "--feature_csv ${feature_csv}" + // } else + if (transcript_fasta) { + return "--ref-seq ${transcript_fasta}" + } else if (genome_fasta && genome_gtf) { + return "--fasta ${genome_fasta} --gtf ${genome_gtf}" + } else { + error "No valid input provided; please provide either a genome fasta + gtf set or a transcript fasta file. 
${genome_fasta} ${genome_gtf} ${transcript_fasta}" + // error "No valid input provided; please provide one of the followings: (i) a genome fasta + gtf set, (ii) a transcript fasta file, (iii) a probes csv file (iv) a features csv file." + } + +} diff --git a/modules/nf-core/simpleaf/index/meta.yml b/modules/nf-core/simpleaf/index/meta.yml index f90674af1c2..a9c5e66b90c 100644 --- a/modules/nf-core/simpleaf/index/meta.yml +++ b/modules/nf-core/simpleaf/index/meta.yml @@ -1,4 +1,3 @@ -# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/yaml-schema.json name: simpleaf_index description: Indexing of transcriptome for gene expression quantification using SimpleAF keywords: @@ -17,58 +16,59 @@ input: - - meta: type: map description: | - Groovy Map containing information on genome_fasta + Groovy Map containing information on genome_fasta and genome_gtf - genome_fasta: type: file description: | - FASTA file containing the genome sequence - - - meta2: - type: map - description: | - Groovy Map containing information on genome_gtf + FASTA file containing the genome sequence. + It conflicts with transcript_fasta. + When transcript_fasta is provided, it must be empty (provided as []). + When transcript_fasta is empty, it must be provided together with its corresponding genome_gtf file. - genome_gtf: type: file description: | - GTF file containing transcript annotations. Optional if transcript FASTA file is provided. - - - meta3: + GTF file containing gene annotations. + It conflicts with transcript_fasta. + When transcript_fasta is provided, it must be empty (provided as []). + When transcript_fasta is empty, it must be provided together with its corresponding genome_fasta file. + - - meta2: type: map description: | Groovy Map containing information on transcript_fasta - transcript_fasta: type: file description: | - FASTA file containing the transcript sequences. Optional if transcript GTF file is provided. 
+ FASTA file containing the transcript sequences to build index directly on. + It conflicts with genome_gtf and genome_fasta. + When genome_gtf and genome_fasta are provided, it must be empty (provided as []). output: - index: - meta: type: map description: | - Groovy Map containing information on genome_fasta or transcript_fasta (whichever was used) + Groovy Map containing information on the index generated by simpleaf - ${prefix}/index: - type: directory + type: directory description: | - Folder containing the Salmon index files - pattern: "salmon/index" - - transcript_tsv: + Folder containing the index files generated by simpleaf + - ref: - meta: type: map description: | - Groovy Map containing information on genome_fasta or transcript_fasta (whichever was used) - - ${prefix}/ref/t2g_3col.tsv: - type: file + Groovy Map containing information on the transcriptomic reference constructed by simpleaf. + - ${prefix}/ref: + type: directory description: | - Transcript-to-gene mapping file in 3-column TSV format - pattern: "salmon/ref/*_t2g_3col.tsv" - - salmon: + Folder containing the transcriptomic reference constructed by simpleaf. + - t2g: - meta: - type: map + type: map description: | - Groovy Map containing information on genome_fasta or transcript_fasta (whichever was used) - - ${prefix}: - type: directory + Groovy Map containing information on the transcript-to-gene mapping file generated by simpleaf. + - ${prefix}/ref/{t2g,t2g_3col}.tsv: + type: file description: | - Folder containing the Salmon files - pattern: "salmon" + Path to the tsv file containing the transcript-to-gene mapping information generated by simpleaf. This is used as --t2g-map when invoking simpleaf quant.
- versions: - versions.yml: type: file @@ -81,9 +81,11 @@ authors: - "@Khajidu" - "@apeltzer" - "@pinin4fjords" + - "@dongzehe" maintainers: - "@fmalmeida" - "@maxulysse" - "@Khajidu" - "@apeltzer" - "@pinin4fjords" + - "@dongzehe" diff --git a/modules/nf-core/simpleaf/index/tests/main.nf.test b/modules/nf-core/simpleaf/index/tests/main.nf.test index f21e12fe61b..d546967d62e 100644 --- a/modules/nf-core/simpleaf/index/tests/main.nf.test +++ b/modules/nf-core/simpleaf/index/tests/main.nf.test @@ -9,6 +9,7 @@ nextflow_process { tag "simpleaf" tag "simpleaf/index" + // test piscem test("Homo sapiens - genome index - expanded - fasta + gtf") { when { @@ -18,10 +19,8 @@ nextflow_process { gtf = file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/genome.gtf', checkIfExists: true) meta = [ 'id': 'human_genome'] - input[0] = Channel.of([ meta, genome_fasta ]) - input[1] = Channel.of([ meta, gtf ]) - input[2] = Channel.of([[],[]]) - + input[0] = Channel.of([ meta, genome_fasta, gtf ]) + input[1] = Channel.of([[],[]]) """ } } @@ -29,12 +28,19 @@ nextflow_process { then { assertAll( { assert process.success }, - { assert snapshot( - path("${process.out.index[0][1]}/ctg_offsets.bin"), - path("${process.out.index[0][1]}/duplicate_clusters.tsv"), - path("${process.out.index[0][1]}/mphf.bin"), - process.out.versions) - .match() } + { assert snapshot(process.out.versions).match() }, + { assert file("${process.out.index.get(0).get(1)}/piscem_idx_cfish.json").exists() }, + { assert file("${process.out.index.get(0).get(1)}/piscem_idx.ctab").exists() }, + { assert file("${process.out.index.get(0).get(1)}/piscem_idx.ectab").exists() }, + { assert file("${process.out.index.get(0).get(1)}/piscem_idx.json").exists() }, + { assert file("${process.out.index.get(0).get(1)}/piscem_idx.refinfo").exists() }, + { assert file("${process.out.index.get(0).get(1)}/piscem_idx.sshash").exists() }, + { assert file("${process.out.index.get(0).get(1)}/simpleaf_index.json").exists() }, 
+ { assert file("${process.out.ref.get(0).get(1)}/roers_ref.fa").exists() }, + { assert file("${process.out.ref.get(0).get(1)}/t2g_3col.tsv").exists() }, + { assert file("${process.out.ref.get(0).get(1)}/gene_id_to_name.tsv").exists() }, + { assert file("${process.out.ref.get(0).get(1)}/roers_make-ref.json").exists() }, + { assert file("${process.out.t2g.get(0).get(1)}").exists() }, ) } @@ -48,9 +54,8 @@ nextflow_process { transcriptome_fasta = file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/transcriptome.fasta', checkIfExists: true) meta = [ 'id': 'human_transcriptome'] - input[0] = Channel.of([[],[]]) - input[1] = Channel.of([[],[]]) - input[2] = Channel.of([ meta, transcriptome_fasta ]) + input[0] = Channel.of([[],[],[]]) + input[1] = Channel.of([ meta, transcriptome_fasta ]) """ } } @@ -58,12 +63,20 @@ nextflow_process { then { assertAll( { assert process.success }, - { assert snapshot( - path("${process.out.index[0][1]}/ctg_offsets.bin"), - path("${process.out.index[0][1]}/duplicate_clusters.tsv"), - path("${process.out.index[0][1]}/mphf.bin"), - process.out.versions) - .match() } + { assert snapshot(process.out.versions).match() }, + { assert file("${process.out.index.get(0).get(1)}/piscem_idx_cfish.json").exists() }, + { assert file("${process.out.index.get(0).get(1)}/piscem_idx.ctab").exists() }, + { assert file("${process.out.index.get(0).get(1)}/piscem_idx.ectab").exists() }, + { assert file("${process.out.index.get(0).get(1)}/piscem_idx.json").exists() }, + { assert file("${process.out.index.get(0).get(1)}/piscem_idx.refinfo").exists() }, + { assert file("${process.out.index.get(0).get(1)}/piscem_idx.sshash").exists() }, + { assert file("${process.out.index.get(0).get(1)}/simpleaf_index.json").exists() } + // { assert snapshot( + // path("${process.out.index.get(0).get(1)}/piscem_idx.ctab"), + // path("${process.out.index.get(0).get(1)}/piscem_idx.json"), + // path("${process.out.index.get(0).get(1)}/piscem_idx_cfish.json"), + // 
process.out.versions) + // .match() } ) } } @@ -76,9 +89,8 @@ nextflow_process { transcriptome_fasta = file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/transcriptome.fasta', checkIfExists: true) meta = [ 'id': 'human_transcriptome'] - input[0] = Channel.of([[],[]]) - input[1] = Channel.of([[],[]]) - input[2] = Channel.of([ meta, transcriptome_fasta ]) + input[0] = Channel.of([[],[],[]]) + input[1] = Channel.of([ meta, transcriptome_fasta ]) """ } } @@ -90,5 +102,4 @@ nextflow_process { ) } } - } \ No newline at end of file diff --git a/modules/nf-core/simpleaf/index/tests/main.nf.test.snap b/modules/nf-core/simpleaf/index/tests/main.nf.test.snap index f8239fedbe0..5725a9356a5 100644 --- a/modules/nf-core/simpleaf/index/tests/main.nf.test.snap +++ b/modules/nf-core/simpleaf/index/tests/main.nf.test.snap @@ -1,14 +1,15 @@ { "Homo sapiens - transcriptome index - direct - transcriptome fasta": { "content": [ - "ctg_offsets.bin:md5,3d2ad5b4f1aea940a1d3864b9db19fa0", - "duplicate_clusters.tsv:md5,c96ca031de4888558eec24fd13bd1c9b", - "mphf.bin:md5,48234131012798a528048d48881c1ce2", [ - "versions.yml:md5,47601b4a8da5a40635a86b0ed8629a74" + "versions.yml:md5,bd96efe900339c637533c40b37fa5cfc" ] ], - "timestamp": "2024-01-24T21:21:27.842730909" + "meta": { + "nf-test": "0.9.2", + "nextflow": "24.10.3" + }, + "timestamp": "2025-01-23T00:40:55.088252924" }, "Homo sapiens - transcriptome index - direct - transcriptome fasta - stub": { "content": [ @@ -19,9 +20,9 @@ ], [ - "ctg_offsets.bin:md5,d41d8cd98f00b204e9800998ecf8427e", - "duplicate_clusters.tsv:md5,d41d8cd98f00b204e9800998ecf8427e", - "mphf.bin:md5,d41d8cd98f00b204e9800998ecf8427e" + "piscem_idx.ectab:md5,d41d8cd98f00b204e9800998ecf8427e", + "piscem_idx.sshash:md5,d41d8cd98f00b204e9800998ecf8427e", + "piscem_idx_cfish.json:md5,d41d8cd98f00b204e9800998ecf8427e" ] ] ], @@ -30,7 +31,10 @@ [ ], - "t2g_3col.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + [ + 
"roers_ref.fa:md5,d41d8cd98f00b204e9800998ecf8427e", + "t2g_3col.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] ] ], "2": [ @@ -38,20 +42,11 @@ [ ], - [ - [ - "ctg_offsets.bin:md5,d41d8cd98f00b204e9800998ecf8427e", - "duplicate_clusters.tsv:md5,d41d8cd98f00b204e9800998ecf8427e", - "mphf.bin:md5,d41d8cd98f00b204e9800998ecf8427e" - ], - [ - "t2g_3col.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" - ] - ] + "t2g_3col.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" ] ], "3": [ - "versions.yml:md5,26b5417a172514be292f0ea0e0e55830" + "versions.yml:md5,78f7da1109cf98d7b9107222704848e1" ], "index": [ [ @@ -59,30 +54,24 @@ ], [ - "ctg_offsets.bin:md5,d41d8cd98f00b204e9800998ecf8427e", - "duplicate_clusters.tsv:md5,d41d8cd98f00b204e9800998ecf8427e", - "mphf.bin:md5,d41d8cd98f00b204e9800998ecf8427e" + "piscem_idx.ectab:md5,d41d8cd98f00b204e9800998ecf8427e", + "piscem_idx.sshash:md5,d41d8cd98f00b204e9800998ecf8427e", + "piscem_idx_cfish.json:md5,d41d8cd98f00b204e9800998ecf8427e" ] ] ], - "salmon": [ + "ref": [ [ [ ], [ - [ - "ctg_offsets.bin:md5,d41d8cd98f00b204e9800998ecf8427e", - "duplicate_clusters.tsv:md5,d41d8cd98f00b204e9800998ecf8427e", - "mphf.bin:md5,d41d8cd98f00b204e9800998ecf8427e" - ], - [ - "t2g_3col.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" - ] + "roers_ref.fa:md5,d41d8cd98f00b204e9800998ecf8427e", + "t2g_3col.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" ] ] ], - "transcript_tsv": [ + "t2g": [ [ [ @@ -91,21 +80,26 @@ ] ], "versions": [ - "versions.yml:md5,26b5417a172514be292f0ea0e0e55830" + "versions.yml:md5,78f7da1109cf98d7b9107222704848e1" ] } ], - "timestamp": "2024-01-24T21:21:38.650086761" + "meta": { + "nf-test": "0.9.2", + "nextflow": "24.10.3" + }, + "timestamp": "2025-01-23T02:08:51.588975264" }, "Homo sapiens - genome index - expanded - fasta + gtf": { "content": [ - "ctg_offsets.bin:md5,c37313b499eb0dc580d962b82ac63f9e", - "duplicate_clusters.tsv:md5,c96ca031de4888558eec24fd13bd1c9b", - "mphf.bin:md5,c7ae1b883f0987fedc8bb61e139136a7", [ - 
"versions.yml:md5,47601b4a8da5a40635a86b0ed8629a74" + "versions.yml:md5,bd96efe900339c637533c40b37fa5cfc" ] ], - "timestamp": "2024-01-24T21:21:05.595452412" + "meta": { + "nf-test": "0.9.2", + "nextflow": "24.10.3" + }, + "timestamp": "2025-01-23T00:40:41.692166586" } } \ No newline at end of file diff --git a/modules/nf-core/simpleaf/quant/environment.yml b/modules/nf-core/simpleaf/quant/environment.yml index 2a6838c2bbd..7e7a1020431 100644 --- a/modules/nf-core/simpleaf/quant/environment.yml +++ b/modules/nf-core/simpleaf/quant/environment.yml @@ -1,8 +1,9 @@ channels: - - conda-forge - bioconda + - conda-forge dependencies: - - bioconda::alevin-fry=0.8.2 - - bioconda::salmon=1.10.2 - - bioconda::simpleaf=0.15.1 + - bioconda::alevin-fry=0.11.1 + - bioconda::piscem=0.11.0 + - bioconda::salmon=1.10.3 + - bioconda::simpleaf=0.18.4 diff --git a/modules/nf-core/simpleaf/quant/main.nf b/modules/nf-core/simpleaf/quant/main.nf index 3dede7ea9a2..818f514b53d 100644 --- a/modules/nf-core/simpleaf/quant/main.nf +++ b/modules/nf-core/simpleaf/quant/main.nf @@ -1,42 +1,45 @@ process SIMPLEAF_QUANT { - tag "$meta.id" + tag "${meta.id ?: meta4.id}" label 'process_medium' conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/simpleaf:0.15.1--h4ac6f70_0': - 'biocontainers/simpleaf:0.15.1--h4ac6f70_0' }" + 'https://depot.galaxyproject.org/singularity/simpleaf:0.18.4--ha6fb395_1': + 'biocontainers/simpleaf:0.18.4--ha6fb395_1' }" input: // // Input reads are expected to come as: [ meta, [ pair1_read1, pair1_read2, pair2_read1, pair2_read2 ] ] // Input array for a sample is created in the same order reads appear in samplesheet as pairs from replicates are appended to array. 
// - tuple val(meta), val(chemistry), path(reads) - tuple val(meta2), path(index) - tuple val(meta3), path(txp2gene) - val resolution - tuple val(meta4), path(whitelist) + tuple val(meta), val(chemistry), path(reads) // chemistry and reads + tuple val(meta2), path(index), path(txp2gene) // index and t2g mapping + tuple val(meta3), val(cell_filter), val(number_cb), path(cb_list) // cell filtering strategy + val resolution // UMI resolution + tuple val(meta4), path(map_dir) // mapping results output: - tuple val(meta), path("${prefix}"), emit: results - path "versions.yml" , emit: versions + tuple val(meta), path("${prefix}/af_map") , emit: map, optional: true // missing if map_dir is provided + tuple val(meta), path("${prefix}/af_quant") , emit: quant + path "versions.yml" , emit: versions when: task.ext.when == null || task.ext.when script: def args = task.ext.args ?: '' - def args_list = args.tokenize() prefix = task.ext.prefix ?: "${meta.id}" - unfiltered_command = "" - if (whitelist) { - unfiltered_command = "-u <(gzip -dcf ${whitelist})" - } + // The first required input is either a mapping result directory, or the reads and index files for mapping. + mapping_args = mappingArgs(chemistry, reads, index, txp2gene, map_dir) + + // The second required input is a cell filtering strategy. + cf_option = cellFilteringArgs(cell_filter, number_cb, cb_list) + + meta = map_dir ? meta4 : meta2 + meta3 + meta + meta = meta + [ "filtered": cell_filter != "unfiltered-pl" ] // separate forward from reverse pairs - def (forward, reverse) = reads.collate(2).transpose() """ # export required var export ALEVIN_FRY_HOME=. @@ -46,43 +49,110 @@ process SIMPLEAF_QUANT { # run simpleaf quant simpleaf quant \\ - -i ${index} \\ - -1 ${forward.join( "," )} \\ - -2 ${reverse.join( "," )} \\ - -c $chemistry \\ - -r $resolution \\ - -o ${prefix} \\ - -t $task.cpus \\ - -m $txp2gene \\ - $unfiltered_command \\ - $args - - [[ ! 
-f ${prefix}/af_quant/all_freq.bin ]] && cp ${prefix}/af_quant/permit_freq.bin ${prefix}/af_quant/all_freq.bin + $mapping_args \\ + --resolution ${resolution} \\ + --output ${prefix} \\ + --threads ${task.cpus} \\ + ${cf_option} \\ + ${args} cat <<-END_VERSIONS > versions.yml "${task.process}": - simpleaf: \$(simpleaf -V | tr -d '\\n' | cut -d ' ' -f 2) + alevin-fry: \$(alevin-fry --version | sed -e "s/alevin-fry //g") + piscem: \$(piscem --version | sed -e "s/piscem //g") salmon: \$(salmon --version | sed -e "s/salmon //g") + simpleaf: \$(simpleaf --version | sed -e "s/simpleaf //g") END_VERSIONS """ stub: prefix = task.ext.prefix ?: "${meta.id}" """ + export ALEVIN_FRY_HOME=. + mkdir -p ${prefix}/af_map mkdir -p ${prefix}/af_quant/alevin touch ${prefix}/af_map/map.rad touch ${prefix}/af_map/unmapped_bc_count.bin touch ${prefix}/af_quant/alevin/quants_mat_rows.txt - touch ${prefix}/af_quant/all_freq.bin touch ${prefix}/af_quant/map.collated.rad touch ${prefix}/af_quant/permit_freq.bin cat <<-END_VERSIONS > versions.yml "${task.process}": - simpleaf: \$(simpleaf -V | tr -d '\\n' | cut -d ' ' -f 2) + alevin-fry: \$(alevin-fry --version | sed -e "s/alevin-fry //g") + piscem: \$(piscem --version | sed -e "s/piscem //g") salmon: \$(salmon --version | sed -e "s/salmon //g") + simpleaf: \$(simpleaf --version | sed -e "s/simpleaf //g") END_VERSIONS """ } + +// We have mutual exclusive options for permit list generation. +// 1. 'k' (knee), which is a flag for the knee method and any value provided will be ignored; +// 2. 'f' (forced-cells), which takes an integer indicating the exact number of cells to recover; +// 3. 'e' (expect-cells), which takes an integer indicating the expected number of cells to recover; +// 4. 'x' (explicit-pl), which takes a string indicating the path to a valid permit list; +// 5. 'u' (unfiltered-pl), which takes an empty string (if `chemistry` is defined as "10xv2" or "10xv3"), or a string indicating the path to a valid white list file. 
+// The difference between (4) and (5) is that (4) contains the exact permit list to filter the observed barcodes, while (5) will use the white list to generate a permit list via barcode correction. + +// The cell filtering strategy is taken from the `cell_filter`, `number_cb` and `cb_list` process inputs and is translated into exactly one simpleaf option by the function below: +// 1. 'forced-cells' and 'expect-cells' require `number_cb`, 'explicit-pl' uses `cb_list`, 'knee' takes no value, and +// 2. 'unfiltered-pl' passes `cb_list` when it is provided and is emitted as a bare flag when `cb_list` is empty. + +def cellFilteringArgs(cell_filter_method, number_cb, cb_list) { + def pl_options = ["knee", "forced-cells", "explicit-pl", "expect-cells", "unfiltered-pl"] + + // normalize unintentional underscores in the method name; an empty or missing method is left empty so the check below reports it + def method = cell_filter_method ? cell_filter_method.replaceAll('_','-') : '' + + def number = number_cb + if (!method) { + error "No cell filtering method was provided; cannot proceed." + } else if (!(method in pl_options)) { + error "Invalid cell filtering method, '${method}', was provided; cannot proceed. possible options are ${pl_options.join(',')}." + } + + if (method == "unfiltered-pl") { + return cb_list ? "--${method} ${cb_list}" : "--${method}" + } else if (method == "explicit-pl") { + return "--${method} ${cb_list}" + } else if (method == "knee") { + return "--${method}" + } else { + if (!number) { + error "Could not find the corresponding 'number' field for the cell filtering method '${method}'; please use the following format: [method:'${method}',number:3000]." + } + return "--${method} ${number}" + } +} + +def mappingArgs(chemistry, reads, index, txp2gene, map_dir) { + if ( map_dir ) { + if (reads) { + error "Found both reads and map_dir. Please provide only one of the two." + } + return "--map-dir ${map_dir}" + } else { + if (!reads) { + error "Missing read files; could not proceed." + } + if (!index) { + error "Missing index files; could not proceed." + } + if (!chemistry) { + error "Missing chemistry; could not proceed."
+ } + + def (forward, reverse) = reads.collate(2).transpose() + + def t2g = txp2gene ? "--t2g-map ${txp2gene}" : "" + def mapping_args = """${t2g} \\ + --chemistry ${chemistry} \\ + --index ${index} \\ + --reads1 ${forward.join( "," )} \\ + --reads2 ${reverse.join( "," )}""" + return mapping_args + } +} diff --git a/modules/nf-core/simpleaf/quant/meta.yml b/modules/nf-core/simpleaf/quant/meta.yml index 79d7b158e2b..7d51dc8c612 100644 --- a/modules/nf-core/simpleaf/quant/meta.yml +++ b/modules/nf-core/simpleaf/quant/meta.yml @@ -1,4 +1,3 @@ -# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/yaml-schema.json name: simpleaf_quant description: simpleaf is a program to simplify and customize the running and configuration of single-cell processing with alevin-fry. @@ -9,7 +8,7 @@ keywords: tools: - simpleaf: description: | - SimpleAF is a tool for quantification of gene expression from RNA-seq data + SimpleAF is a program to simplify and customize the running and configuration of single-cell processing with alevin-fry. homepage: https://github.com/COMBINE-lab/simpleaf licence: ["BSD-3-Clause"] identifier: "" @@ -22,56 +21,82 @@ input: - chemistry: type: string description: | - Chemistry used for library preparation. It can be a string describing - the specific chemistry or the geometry of the barcode, UMI, and - mappable read. For example, "10xv2" and "10xv3" will apply the - appropriate settings for 10x Chromium v2 and v3 protocols, - respectively. Alternatively, you can provide a general geometry string - if your chemistry is not pre-registered. For example, instead of - "10xv2", you could use "1{b[16]u[10]x:}2{r:}", or instead of "10xv3", - you could use "1{b[16]u[12]x:}2{r:}". + Chemistry used for library preparation. It can be a string describing the specific chemistry or the geometry of the barcode, UMI, and mappable read. 
For example, "10xv2" and "10xv3" will apply the appropriate settings for 10x Chromium v2 and v3 protocols, respectively. Alternatively, you can provide a general geometry string if your chemistry is not pre-registered. For example, instead of "10xv2", you could use "1{b[16]u[10]x:}2{r:}", or instead of "10xv3", you could use "1{b[16]u[12]x:}2{r:}". Details at https://hackmd.io/@PI7Og0l1ReeBZu_pjQGUQQ/rJMgmvr13 - reads: type: file description: | List of input FastQ files for paired-end data. Reads should be grouped by pairs. + For example, [ pair1_read1.fastq.gz, pair1_read2.fastq.gz, pair2_read1.fastq.gz, pair2_read2.fastq.gz ] - - meta2: type: map description: | Groovy Map containing index information + e.g. [ tool:'piscem' ] - index: type: directory - description: Folder containing the index files + description: Folder containing the index files. For a *salmon* index that is + not generated by simpleaf to be taken, '--no-piscem' MUST be specified in + ext.args. + - txp2gene: + type: file + description: | + File mapping transcripts to genes. It can be either a two-column TSV file for a standard transcriptomic index containing the transcript-to-gene ID mapping information, or a three-column TSV file for an augmented transcriptomic index with the third column representing the splicing status of each transcript. - - meta3: type: map description: | Groovy Map containing txp2gene information - - txp2gene: + e.g. [ mode:'usa' ] + - cell_filter: + type: string + enum: ["knee", "forced-cells", "explicit-pl", "expect-cells", "unfiltered-pl"] + description: | + Cell filtering mode. Possible values are 'knee', 'forced-cells', 'expect-cells', 'explicit-pl', and 'unfiltered-pl'. 'forced-cells' and 'expect-cells' additionally require 'number_cb', while 'explicit-pl' and 'unfiltered-pl' use the barcode list provided in the 'cb_list' input. + - number_cb: + type: integer + description: | + Number of cell barcodes to use for cell filtering. Set as empty ('[]') unless 'cell_filter' is set to 'forced-cells' or 'expect-cells'.
+ - cb_list: type: file description: | - File mapping transcripts to genes. + File containing a list of cell barcodes to use for cell filtering. Set as empty ('[]') unless 'cell_filter' is set to 'unfiltered-pl' or 'explicit-pl'. - - resolution: type: string description: | - Resolution for the clustering. + UMI resolution (https://alevin-fry.readthedocs.io/en/latest/quant.html). Possible values are 'cr-like', 'cr-like-em', 'parsimony', 'parsimony-em', 'parsimony-gene', and 'parsimony-gene-em'. - - meta4: type: map description: | - Groovy Map containing whitelist information - - whitelist: - type: file - description: | - Whitelist file containing valid cell barcodes. Optional. + Groovy Map containing existing mapping results. + e.g. [ tool:'piscem' ] + - map_dir: + type: directory + description: Folder containing the existing mapping results. It must be generated + by simpleaf or alevin-fry, and contain the mapping file named map.rad. output: - - results: + - map: - meta: type: map description: | Groovy Map containing sample information e.g. [ id:'test', single_end:false ] - - ${prefix}: - type: directory - description: Folder containing the quantification results + pattern: "simpleaf/af_map" + - ${prefix}/af_map: + type: directory + description: | + Folder containing the mapping results generated by simpleaf. + It is missing when map_dir is provided. + - quant: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - ${prefix}/af_quant: + type: directory + description: | + Folder containing the quantification results generated by simpleaf.
+ The count matrix is stored in its 'alevin' sub-folder. - versions: - versions.yml: type: file @@ -83,9 +108,11 @@ authors: - "@Khajidu" - "@apeltzer" - "@pinin4fjords" + - "@dongzehe" maintainers: - "@fmalmeida" - "@maxulysse" - "@Khajidu" - "@apeltzer" - "@pinin4fjords" + - "@dongzehe" diff --git a/modules/nf-core/simpleaf/quant/tests/main.nf.test b/modules/nf-core/simpleaf/quant/tests/main.nf.test index 72e3ef7ce1f..f4cdfc1edda 100644 --- a/modules/nf-core/simpleaf/quant/tests/main.nf.test +++ b/modules/nf-core/simpleaf/quant/tests/main.nf.test @@ -21,9 +21,8 @@ nextflow_process { gtf = file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/genome.gtf', checkIfExists: true) meta = [ 'id': 'human'] - input[0] = Channel.of([meta, genome_fasta]) - input[1] = Channel.of([meta, gtf]) - input[2] = Channel.of([[],[]]) + input[0] = Channel.of([meta, genome_fasta, gtf]) + input[1] = Channel.of([[],[]]) """ } } @@ -31,7 +30,7 @@ nextflow_process { test("test_simpleaf_quant") { when { - config "./nextflow.config" + // config "./nextflow.config" process { """ meta = [id:'test_10x', single_end:false, strandedness:'auto'] @@ -40,8 +39,8 @@ nextflow_process { file(params.modules_testdata_base_path + 'genomics/homo_sapiens/10xgenomics/cellranger/5k_cmvpos_tcells/fastqs/gex_1/subsampled_5k_human_antiCMV_T_TBNK_connect_GEX_1_S1_L001_R2_001.fastq.gz', checkIfExists: true) ] input[0] = Channel.of([meta, '10xv3', files]) - input[1] = SIMPLEAF_INDEX.out.index - input[2] = SIMPLEAF_INDEX.out.transcript_tsv + input[1] = SIMPLEAF_INDEX.out.index.combine(SIMPLEAF_INDEX.out.t2g, by: 0) + input[2] = [[],"knee",[],[]] input[3] = Channel.of('cr-like') input[4] = Channel.of([[],[]]) """ @@ -51,16 +50,27 @@ nextflow_process { then { assertAll( { assert process.success }, - { assert snapshot( - process.out.versions, - path("${process.out.results[0][1]}/af_map/map.rad"), - path("${process.out.results[0][1]}/af_map/unmapped_bc_count.bin"), -
path("${process.out.results[0][1]}/af_quant/alevin/quants_mat_rows.txt"), - path("${process.out.results[0][1]}/af_quant/alevin/quants_mat_rows.txt"), - path("${process.out.results[0][1]}/af_quant/all_freq.bin"), - path("${process.out.results[0][1]}/af_quant/map.collated.rad"), - path("${process.out.results[0][1]}/af_quant/permit_freq.bin")) - .match() } + { assert snapshot(process.out.versions).match() }, + { assert file("${process.out.map.get(0).get(1)}/map.rad").exists() }, + { assert file("${process.out.map.get(0).get(1)}/map_info.json").exists() }, + { assert file("${process.out.map.get(0).get(1)}/unmapped_bc_count.bin").exists() }, + { assert file("${process.out.quant.get(0).get(1)}/permit_map.bin").exists() }, + { assert file("${process.out.quant.get(0).get(1)}/collate.json").exists() }, + { assert file("${process.out.quant.get(0).get(1)}/generate_permit_list.json").exists() }, + { assert file("${process.out.quant.get(0).get(1)}/quant.json").exists() }, + { assert file("${process.out.quant.get(0).get(1)}/featureDump.txt").exists() }, + { assert file("${process.out.quant.get(0).get(1)}/permit_freq.bin").exists() }, + { assert file("${process.out.quant.get(0).get(1)}/unmapped_bc_count_collated.bin").exists() }, + { assert file("${process.out.quant.get(0).get(1)}/alevin/quants_mat.mtx").exists() }, + { assert file("${process.out.quant.get(0).get(1)}/alevin/quants_mat_cols.txt").exists() }, + { assert file("${process.out.quant.get(0).get(1)}/alevin/quants_mat_rows.txt").exists() }, + // { assert snapshot( + // process.out.versions, + // path("${process.out.map.get(0).get(1)}/map.rad"), + // path("${process.out.map.get(0).get(1)}/unmapped_bc_count.bin"), + // path("${process.out.quant.get(0).get(1)}/map.collated.rad"), + // path("${process.out.quant.get(0).get(1)}/featureDump.txt")) + // .match() } ) } @@ -68,9 +78,9 @@ nextflow_process { test("test_simpleaf_quant stub") { options "-stub-run" + // config "./nextflow.config" when { - config "./nextflow.config" 
process { """ meta = [id:'test_10x', single_end:false, strandedness:'auto'] @@ -79,8 +89,8 @@ nextflow_process { file(params.modules_testdata_base_path + 'genomics/homo_sapiens/10xgenomics/cellranger/5k_cmvpos_tcells/fastqs/gex_1/subsampled_5k_human_antiCMV_T_TBNK_connect_GEX_1_S1_L001_R2_001.fastq.gz', checkIfExists: true) ] input[0] = Channel.of([meta, '10xv3', files]) - input[1] = SIMPLEAF_INDEX.out.index - input[2] = SIMPLEAF_INDEX.out.transcript_tsv + input[1] = SIMPLEAF_INDEX.out.index.combine(SIMPLEAF_INDEX.out.t2g, by: 0) + input[2] = [[],"knee",[],[]] input[3] = Channel.of('cr-like') input[4] = Channel.of([[],[]]) """ @@ -98,4 +108,3 @@ nextflow_process { } - diff --git a/modules/nf-core/simpleaf/quant/tests/main.nf.test.snap b/modules/nf-core/simpleaf/quant/tests/main.nf.test.snap index b15e6c24a22..874b8151bfa 100644 --- a/modules/nf-core/simpleaf/quant/tests/main.nf.test.snap +++ b/modules/nf-core/simpleaf/quant/tests/main.nf.test.snap @@ -3,6 +3,19 @@ "content": [ { "0": [ + [ + { + "id": "test_10x", + "single_end": false, + "strandedness": "auto" + }, + [ + "map.rad:md5,d41d8cd98f00b204e9800998ecf8427e", + "unmapped_bc_count.bin:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + ], + "1": [ [ { "id": "test_10x", @@ -11,24 +24,30 @@ }, [ [ - "map.rad:md5,d41d8cd98f00b204e9800998ecf8427e", - "unmapped_bc_count.bin:md5,d41d8cd98f00b204e9800998ecf8427e" + "quants_mat_rows.txt:md5,d41d8cd98f00b204e9800998ecf8427e" ], - [ - [ - "quants_mat_rows.txt:md5,d41d8cd98f00b204e9800998ecf8427e" - ], - "all_freq.bin:md5,d41d8cd98f00b204e9800998ecf8427e", - "map.collated.rad:md5,d41d8cd98f00b204e9800998ecf8427e", - "permit_freq.bin:md5,d41d8cd98f00b204e9800998ecf8427e" - ] + "map.collated.rad:md5,d41d8cd98f00b204e9800998ecf8427e", + "permit_freq.bin:md5,d41d8cd98f00b204e9800998ecf8427e" ] ] ], - "1": [ - "versions.yml:md5,f4407c70f91d116f0770585e6af92e99" + "2": [ + "versions.yml:md5,c9a934ed7c246bef3ccccab002db043b" + ], + "map": [ + [ + { + "id": "test_10x", + 
"single_end": false, + "strandedness": "auto" + }, + [ + "map.rad:md5,d41d8cd98f00b204e9800998ecf8427e", + "unmapped_bc_count.bin:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] ], - "results": [ + "quant": [ [ { "id": "test_10x", @@ -37,40 +56,34 @@ }, [ [ - "map.rad:md5,d41d8cd98f00b204e9800998ecf8427e", - "unmapped_bc_count.bin:md5,d41d8cd98f00b204e9800998ecf8427e" + "quants_mat_rows.txt:md5,d41d8cd98f00b204e9800998ecf8427e" ], - [ - [ - "quants_mat_rows.txt:md5,d41d8cd98f00b204e9800998ecf8427e" - ], - "all_freq.bin:md5,d41d8cd98f00b204e9800998ecf8427e", - "map.collated.rad:md5,d41d8cd98f00b204e9800998ecf8427e", - "permit_freq.bin:md5,d41d8cd98f00b204e9800998ecf8427e" - ] + "map.collated.rad:md5,d41d8cd98f00b204e9800998ecf8427e", + "permit_freq.bin:md5,d41d8cd98f00b204e9800998ecf8427e" ] ] ], "versions": [ - "versions.yml:md5,f4407c70f91d116f0770585e6af92e99" + "versions.yml:md5,c9a934ed7c246bef3ccccab002db043b" ] } ], - "timestamp": "2024-01-24T21:22:12.652834351" + "meta": { + "nf-test": "0.9.2", + "nextflow": "24.10.3" + }, + "timestamp": "2025-01-23T02:00:35.55447474" }, "test_simpleaf_quant": { "content": [ [ - "versions.yml:md5,07a8792421448df822587a135097579d" - ], - "map.rad:md5,6a00620e75874acd89f62891803c140c", - "unmapped_bc_count.bin:md5,7d0f401573b121914df1ef036405187c", - "quants_mat_rows.txt:md5,78e92f0584cc4132374ea7f8fcc1bf1f", - "quants_mat_rows.txt:md5,78e92f0584cc4132374ea7f8fcc1bf1f", - "all_freq.bin:md5,ff6a60def164baabaecc05e10b4ac397", - "map.collated.rad:md5,6517d50f1ccd83720dd9c667adac0f2f", - "permit_freq.bin:md5,bfddd006392e272c24849861597c34b4" + "versions.yml:md5,c9a934ed7c246bef3ccccab002db043b" + ] ], - "timestamp": "2024-01-24T21:21:59.445286096" + "meta": { + "nf-test": "0.9.2", + "nextflow": "24.10.3" + }, + "timestamp": "2025-01-23T02:00:28.925349117" } } \ No newline at end of file diff --git a/modules/nf-core/simpleaf/quant/tests/nextflow.config b/modules/nf-core/simpleaf/quant/tests/nextflow.config deleted file mode 100644 
index 7073420ca0e..00000000000 --- a/modules/nf-core/simpleaf/quant/tests/nextflow.config +++ /dev/null @@ -1,5 +0,0 @@ -process { - withName: 'SIMPLEAF_QUANT' { - ext.args = { "--knee" } - } -}