Skip to content

Commit

Permalink
Merge branch 'CW-3318_publish_transcriptome_files' into 'dev'
Browse files Browse the repository at this point in the history
Publish transcriptome files

Closes CW-3318

See merge request epi2melabs/workflows/wf-single-cell!145
  • Loading branch information
nrhorner committed Feb 6, 2024
2 parents af379d5 + d7b2f85 commit 890cab0
Show file tree
Hide file tree
Showing 8 changed files with 71 additions and 16 deletions.
4 changes: 2 additions & 2 deletions .github/ISSUE_TEMPLATE/bug_report.yml
Original file line number Diff line number Diff line change
Expand Up @@ -128,8 +128,8 @@ body:
label: Were you able to successfully run the latest version of the workflow with the demo data?
description: For CLI execution, were you able to successfully run the workflow using the demo data available in the [Install and run](./README.md#install-and-run) section of the `README.md`? For execution in the EPI2ME application, were you able to successfully run the workflow via the "Use demo data" button?
options:
- yes
- no
- 'yes'
- 'no'
- other (please describe below)
validations:
required: true
Expand Down
4 changes: 3 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,9 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [Unreleased]
## [v1.0.3]
### Added
- Publish stringtie transcriptome fasta and GFF files to output dir.
### Fixed
- More informative error message upon read duplicate detection.
### Updated
Expand Down
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -214,6 +214,8 @@ Output files may be aggregated including information for all samples or provided
| Alignment index per chromosome | ./{{ alias }}/bams/{{ alias }}.{{ chromosome }}.tagged.bam.bai | Genomic alignment index file per chromosome. | per-sample |
| Alignment output per sample | ./{{ alias }}/bams/{{ alias }}.tagged.sorted.bam | Genomic alignment output file with aggregated chromosomes (when using --merge_bam). | per-sample |
| Alignment index per sample | ./{{ alias }}/bams/{{ alias }}.tagged.sorted.bam.bai | Genomic alignment index file with aggregated chromosomes (when using --merge_bam). | per-sample |
| Transcriptome sequence | ./{{ alias }}/{{ alias }}.transcriptome.fa.gz | Transcriptome generated by Stringtie during transcript discovery stage | per-sample |
| Transcriptome annotation | ./{{ alias }}/{{ alias }}.transcriptome.gff.gz | Transcriptome annotation generated by Stringtie during transcript discovery stage | per-sample |



Expand Down
2 changes: 2 additions & 0 deletions docs/07_outputs.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,3 +19,5 @@ Output files may be aggregated including information for all samples or provided
| Alignment index per chromosome | ./{{ alias }}/bams/{{ alias }}.{{ chromosome }}.tagged.bam.bai | Genomic alignment index file per chromosome. | per-sample |
| Alignment output per sample | ./{{ alias }}/bams/{{ alias }}.tagged.sorted.bam | Genomic alignment output file with aggregated chromosomes (when using --merge_bam). | per-sample |
| Alignment index per sample | ./{{ alias }}/bams/{{ alias }}.tagged.sorted.bam.bai | Genomic alignment index file with aggregated chromosomes (when using --merge_bam). | per-sample |
| Transcriptome sequence | ./{{ alias }}/{{ alias }}.transcriptome.fa.gz | Transcriptome generated by Stringtie during transcript discovery stage | per-sample |
| Transcriptome annotation | ./{{ alias }}/{{ alias }}.transcriptome.gff.gz | Transcriptome annotation generated by Stringtie during transcript discovery stage | per-sample |
2 changes: 1 addition & 1 deletion main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,7 @@ process output {
publishDir "${params.out_dir}", mode: 'copy', pattern: "*umap*.{tsv,png}",
saveAs: { filename -> "${meta.alias}/umap/$filename" }
publishDir "${params.out_dir}", mode: 'copy',
pattern: "*{images,counts,gene_expression,transcript_expression,kneeplot,saturation,config,tags,whitelist}*",
pattern: "*{images,counts,gene_expression,transcript_expression,kneeplot,saturation,config,tags,whitelist,transcriptome,annotation}*",
saveAs: { filename -> "${meta.alias}/$filename" }

input:
Expand Down
2 changes: 1 addition & 1 deletion nextflow.config
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ manifest {
description = 'Identification of cell- and UMI barcodes from single-cell sequencing.'
mainScript = 'main.nf'
nextflowVersion = '>=23.04.2'
version = '1.0.2'
version = '1.0.3'
}

epi2melabs {
Expand Down
16 changes: 16 additions & 0 deletions output_definition.json
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,22 @@
"mime-type": "application/gzip",
"optional": true,
"type": "per-sample"
},
"transcriptome_fasta": {
"filepath": "./{{ alias }}/{{ alias }}.transcriptome.fa.gz",
"title": "Transcriptome sequence",
"description": "Transcriptome generated by Stringtie during transcript discovery stage",
"mime-type": "application/gzip",
"optional": false,
"type": "per-sample"
},
"transcriptome_annotation": {
"filepath": "./{{ alias }}/{{ alias }}.transcriptome.gff.gz",
"title": "Transcriptome annotation",
"description": "Transcriptome annotation generated by Stringtie during transcript discovery stage",
"mime-type": "application/gzip",
"optional": false,
"type": "per-sample"
}
}
}
55 changes: 44 additions & 11 deletions subworkflows/process_bams.nf
Original file line number Diff line number Diff line change
Expand Up @@ -244,7 +244,6 @@ process combine_uncorrect_bcs {
}



process combine_chrom_bams {
// Merge all chromosome bams by sample_id
label "singlecell"
Expand Down Expand Up @@ -284,30 +283,32 @@ process stringtie {
output:
tuple val(meta),
val(chr),
path("transcriptome.fa"),
path("${meta.alias}.transcriptome.fa"),
path("chr.gtf"),
path("stringtie.gff"),
path("${meta.alias}.stringtie.gff"),
path("reads.fastq"),
emit: read_tr_map
script:
if (meta.kit_name=="5prime")
"""
# Add chromosome label (-l) to generated transcripts
# so we don't get name collisions during file merge later
samtools view -h align.bam ${chr} \
| tee >(stringtie -L ${params.stringtie_opts} -p ${task.cpus} -G chr.gtf -l stringtie \
-o stringtie.gff - ) \
| samtools fastq > reads.fastq
| tee >(stringtie -L ${params.stringtie_opts} -p ${task.cpus} -G chr.gtf -l "${chr}.stringtie" \
-o "${meta.alias}.stringtie.gff" - ) \
| samtools fastq > reads.fastq
# Get transcriptome sequence
gffread -g ref_genome.fa -w transcriptome.fa stringtie.gff
gffread -g ref_genome.fa -w "${meta.alias}.transcriptome.fa" "${meta.alias}.stringtie.gff"
"""
else
"""
"""
# Data from 3prime and multiome kits must be flipped to the transcript strand before building transcriptome.
workflow-glue process_bam_for_stringtie align.bam ${chr} \
| tee >(stringtie -L ${params.stringtie_opts} -p ${task.cpus} -G chr.gtf -l stringtie \
-o stringtie.gff - ) \
| tee >(stringtie -L ${params.stringtie_opts} -p ${task.cpus} -G chr.gtf -l "${chr}.stringtie" \
-o "${meta.alias}.stringtie.gff" - ) \
| samtools fastq > reads.fastq
# Get transcriptome sequence
gffread -g ref_genome.fa -w transcriptome.fa stringtie.gff
gffread -g ref_genome.fa -w "${meta.alias}.transcriptome.fa" "${meta.alias}.stringtie.gff"
"""
}

Expand Down Expand Up @@ -371,6 +372,9 @@ process assign_features {
val(chr),
path("${meta.alias}.${chr}.feature_assigns.tsv"),
emit: feature_assigns
tuple val(meta),
path("gffcompare.annotated.gtf"),
emit: annotation
"""
# gffcompare maps transcript reference IDs to query transcripts.
gffcompare -o gffcompare -r chr.gtf stringtie.gff
Expand Down Expand Up @@ -475,6 +479,27 @@ process umap_reduce_expression_matrix {
"""
}

process merge_transcriptome {
    // Merge the annotated GFFs and transcriptome sequence files of one sample
    // into a single gzip-compressed GFF and a single gzip-compressed FASTA.
    //
    // Input:  meta, a collection of FASTA files and a collection of GFF files.
    //         The files are staged under fasta/ and gffs/ with the `?` in the
    //         stage name standing in for a per-file index.
    // Output: (meta, <alias>.transcriptome.gff.gz, <alias>.transcriptome.fa.gz)
    //         emitted as `merged_annotation`. Note the GFF precedes the FASTA.
    label "singlecell"
    cpus 1
    memory "2GB"
    input:
        tuple val(meta),
              path('fasta/?.fa'),
              path('gffs/?.gff')
    output:
        tuple val(meta),
              path("${meta.alias}.transcriptome.gff.gz"),
              path("${meta.alias}.transcriptome.fa.gz"),
              emit: merged_annotation
    """
    # Concatenate transcriptome files, remove comments (from gff) and compress.
    # The concatenation order is whatever order find returns the files in.
    find fasta/ -name '*.fa' -exec cat {} + | gzip > "${meta.alias}.transcriptome.fa.gz"
    # Comment lines are dropped with sed rather than 'grep -v': grep exits with
    # status 1 when no line is selected, which would abort the task for an
    # empty or comment-only input if the shell runs with pipefail.
    find gffs/ -name '*.gff' -exec cat {} + | sed '/^#/d' | gzip > "${meta.alias}.transcriptome.gff.gz"
    """
}


process pack_images {
label "singlecell"
Expand Down Expand Up @@ -618,6 +643,13 @@ workflow process_bams {
.concat(umi_gene_saturation.out.saturation_curve)
.groupTuple())

merge_transcriptome(
assign_features.out.annotation.groupTuple()
.join(stringtie.out.read_tr_map.groupTuple())
.map{
meta, ann_tr_gff, chr, tr_fa, ref_gtf, str_gff, fastq ->
[meta, tr_fa, ann_tr_gff]})

// Tidy up channels prior to output
proc_expresion_out = process_expression_matrix.out.gene_matrix_processed_tsv
.concat(process_expression_matrix.out.transcript_matrix_processed_tsv)
Expand All @@ -635,6 +667,7 @@ workflow process_bams {
.join(tagged_bams)
.join(combine_uncorrect_bcs.out)
.join(pack_images.out)
.join(merge_transcriptome.out)
.map{it -> it.flatten()}

// Emit separately for use in the report
Expand Down

0 comments on commit 890cab0

Please sign in to comment.