Merge branch 'CW-3318_publish_transcriptome_files' into 'dev'

Publish transcriptome files Closes CW-3318 See merge request epi2melabs/workflows/wf-single-cell!145
epi2me-labs · Feb 6, 2024 · 890cab0 · 890cab0
2 parents af379d5 + d7b2f85
commit 890cab0
Show file tree

Hide file tree

Showing 8 changed files with 71 additions and 16 deletions.
diff --git a/.github/ISSUE_TEMPLATE/bug_report.yml b/.github/ISSUE_TEMPLATE/bug_report.yml
@@ -128,8 +128,8 @@ body:
       label: Were you able to successfully run the latest version of the workflow with the demo data?
       description: For CLI execution, were you able to successfully run the workflow using the demo data available in the [Install and run](./README.md#install-and-run) section of the `README.md`? For execution in the EPI2ME application, were you able to successfully run the workflow via the "Use demo data" button?
       options:
-        - yes
-        - no
+        - 'yes'
+        - 'no'
         - other (please describe below)
     validations:
       required: true

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -4,7 +4,9 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
-## [Unreleased]
+## [v1.0.3]
+### Added
+- Publish the StringTie transcriptome FASTA and GFF files to the output directory.
 ### Fixed
 - More informative error message upon read duplicate detection.
 ### Updated

diff --git a/README.md b/README.md
@@ -214,6 +214,8 @@ Output files may be aggregated including information for all samples or provided
 | Alignment index per chromosome | ./{{ alias }}/bams/{{ alias }}.{{ chromosome }}.tagged.bam.bai | Genomic alignment index file per chromosome. | per-sample |
 | Alignment output per sample | ./{{ alias }}/bams/{{ alias }}.tagged.sorted.bam | Genomic alignment output file with aggregated chromosomes (when using --merge_bam). | per-sample |
 | Alignment index per sample | ./{{ alias }}/bams/{{ alias }}.tagged.sorted.bam.bai | Genomic alignment index file with aggregated chromosomes (when using --merge_bam). | per-sample |
+| Transcriptome sequence | ./{{ alias }}/{{ alias }}.transcriptome.fa.gz | Transcriptome generated by StringTie during the transcript discovery stage. | per-sample |
+| Transcriptome annotation | ./{{ alias }}/{{ alias }}.transcriptome.gff.gz | Transcriptome annotation generated by StringTie during the transcript discovery stage. | per-sample |
 
 
 

diff --git a/docs/07_outputs.md b/docs/07_outputs.md
@@ -19,3 +19,5 @@ Output files may be aggregated including information for all samples or provided
 | Alignment index per chromosome | ./{{ alias }}/bams/{{ alias }}.{{ chromosome }}.tagged.bam.bai | Genomic alignment index file per chromosome. | per-sample |
 | Alignment output per sample | ./{{ alias }}/bams/{{ alias }}.tagged.sorted.bam | Genomic alignment output file with aggregated chromosomes (when using --merge_bam). | per-sample |
 | Alignment index per sample | ./{{ alias }}/bams/{{ alias }}.tagged.sorted.bam.bai | Genomic alignment index file with aggregated chromosomes (when using --merge_bam). | per-sample |
+| Transcriptome sequence | ./{{ alias }}/{{ alias }}.transcriptome.fa.gz | Transcriptome generated by StringTie during the transcript discovery stage. | per-sample |
+| Transcriptome annotation | ./{{ alias }}/{{ alias }}.transcriptome.gff.gz | Transcriptome annotation generated by StringTie during the transcript discovery stage. | per-sample |
diff --git a/main.nf b/main.nf
@@ -114,7 +114,7 @@ process output {
     publishDir "${params.out_dir}", mode: 'copy', pattern: "*umap*.{tsv,png}",
         saveAs: { filename -> "${meta.alias}/umap/$filename" }
     publishDir "${params.out_dir}", mode: 'copy', 
-        pattern: "*{images,counts,gene_expression,transcript_expression,kneeplot,saturation,config,tags,whitelist}*",
+        pattern: "*{images,counts,gene_expression,transcript_expression,kneeplot,saturation,config,tags,whitelist,transcriptome,annotation}*",
         saveAs: { filename -> "${meta.alias}/$filename" }
 
     input:

diff --git a/nextflow.config b/nextflow.config
@@ -75,7 +75,7 @@ manifest {
     description     = 'Identification of cell- and UMI barcodes from single-cell sequencing.'
     mainScript      = 'main.nf'
     nextflowVersion = '>=23.04.2'
-    version         = '1.0.2'
+    version         = '1.0.3'
 }
 
 epi2melabs {

diff --git a/output_definition.json b/output_definition.json
@@ -135,6 +135,22 @@
       "mime-type": "application/gzip",
       "optional": true,
       "type": "per-sample"
+    },
+    "transcriptome_fasta": {
+      "filepath": "./{{ alias }}/{{ alias }}.transcriptome.fa.gz",
+      "title": "Transcriptome sequence",
+      "description": "Transcriptome generated by StringTie during the transcript discovery stage.",
+      "mime-type": "application/gzip",
+      "optional": false,
+      "type": "per-sample"
+    },
+    "transcriptome_annotation": {
+      "filepath": "./{{ alias }}/{{ alias }}.transcriptome.gff.gz",
+      "title": "Transcriptome annotation",
+      "description": "Transcriptome annotation generated by StringTie during the transcript discovery stage.",
+      "mime-type": "application/gzip",
+      "optional": false,
+      "type": "per-sample"
     }
   }
 }
diff --git a/subworkflows/process_bams.nf b/subworkflows/process_bams.nf
@@ -244,7 +244,6 @@ process combine_uncorrect_bcs {
 }
 
 
-
 process combine_chrom_bams {
     // Merge all chromosome bams by sample_id
     label "singlecell"
@@ -284,30 +283,32 @@ process stringtie {
     output:
         tuple val(meta),
               val(chr),
-              path("transcriptome.fa"),
+              path("${meta.alias}.transcriptome.fa"),
               path("chr.gtf"),
-              path("stringtie.gff"),
+              path("${meta.alias}.stringtie.gff"),
               path("reads.fastq"),
               emit: read_tr_map
     script:
     if (meta.kit_name=="5prime")
     """
+    # Add chromosome label (-l) to generated transcripts 
+    # so we don't get name collisions during file merge later 
     samtools view -h align.bam ${chr}  \
-         | tee >(stringtie -L ${params.stringtie_opts} -p ${task.cpus} -G chr.gtf -l stringtie \
-             -o stringtie.gff - ) \
-         | samtools fastq > reads.fastq    
+         | tee >(stringtie -L ${params.stringtie_opts} -p ${task.cpus} -G chr.gtf -l "${chr}.stringtie" \
+             -o "${meta.alias}.stringtie.gff" - ) \
+         | samtools fastq > reads.fastq
     # Get transcriptome sequence
-    gffread -g ref_genome.fa -w transcriptome.fa stringtie.gff
+    gffread -g ref_genome.fa -w "${meta.alias}.transcriptome.fa" "${meta.alias}.stringtie.gff"
     """
     else
-    """
+    """ 
     # Data from 3prime and multiome kits must be flipped to the transcript strand before building transcriptome.
     workflow-glue process_bam_for_stringtie align.bam ${chr}  \
-        | tee >(stringtie -L ${params.stringtie_opts} -p ${task.cpus} -G chr.gtf -l stringtie \
-            -o stringtie.gff - ) \
+        | tee >(stringtie -L ${params.stringtie_opts} -p ${task.cpus} -G chr.gtf -l "${chr}.stringtie" \
+            -o "${meta.alias}.stringtie.gff" - ) \
         | samtools fastq > reads.fastq
     # Get transcriptome sequence
-    gffread -g ref_genome.fa -w transcriptome.fa stringtie.gff
+    gffread -g ref_genome.fa -w "${meta.alias}.transcriptome.fa" "${meta.alias}.stringtie.gff"
     """
 }
 
@@ -371,6 +372,9 @@ process assign_features {
               val(chr),
               path("${meta.alias}.${chr}.feature_assigns.tsv"),
               emit: feature_assigns
+        tuple val(meta),
+              path("gffcompare.annotated.gtf"),
+              emit: annotation
     """
     # gffcompare maps transcript reference IDs to query transcripts.
     gffcompare -o gffcompare -r chr.gtf stringtie.gff
@@ -475,6 +479,27 @@ process umap_reduce_expression_matrix {
     """
 }
 
+process merge_transcriptome {
+    // Build one transcriptome per sample by concatenating the FASTA and annotated GFF files staged under fasta/ and gffs/
+    label "singlecell"
+    cpus 1
+    memory "2GB"
+    input:
+        tuple val(meta),
+            path('fasta/?.fa'),
+            path('gffs/?.gff')
+    output:
+        tuple val(meta),
+            path("${meta.alias}.transcriptome.gff.gz"),
+            path("${meta.alias}.transcriptome.fa.gz"),
+            emit: merged_annotation
+    """
+    # Concatenate and compress; GFF comment lines are dropped with sed, which (unlike grep -v) exits 0 even if no lines remain
+    find fasta/ -name '*.fa' -exec cat {} + | gzip > "${meta.alias}.transcriptome.fa.gz"
+    find gffs/ -name '*.gff' -exec cat {} + | sed '/^#/d' | gzip > "${meta.alias}.transcriptome.gff.gz"
+    """
+}
+
 
 process pack_images {
     label "singlecell"
@@ -618,6 +643,13 @@ workflow process_bams {
        .concat(umi_gene_saturation.out.saturation_curve)
        .groupTuple())
 
+    merge_transcriptome(
+        assign_features.out.annotation.groupTuple()
+            .join(stringtie.out.read_tr_map.groupTuple())
+            .map{
+                meta, ann_tr_gff, chr, tr_fa, ref_gtf, str_gff, fastq  ->
+                [meta, tr_fa, ann_tr_gff]})
+
     // Tidy up channels prior to output
     proc_expresion_out = process_expression_matrix.out.gene_matrix_processed_tsv
         .concat(process_expression_matrix.out.transcript_matrix_processed_tsv)
@@ -635,6 +667,7 @@ workflow process_bams {
             .join(tagged_bams)
             .join(combine_uncorrect_bcs.out)
             .join(pack_images.out)
+            .join(merge_transcriptome.out)
             .map{it -> it.flatten()}
 
         // Emit separately for use in the report