Merge branch 'performance_cw1049' into 'dev'

Performance cw1049 See merge request epi2melabs/workflows/wf-single-cell!25
epi2me-labs · Oct 26, 2022 · 3cfbf38 · 3cfbf38
2 parents 1c28807 + a0a6d82
commit 3cfbf38
Show file tree

Hide file tree

Showing 31 changed files with 1,122 additions and 1,837 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -4,6 +4,18 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [v0.1.4]
+### Fixed
+- Fix transcript matrices not in output folder.
+### Added
+- output of merged bam optional.
+- Repeat umap creation with different random states.
+### Changed 
+- Transcript counting Salmon on stringtie-generated transcriptome.
+- Several perforance-related reforactorings including reductions in read write operations. 
+- single_cell_sample_sheet is optional and kit options can be supplied as workflow parameters.
+
+
 ## [v0.1.3]
 ## Changed
 - Better handling of sample_id conflicts in single_cell_sampkle_sheet and fastgingress.
@@ -13,7 +25,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - kit options can be supplied from command line/config and applied to all samples. 
 
 ## [v0.1.2]
-## Added
+### Added
 - Transcript x cell matrix output.
 
 ## [v0.1.1]

diff --git a/README.md b/README.md
@@ -132,7 +132,7 @@ The most useful outputs of the pipeline are likely:
   - ``n_plus``: number of reads with the read1-->TSO configuration
   - ``n_minus``: number of reads with the TSO'-->read1' configuration
 
-* ``tagged.sorted.bam``: BAM file of alignments to the reference where each alignment contains the following sequence tags
+* ``bams``: Folder of bam alignment files where each alignment contains the following sequence tags
 
   - CB: corrected cell barcode sequence
   - CR: uncorrected cell barcode sequence
@@ -141,13 +141,26 @@ The most useful outputs of the pipeline are likely:
   - UR: uncorrected UMI sequence
   - UY: Phred quality scores of the uncorrected UMI sequence
 
+
+  The bam files are output per chromosome (default) unless `--merge_bam` is set.
+
  * ``gene_expression.processed.tsv``:  TSV containing the gene (rows) x cell (columns) expression matrix, processed and normalized according to: 
 
   - ``matrix_min_genes``: cells with fewer than this number of expressed genes will be removed
   - ``matrix_min_cells``: genes present in fewer than this number of cells will be removed
   - ``matrix_max_mito``: cells with more than this percentage of counts belonging to mitochondrial genes will be removed
   - ``matrix_norm_count``: normalize all cells to this number of total counts per cell
 
+* ``umap``: 
+  This folder contains umap projections and the data file used to generate them.
+  As UMAP is a stochastic algorithm, different runs with using the same parameters can lead
+  to different results. Therefore for each expression results matrix, 10 UMAP matrices and plots are gerenated each with a different initial random state. The following UMAP results are predrnt in this folder:
+  - \*genes\*.png: UMAP derived from the expression of all genes across the cells.
+  - \*gene.{gene_name}\*.png The same plot as above but coulored by gene expression for each gene in the file defined by `umap_plot_genes`.
+  - \*transcripts\*.png UMAP created from expression level of all transcripts.
+  - \*mitochondrial\*.png:  UMAP created from expression level of all mitochondrial genes.
+
+
 
 * ``processed_transcript_matrix.tsv``: TSV containing the transcript (rows) x cell (columns) expression matrix in transcript per million (TPM): 
 These expression values are determined by applying [stringtie](https://ccb.jhu.edu/software/stringtie/) and 

diff --git a/bin/adapter_scan_vsearch.py b/bin/adapter_scan_vsearch.py
@@ -5,7 +5,7 @@
 import logging
 import multiprocessing
 import os
-import pathlib
+from pathlib import Path
 import shutil
 import subprocess
 import sys
@@ -27,22 +27,22 @@ def parse_args():
     parser = argparse.ArgumentParser()
 
     # Positional mandatory arguments
-    parser.add_argument("fastq", help="FASTQ of ONT reads", type=str)
+    parser.add_argument("fastq", help="FASTQ of ONT reads", type=Path)
 
     # Optional arguments
     parser.add_argument(
         "--output_fastq",
         help="Output file name for (gzipped) stranded FASTQ entries \
                         [stranded.fastq.gz]",
-        type=str,
+        type=Path,
         default="stranded.fastq.gz",
     )
 
     parser.add_argument(
         "--output_tsv",
         help="Output file name for adapter configurations \
                         [adapters.tsv]",
-        type=str,
+        type=Path,
         default="adapters.tsv",
     )
 
@@ -66,7 +66,6 @@ def parse_args():
         determines which adapter sequences to search for in the reads \
         [3prime]",
         default="3prime",
-        type=str,
     )
 
     parser.add_argument(
@@ -91,7 +90,7 @@ def parse_args():
         "--adapters_fasta",
         help="Filename for adapter query sequences \
                         [adapter_seqs.fasta]",
-        type=str,
+        type=Path,
         default="adapter_seqs.fasta",
     )
 
@@ -125,7 +124,7 @@ def parse_args():
         args.adapter2_seq = "GTACTCTGCGTTGATACCACTGCTT"
 
     # Create temp dir and add that to the args object
-    p = pathlib.Path(args.output_tsv)
+    p = Path(args.output_tsv)
     tempdir = tempfile.TemporaryDirectory(prefix="tmp.", dir=p.parents[0])
     args.tempdir = tempdir.name
 
@@ -487,10 +486,10 @@ def write_stranded_fastq(tmp_fastq, read_info, args):
 
 def open_fastq(fastq):
     """Open fastq."""
-    if args.fastq.split(".")[-1] == "gz":
+    if fastq.suffix == ".gz":
         f = gzip.open(fastq, "rt")
     else:
-        f = open(fastq)
+        f = fastq.open()
     return f
 
 

diff --git a/bin/add_gene_tags.py b/bin/add_gene_tags.py
@@ -3,6 +3,7 @@
 import argparse
 import logging
 import os
+from pathlib import Path
 
 import numpy as np
 import pysam
@@ -17,22 +18,22 @@ def parse_args():
     parser = argparse.ArgumentParser()
 
     # Positional mandatory arguments
-    parser.add_argument("bam", help="Sorted BAM file", type=str)
+    parser.add_argument("bam", help="Sorted BAM file", type=Path)
 
     parser.add_argument(
         "gene_assigns",
         help="TSV read/gene assignments file. \
         IMPORTANT: reads in the input BAM and gene_assigns file must have the \
         same order.",
-        type=str,
+        type=Path,
     )
 
     # Optional arguments
     parser.add_argument(
         "--output",
         help="Output BAM file containing aligned reads with gene name \
             tags (GN) [gene.sorted.bam]",
-        type=str,
+        type=Path,
         default="gene.sorted.bam",
     )
 
@@ -67,11 +68,11 @@ def get_bam_info(bam):
     :return: Sum of all alignments in the BAM index file and list of all chroms
     :rtype: int,list
     """
-    bam = pysam.AlignmentFile(bam, "rb")
-    stats = bam.get_index_statistics()
-    n_aligns = int(np.sum([contig.mapped for contig in stats]))
-    chroms = [contig.contig for contig in stats]
-    bam.close()
+    with pysam.AlignmentFile(bam, "rb") as bam:
+        stats = bam.get_index_statistics()
+        n_aligns = int(np.sum([contig.mapped for contig in stats]))
+        chroms = [contig.contig for contig in stats]
+
     return n_aligns, chroms
 
 
@@ -88,32 +89,29 @@ def process_bam_entries(args):
     """
     n_reads, chroms = get_bam_info(args.bam)
 
-    bam = pysam.AlignmentFile(args.bam, "rb")
-    bam_out = pysam.AlignmentFile(args.output, "wb", template=bam)
-
-    # If input BAM file is empty or there are no gene assignments,
-    # write an empty output BAM file
-    if (n_reads == 0) or (os.path.getsize(args.gene_assigns) == 0):
-        pass
-    else:
-        logger.info(f"Adding gene tags (GN) to {args.bam}")
-        with open(args.gene_assigns, "r") as f:
-            for align in tqdm(bam.fetch(), total=n_reads):
-                line = f.readline().strip()
-                gene_assigns_id = line.split("\t")[0]
-                gene_assigns_gene = line.split("\t")[3]
-
-                assert (
-                    gene_assigns_id == align.query_name
-                ), "BAM and featureCounts reads not ordered"
-
-                # Annotated gene name = GN:Z
-                align.set_tag("GN", gene_assigns_gene, value_type="Z")
-
-                bam_out.write(align)
-
-    bam.close()
-    bam_out.close()
+    with pysam.AlignmentFile(args.bam, "rb") as bam:
+        with pysam.AlignmentFile(args.output, "wb", template=bam) as bam_out:
+
+            # If input BAM file is empty or there are no gene assignments,
+            # write an empty output BAM file
+            if (n_reads == 0) or (os.path.getsize(args.gene_assigns) == 0):
+                pass
+            else:
+                logger.info(f"Adding gene tags (GN) to {args.bam}")
+                with open(args.gene_assigns, "r") as f:
+                    for align in tqdm(bam.fetch(), total=n_reads):
+                        line = f.readline().strip()
+                        gene_assigns_id = line.split("\t")[0]
+                        gene_assigns_gene = line.split("\t")[3]
+
+                        assert (
+                            gene_assigns_id == align.query_name
+                        ), "BAM and featureCounts reads not ordered"
+
+                        # Annotated gene name = GN:Z
+                        align.set_tag("GN", gene_assigns_gene, value_type="Z")
+
+                        bam_out.write(align)
 
 
 def main(args):