Skip to content

Commit

Permalink
Merge branch 'performance_cw1049' into 'dev'
Browse files Browse the repository at this point in the history
Performance cw1049

See merge request epi2melabs/workflows/wf-single-cell!25
  • Loading branch information
nrhorner committed Oct 26, 2022
2 parents 1c28807 + a0a6d82 commit 3cfbf38
Show file tree
Hide file tree
Showing 31 changed files with 1,122 additions and 1,837 deletions.
14 changes: 13 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,18 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [v0.1.4]
### Fixed
- Fix transcript matrices not in output folder.
### Added
- output of merged bam optional.
- Repeat umap creation with different random states.
### Changed
- Transcript counting Salmon on stringtie-generated transcriptome.
- Several perforance-related reforactorings including reductions in read write operations.
- single_cell_sample_sheet is optional and kit options can be supplied as workflow parameters.


## [v0.1.3]
## Changed
- Better handling of sample_id conflicts in single_cell_sampkle_sheet and fastgingress.
Expand All @@ -13,7 +25,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- kit options can be supplied from command line/config and applied to all samples.

## [v0.1.2]
## Added
### Added
- Transcript x cell matrix output.

## [v0.1.1]
Expand Down
15 changes: 14 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,7 @@ The most useful outputs of the pipeline are likely:
- ``n_plus``: number of reads with the read1-->TSO configuration
- ``n_minus``: number of reads with the TSO'-->read1' configuration

* ``tagged.sorted.bam``: BAM file of alignments to the reference where each alignment contains the following sequence tags
* ``bams``: Folder of bam alignment files where each alignment contains the following sequence tags

- CB: corrected cell barcode sequence
- CR: uncorrected cell barcode sequence
Expand All @@ -141,13 +141,26 @@ The most useful outputs of the pipeline are likely:
- UR: uncorrected UMI sequence
- UY: Phred quality scores of the uncorrected UMI sequence


The bam files are output per chromosome (default) unless `--merge_bam` is set.

* ``gene_expression.processed.tsv``: TSV containing the gene (rows) x cell (columns) expression matrix, processed and normalized according to:

- ``matrix_min_genes``: cells with fewer than this number of expressed genes will be removed
- ``matrix_min_cells``: genes present in fewer than this number of cells will be removed
- ``matrix_max_mito``: cells with more than this percentage of counts belonging to mitochondrial genes will be removed
- ``matrix_norm_count``: normalize all cells to this number of total counts per cell

* ``umap``:
This folder contains umap projections and the data file used to generate them.
As UMAP is a stochastic algorithm, different runs with using the same parameters can lead
to different results. Therefore for each expression results matrix, 10 UMAP matrices and plots are gerenated each with a different initial random state. The following UMAP results are predrnt in this folder:
- \*genes\*.png: UMAP derived from the expression of all genes across the cells.
- \*gene.{gene_name}\*.png The same plot as above but coulored by gene expression for each gene in the file defined by `umap_plot_genes`.
- \*transcripts\*.png UMAP created from expression level of all transcripts.
- \*mitochondrial\*.png: UMAP created from expression level of all mitochondrial genes.



* ``processed_transcript_matrix.tsv``: TSV containing the transcript (rows) x cell (columns) expression matrix in transcript per million (TPM):
These expression values are determined by applying [stringtie](https://ccb.jhu.edu/software/stringtie/) and
Expand Down
17 changes: 8 additions & 9 deletions bin/adapter_scan_vsearch.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import logging
import multiprocessing
import os
import pathlib
from pathlib import Path
import shutil
import subprocess
import sys
Expand All @@ -27,22 +27,22 @@ def parse_args():
parser = argparse.ArgumentParser()

# Positional mandatory arguments
parser.add_argument("fastq", help="FASTQ of ONT reads", type=str)
parser.add_argument("fastq", help="FASTQ of ONT reads", type=Path)

# Optional arguments
parser.add_argument(
"--output_fastq",
help="Output file name for (gzipped) stranded FASTQ entries \
[stranded.fastq.gz]",
type=str,
type=Path,
default="stranded.fastq.gz",
)

parser.add_argument(
"--output_tsv",
help="Output file name for adapter configurations \
[adapters.tsv]",
type=str,
type=Path,
default="adapters.tsv",
)

Expand All @@ -66,7 +66,6 @@ def parse_args():
determines which adapter sequences to search for in the reads \
[3prime]",
default="3prime",
type=str,
)

parser.add_argument(
Expand All @@ -91,7 +90,7 @@ def parse_args():
"--adapters_fasta",
help="Filename for adapter query sequences \
[adapter_seqs.fasta]",
type=str,
type=Path,
default="adapter_seqs.fasta",
)

Expand Down Expand Up @@ -125,7 +124,7 @@ def parse_args():
args.adapter2_seq = "GTACTCTGCGTTGATACCACTGCTT"

# Create temp dir and add that to the args object
p = pathlib.Path(args.output_tsv)
p = Path(args.output_tsv)
tempdir = tempfile.TemporaryDirectory(prefix="tmp.", dir=p.parents[0])
args.tempdir = tempdir.name

Expand Down Expand Up @@ -487,10 +486,10 @@ def write_stranded_fastq(tmp_fastq, read_info, args):

def open_fastq(fastq):
"""Open fastq."""
if args.fastq.split(".")[-1] == "gz":
if fastq.suffix == ".gz":
f = gzip.open(fastq, "rt")
else:
f = open(fastq)
f = fastq.open()
return f


Expand Down
66 changes: 32 additions & 34 deletions bin/add_gene_tags.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import argparse
import logging
import os
from pathlib import Path

import numpy as np
import pysam
Expand All @@ -17,22 +18,22 @@ def parse_args():
parser = argparse.ArgumentParser()

# Positional mandatory arguments
parser.add_argument("bam", help="Sorted BAM file", type=str)
parser.add_argument("bam", help="Sorted BAM file", type=Path)

parser.add_argument(
"gene_assigns",
help="TSV read/gene assignments file. \
IMPORTANT: reads in the input BAM and gene_assigns file must have the \
same order.",
type=str,
type=Path,
)

# Optional arguments
parser.add_argument(
"--output",
help="Output BAM file containing aligned reads with gene name \
tags (GN) [gene.sorted.bam]",
type=str,
type=Path,
default="gene.sorted.bam",
)

Expand Down Expand Up @@ -67,11 +68,11 @@ def get_bam_info(bam):
:return: Sum of all alignments in the BAM index file and list of all chroms
:rtype: int,list
"""
bam = pysam.AlignmentFile(bam, "rb")
stats = bam.get_index_statistics()
n_aligns = int(np.sum([contig.mapped for contig in stats]))
chroms = [contig.contig for contig in stats]
bam.close()
with pysam.AlignmentFile(bam, "rb") as bam:
stats = bam.get_index_statistics()
n_aligns = int(np.sum([contig.mapped for contig in stats]))
chroms = [contig.contig for contig in stats]

return n_aligns, chroms


Expand All @@ -88,32 +89,29 @@ def process_bam_entries(args):
"""
n_reads, chroms = get_bam_info(args.bam)

bam = pysam.AlignmentFile(args.bam, "rb")
bam_out = pysam.AlignmentFile(args.output, "wb", template=bam)

# If input BAM file is empty or there are no gene assignments,
# write an empty output BAM file
if (n_reads == 0) or (os.path.getsize(args.gene_assigns) == 0):
pass
else:
logger.info(f"Adding gene tags (GN) to {args.bam}")
with open(args.gene_assigns, "r") as f:
for align in tqdm(bam.fetch(), total=n_reads):
line = f.readline().strip()
gene_assigns_id = line.split("\t")[0]
gene_assigns_gene = line.split("\t")[3]

assert (
gene_assigns_id == align.query_name
), "BAM and featureCounts reads not ordered"

# Annotated gene name = GN:Z
align.set_tag("GN", gene_assigns_gene, value_type="Z")

bam_out.write(align)

bam.close()
bam_out.close()
with pysam.AlignmentFile(args.bam, "rb") as bam:
with pysam.AlignmentFile(args.output, "wb", template=bam) as bam_out:

# If input BAM file is empty or there are no gene assignments,
# write an empty output BAM file
if (n_reads == 0) or (os.path.getsize(args.gene_assigns) == 0):
pass
else:
logger.info(f"Adding gene tags (GN) to {args.bam}")
with open(args.gene_assigns, "r") as f:
for align in tqdm(bam.fetch(), total=n_reads):
line = f.readline().strip()
gene_assigns_id = line.split("\t")[0]
gene_assigns_gene = line.split("\t")[3]

assert (
gene_assigns_id == align.query_name
), "BAM and featureCounts reads not ordered"

# Annotated gene name = GN:Z
align.set_tag("GN", gene_assigns_gene, value_type="Z")

bam_out.write(align)


def main(args):
Expand Down
Loading

0 comments on commit 3cfbf38

Please sign in to comment.