From 495a595ae4ed7e37182d5f5a17ddcc4455c10af0 Mon Sep 17 00:00:00 2001 From: Kai Zhang Date: Thu, 8 Aug 2024 14:48:33 +0800 Subject: [PATCH] add more qc metrics --- docs/tutorials/pbmc.ipynb | 4 +- snapatac2-core/Cargo.toml | 2 +- snapatac2-core/src/preprocessing/bam.rs | 47 ++++--- .../src/preprocessing/bam/mark_duplicates.rs | 123 +++++++++++++----- .../src/preprocessing/count_data.rs | 20 ++- snapatac2-core/src/preprocessing/mod.rs | 4 +- snapatac2-core/src/preprocessing/qc.rs | 96 +++++++++++++- snapatac2-python/Cargo.lock | 6 +- snapatac2-python/Cargo.toml | 4 +- .../python/snapatac2/metrics/__init__.py | 17 ++- .../python/snapatac2/preprocessing/_basic.py | 23 +++- snapatac2-python/src/lib.rs | 1 - snapatac2-python/src/preprocessing.rs | 26 +--- 13 files changed, 273 insertions(+), 100 deletions(-) diff --git a/docs/tutorials/pbmc.ipynb b/docs/tutorials/pbmc.ipynb index 09b110d4c..92d1bd33b 100644 --- a/docs/tutorials/pbmc.ipynb +++ b/docs/tutorials/pbmc.ipynb @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:db84e3fd44f45535ab365a7a79769d67a44b79c02a7c7f72ed8f7c12279cf0f9 -size 4305218 +oid sha256:4e8c50b5f003428be053431a3dfb2548cefe7bf1dd5766a193b85e53134527cf +size 4305185 diff --git a/snapatac2-core/Cargo.toml b/snapatac2-core/Cargo.toml index f85dffd98..5a1a26c6d 100644 --- a/snapatac2-core/Cargo.toml +++ b/snapatac2-core/Cargo.toml @@ -10,7 +10,7 @@ homepage = "https://github.com/" keywords = ["single-cell", "biology"] [dependencies] -anndata = "0.4.1" +anndata = "0.4.2" anyhow = "1.0" bigtools = { version = "0.4", features = ["read", "write"] } bed-utils = "0.4.1" diff --git a/snapatac2-core/src/preprocessing/bam.rs b/snapatac2-core/src/preprocessing/bam.rs index e9028ef3e..92022f109 100644 --- a/snapatac2-core/src/preprocessing/bam.rs +++ b/snapatac2-core/src/preprocessing/bam.rs @@ -1,5 +1,5 @@ mod mark_duplicates; -pub use mark_duplicates::{filter_bam, group_bam_by_barcode, BarcodeLocation, FlagStat}; +pub use mark_duplicates::{filter_bam, group_bam_by_barcode, BarcodeLocation, FlagStat, LibraryQC}; use bed_utils::bed::BEDLike; use noodles::{bam, sam::alignment::record::data::field::Tag}; @@ -57,7 +57,7 @@ pub fn make_fragment_file, P2: AsRef, P3: AsRef>( compression: Option, compression_level: Option, temp_dir: Option, -) -> Result { +) -> Result { if barcode_regex.is_some() && barcode_tag.is_some() { bail!("Can only set barcode_tag or barcode_regex but not both"); } @@ -87,39 +87,50 @@ pub fn make_fragment_file, P2: AsRef, P3: AsRef>( let spinner = ProgressBar::with_draw_target(None, ProgressDrawTarget::stderr_with_hz(1)) .with_style( ProgressStyle::with_template( - "{spinner} Wrote {human_pos} fragments in {elapsed} ({per_sec}) ...", + "{spinner} Wrote {human_pos} barcodes in {elapsed} ({per_sec}) ...", ) .unwrap(), ); - let mut flagstat = FlagStat::default(); + let mut library_qc = LibraryQC::default(); + let mut num_pcr_duplicates = 0; + let mut num_uniques = 0; + let mut num_barcodes = 0; let filtered_records = filter_bam( reader.records().map(Result::unwrap), is_paired, + &barcode, + umi.as_ref(), mapq, - &mut flagstat, + &mut library_qc, ); group_bam_by_barcode( filtered_records, - &barcode, - umi.as_ref(), is_paired, temp_dir, chunk_size, ) .into_fragments(&header) .progress_with(spinner) - .for_each(|mut rec| { - if rec.strand().is_none() { - let new_start = rec.start().saturating_add_signed(shift_left); - let new_end = rec.end().saturating_add_signed(shift_right); - if new_start < new_end { - rec.set_start(new_start); - rec.set_end(new_end); + .for_each(|barcode| { + num_barcodes += 1; + barcode.into_iter().for_each(|mut rec| { + num_pcr_duplicates += rec.count as u64 - 1; + num_uniques += 1; + if rec.strand().is_none() { + let new_start = rec.start().saturating_add_signed(shift_left); + let new_end = rec.end().saturating_add_signed(shift_right); + if new_start < new_end { + rec.set_start(new_start); + rec.set_end(new_end); + writeln!(output, "{}", rec).unwrap(); + } + } else { writeln!(output, "{}", rec).unwrap(); } - } else { - writeln!(output, "{}", rec).unwrap(); - } + }); }); - Ok(flagstat) + library_qc.num_pcr_duplicates = num_pcr_duplicates; + library_qc.num_uniques = num_uniques; + library_qc.num_barcodes = num_barcodes; + Ok(library_qc) } \ No newline at end of file diff --git a/snapatac2-core/src/preprocessing/bam/mark_duplicates.rs b/snapatac2-core/src/preprocessing/bam/mark_duplicates.rs index c30403291..afef95503 100644 --- a/snapatac2-core/src/preprocessing/bam/mark_duplicates.rs +++ b/snapatac2-core/src/preprocessing/bam/mark_duplicates.rs @@ -115,8 +115,8 @@ pub struct AlignmentInfo { impl AlignmentInfo { fn new( rec: &Record, - barcode_loc: &BarcodeLocation, - umi_loc: Option<&BarcodeLocation>, + barcode: Option, + umi: Option, ) -> Result { let cigar = rec.cigar(); let start: usize = rec.alignment_start().unwrap().unwrap().try_into()?; @@ -147,8 +147,8 @@ impl AlignmentInfo { unclipped_start: alignment_start - clipped_start, unclipped_end: alignment_end + clipped_end, sum_of_qual_scores: sum_of_qual_score(rec), - barcode: barcode_loc.extract(rec).ok(), - umi: umi_loc.and_then(|x| x.extract(rec).ok()), + barcode, + umi, }) } @@ -259,6 +259,57 @@ impl FingerPrint { } } +#[derive(Debug, Default)] +pub struct LibraryQC { + pub all_reads_flagstat: FlagStat, + pub barcoded_reads_flagstat: FlagStat, + pub num_pcr_duplicates: u64, + pub num_uniques: u64, + pub num_barcodes: u64, +} + +impl LibraryQC { + /// Report the quality control metrics. + /// The metrics are: + /// - Sequenced_reads: number of reads in the input BAM file. + /// - Sequenced_read_pairs: number of read pairs in the input BAM file. + /// - Fraction_duplicates: Fraction of high-quality read pairs that are deemed + /// to be PCR duplicates. This metric is a measure of + /// sequencing saturation and is a function of library + /// complexity and sequencing depth. More specifically, + /// this is the fraction of high-quality fragments with a + /// valid barcode that align to the same genomic position + /// as another read pair in the library. + /// - Fraction_unmapped_reads: Fraction of sequenced reads that have + /// a valid barcode but could not be mapped to the genome. + /// - Fraction_unmapped_read_pairs: Fraction of sequenced read pairs that have + /// a valid barcode but could not be mapped to the genome. + /// - Raw_reads_per_cell: Total number of reads divided by the number of cell barcodes. + /// - Raw_read_pairs_per_cell: Total number of read pairs divided by the number of cell barcodes. + /// - Fraction_reads_in_cell: fraction of reads that are associated with a cell barcode. + /// - Fraction_read_pairs_in_cell: fraction of read pairs that are associated with a cell barcode. + pub fn report(&self) -> HashMap { + let mut result = HashMap::new(); + let flagstat_all = &self.all_reads_flagstat; + let flagstat_barcoded = &self.barcoded_reads_flagstat; + let num_reads = flagstat_all.read as f64; + let num_pairs = flagstat_all.read_1.min(flagstat_all.read_2) as f64; + let num_barcoded_reads = flagstat_barcoded.read as f64; + let num_barcoded_pairs = flagstat_barcoded.read_1.min(flagstat_barcoded.read_2) as f64; + let mapped_pairs = flagstat_barcoded.proper_pair as f64 / 2.0; + result.insert("Sequenced_reads".to_string(), num_reads); + result.insert("Sequenced_read_pairs".to_string(), num_pairs); + result.insert("Fraction_duplicates".to_string(), self.num_pcr_duplicates as f64 / (self.num_uniques + self.num_pcr_duplicates) as f64); + result.insert("Fraction_unmapped_reads".to_string(), (num_barcoded_reads - flagstat_barcoded.mapped as f64) / num_barcoded_reads); + result.insert("Fraction_unmapped_read_pairs".to_string(), 1.0 - mapped_pairs / num_barcoded_pairs); + result.insert("Raw_reads_per_cell".to_string(), num_reads / self.num_barcodes as f64); + result.insert("Raw_read_pairs_per_cell".to_string(), num_pairs / self.num_barcodes as f64); + result.insert("Fraction_reads_in_cell".to_string(), num_barcoded_reads / num_reads); + result.insert("Fraction_read_pairs_in_cell".to_string(), num_barcoded_pairs / num_pairs); + result + } +} + /// BAM record statistics. #[derive(Debug, Default)] pub struct FlagStat { @@ -271,6 +322,7 @@ pub struct FlagStat { pub mapped: u64, pub primary_mapped: u64, pub paired: u64, + pub paired_hq: u64, pub read_1: u64, pub read_2: u64, pub proper_pair: u64, @@ -281,7 +333,7 @@ pub struct FlagStat { } impl FlagStat { - pub fn update(&mut self, record: &Record) { + pub fn update(&mut self, record: &Record, is_hq: bool) { let flags = record.flags(); self.read += 1; @@ -311,6 +363,9 @@ impl FlagStat { if flags.is_segmented() { self.paired += 1; + if is_hq { + self.paired_hq += 1; + } if flags.is_first_segment() { self.read_1 += 1; @@ -334,12 +389,7 @@ impl FlagStat { if mat_id != rec_id { self.mate_reference_sequence_id_mismatch += 1; - - let mapq = record - .mapping_quality() - .map_or(255, |x| x.get()); - - if mapq >= 5 { + if is_hq { self.mate_reference_sequence_id_mismatch_hq += 1; } } @@ -354,9 +404,11 @@ impl FlagStat { pub fn filter_bam<'a, I>( reads: I, is_paired: bool, + barcode_loc: &'a BarcodeLocation, + umi_loc: Option<&'a BarcodeLocation>, mapq_filter: Option, - flagstat: &'a mut FlagStat, -) -> impl Iterator + 'a + qc: &'a mut LibraryQC, +) -> impl Iterator + 'a where I: Iterator + 'a, { @@ -367,34 +419,41 @@ where // - read fails platform/vendor quality checks // - read is PCR or optical duplicate let flag_failed = Flags::from_bits(1804).unwrap(); - reads.filter(move |r| { - flagstat.update(r); + reads.filter_map(move |r| { + let is_hq = mapq_filter.map_or(true, |min_q| { + let q = r.mapping_quality().map_or(255, |x| x.get()); + q >= min_q + }); + qc.all_reads_flagstat.update(&r, is_hq); + + let barcode = barcode_loc.extract(&r).ok(); + let umi = umi_loc.and_then(|x| x.extract(&r).ok()); + if barcode.is_some() { + qc.barcoded_reads_flagstat.update(&r, is_hq); + } + let flag = r.flags(); let is_properly_aligned = !flag.is_supplementary() && (!is_paired || flag.is_properly_segmented()); let flag_pass = !flag.intersects(flag_failed); - let mapq_pass = mapq_filter.map_or(true, |min_q| { - let q = r.mapping_quality().map_or(255, |x| x.get()); - q >= min_q - }); - is_properly_aligned && flag_pass && mapq_pass + if is_properly_aligned && flag_pass && is_hq && barcode.is_some() { + let alignment = AlignmentInfo::new(&r, barcode, umi).unwrap(); + Some(alignment) + } else { + None + } }) } /// Sort and group BAM -pub fn group_bam_by_barcode<'a, I, P>( +pub fn group_bam_by_barcode( reads: I, - barcode_loc: &'a BarcodeLocation, - umi_loc: Option<&'a BarcodeLocation>, is_paired: bool, temp_dir: Option

, chunk_size: usize, -) -> RecordGroups< - impl Iterator + 'a, - impl FnMut(&AlignmentInfo) -> String + 'a -> +) -> RecordGroups, impl FnMut(&AlignmentInfo) -> String> where - I: Iterator + 'a, + I: Iterator, P: AsRef, { let mut sorter = ExternalSorterBuilder::new() @@ -403,10 +462,8 @@ where if let Some(tmp) = temp_dir { sorter = sorter.with_tmp_dir(tmp); } - let input = reads.map(move |x| AlignmentInfo::new(&x, barcode_loc, umi_loc).unwrap()) - .filter(|x| x.barcode.is_some()); let groups = sorter.build().unwrap() - .sort_by(input.map(|x| std::io::Result::Ok(x)), |a, b| a.barcode.cmp(&b.barcode) + .sort_by(reads.map(|x| std::io::Result::Ok(x)), |a, b| a.barcode.cmp(&b.barcode) .then_with(|| a.unclipped_start.cmp(&b.unclipped_start)) .then_with(|| a.unclipped_end.cmp(&b.unclipped_end)) ).unwrap() @@ -430,8 +487,8 @@ where I: Iterator, F: FnMut(&AlignmentInfo) -> String, { - pub fn into_fragments<'a>(&'a self, header: &'a Header) -> impl Iterator + '_ { - self.groups.into_iter().flat_map(|(_, rec)| get_unique_fragments(rec, header, self.is_paired)) + pub fn into_fragments<'a>(&'a self, header: &'a Header) -> impl Iterator> + '_ { + self.groups.into_iter().map(|(_, rec)| get_unique_fragments(rec, header, self.is_paired)) } } diff --git a/snapatac2-core/src/preprocessing/count_data.rs b/snapatac2-core/src/preprocessing/count_data.rs index eba1532d8..f1fe05a63 100644 --- a/snapatac2-core/src/preprocessing/count_data.rs +++ b/snapatac2-core/src/preprocessing/count_data.rs @@ -22,7 +22,7 @@ use polars::frame::DataFrame; use nalgebra_sparse::CsrMatrix; use anyhow::{Result, Context, bail}; use num::integer::div_ceil; -use std::str::FromStr; +use std::{str::FromStr, sync::{Arc, Mutex}}; /// The `SnapData` trait represents an interface for reading and /// manipulating single-cell assay data. It extends the `AnnDataOp` trait, @@ -71,12 +71,18 @@ pub trait SnapData: AnnDataOp { /// QC metrics for the data. /// Compute TSS enrichment. - fn tss_enrichment(&self, promoter: &BedTree) -> Result> { - Ok(self.get_count_iter(2000)?.into_fragments().flat_map(|(fragments, _, _)| { - fragments.into_par_iter() - .map(|x| qc::tss_enrichment(x.into_iter(), promoter)) - .collect::>() - }).collect()) + fn tss_enrichment(&self, promoter: &qc::TssRegions) -> Result<(Vec, (f64, f64))> { + let library_tsse = Arc::new(Mutex::new(qc::TSSe::new(promoter))); + let scores = self.get_count_iter(2000)?.into_fragments().flat_map(|(list_of_fragments, _, _)| { + list_of_fragments.into_par_iter().map(|fragments| { + let mut tsse = qc::TSSe::new(promoter); + fragments.into_iter().for_each(|x| tsse.add(&x)); + library_tsse.lock().unwrap().add_from(&tsse); + tsse.result().0 + }).collect::>() + }).collect(); + let tsse = library_tsse.lock().unwrap().result(); + Ok((scores, tsse)) } /// Compute the fragment size distribution. diff --git a/snapatac2-core/src/preprocessing/mod.rs b/snapatac2-core/src/preprocessing/mod.rs index c53707139..a025fc6d7 100644 --- a/snapatac2-core/src/preprocessing/mod.rs +++ b/snapatac2-core/src/preprocessing/mod.rs @@ -7,5 +7,5 @@ pub use count_data::{import_fragments, import_contacts, Promoters, Transcript, create_gene_matrix, create_tile_matrix, create_peak_matrix, GenomeCount, ContactMap, SnapData, }; -pub use bam::{make_fragment_file, FlagStat}; -pub use qc::{Fragment, Contact, CellBarcode, read_tss, make_promoter_map, get_barcode_count}; \ No newline at end of file +pub use bam::{make_fragment_file, FlagStat, LibraryQC}; +pub use qc::{Fragment, Contact, CellBarcode, read_tss, TssRegions, make_promoter_map, get_barcode_count}; \ No newline at end of file diff --git a/snapatac2-core/src/preprocessing/qc.rs b/snapatac2-core/src/preprocessing/qc.rs index 37739d65b..7befe197d 100644 --- a/snapatac2-core/src/preprocessing/qc.rs +++ b/snapatac2-core/src/preprocessing/qc.rs @@ -238,11 +238,31 @@ pub fn read_tss(file: R) -> impl Iterator { }) } +pub struct TssRegions { + pub promoters: BedTree, + pub tss: BedTree, + pub size: u64, +} + +impl TssRegions { + pub fn new>(iter: I, half_window_size: u64) -> Self { + let values: Vec<_> = iter.into_iter().collect(); + let tss = values.iter().map( |(chr, x, is_fwd)| { + let b = GenomicRange::new(chr, *x as u64, *x as u64 + 1); + (b, *is_fwd) + }).collect(); + let promoters = values.into_iter().map( |(chr, tss, is_fwd)| { + let b = GenomicRange::new(chr, tss.saturating_sub(half_window_size), tss + half_window_size + 1); + (b, is_fwd) + }).collect(); + Self { promoters, tss, size: 2 * half_window_size } + } +} -pub fn make_promoter_map>(iter: I) -> BedTree { +pub fn make_promoter_map>(iter: I, half_window_size: u64) -> BedTree { iter .map( |(chr, tss, is_fwd)| { - let b = GenomicRange::new(chr, tss.saturating_sub(2000), tss + 2001); + let b = GenomicRange::new(chr, tss.saturating_sub(half_window_size), tss + half_window_size + 1); (b, is_fwd) }).collect() } @@ -260,6 +280,78 @@ where barcodes } +pub(crate) struct TSSe<'a> { + promoters: &'a TssRegions, + counts: Vec, + n_overlapping: u64, + n_total: u64, +} + +impl<'a> TSSe<'a> { + pub fn new(promoters: &'a TssRegions) -> Self { + Self { counts: vec![0; promoters.size as usize + 1], n_overlapping: 0, n_total: 0, promoters } + } + + pub fn reset(mut self) -> Self { + self.counts.fill(0); + self.n_overlapping = 0; + self.n_total = 0; + self + } + + pub fn add(&mut self, bed: &Fragment) { + fn find_pos(tss: &mut TSSe, ins: &GenomicRange) { + tss.promoters.promoters.find(ins).map(|(entry, data)| { + let pos: u64 = + if *data { + ins.start() - entry.start() + } else { + tss.promoters.size as u64 - (entry.end() - 1 - ins.start()) + }; + pos as usize + }).for_each(|pos| tss.counts[pos] += 1); + } + + self.n_total += 1; + if self.promoters.tss.is_overlapped(bed) { + self.n_overlapping += 1; + } + match bed.strand { + None => { + let p1 = GenomicRange::new(bed.chrom(), bed.start(), bed.start() + 1); + let p2 = GenomicRange::new(bed.chrom(), bed.end() - 1, bed.end()); + find_pos(self, &p1); + find_pos(self, &p2); + }, + Some(Strand::Forward) => { + let p = GenomicRange::new(bed.chrom(), bed.start(), bed.start() + 1); + find_pos(self, &p); + }, + Some(Strand::Reverse) => { + let p = GenomicRange::new(bed.chrom(), bed.end() - 1, bed.end()); + find_pos(self, &p); + }, + } + } + + pub fn add_from(&mut self, tsse: &TSSe) { + self.n_overlapping += tsse.n_overlapping; + self.n_total += tsse.n_total; + self.counts.iter_mut().zip(tsse.counts.iter()).for_each(|(a, b)| *a += b); + } + + pub fn result(&self) -> (f64, f64) { + let counts = &self.counts; + let bg_count: f64 = + ( counts[ .. 100].iter().sum::() + + counts[3901 .. 4001].iter().sum::() ) as f64 / + 200.0 + 0.1; + let tss_enrichment = moving_average(5, &counts) + .max_by(|a, b| a.partial_cmp(b).unwrap()).unwrap().div(bg_count); + (tss_enrichment, self.n_overlapping as f64 / self.n_total as f64) + } +} + pub fn tss_enrichment(fragments: I, promoter: &BedTree) -> f64 where I: Iterator, diff --git a/snapatac2-python/Cargo.lock b/snapatac2-python/Cargo.lock index ac5bd6311..07e62ea29 100644 --- a/snapatac2-python/Cargo.lock +++ b/snapatac2-python/Cargo.lock @@ -62,9 +62,9 @@ dependencies = [ [[package]] name = "anndata" -version = "0.4.1" +version = "0.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7908551ce9d7d10e1ec90eaa697951eae25421c215c8dcb8cd8cd75270b0111f" +checksum = "0031108bbf0fa6c902d7970c8a4ab3b16aac45d878c754b8c1840e604014886a" dependencies = [ "anyhow", "flate2", @@ -3162,7 +3162,7 @@ dependencies = [ [[package]] name = "snapatac2" -version = "2.6.4" +version = "2.6.5-dev0" dependencies = [ "anndata", "anndata-hdf5", diff --git a/snapatac2-python/Cargo.toml b/snapatac2-python/Cargo.toml index d1e07b14b..6c39bfd4f 100644 --- a/snapatac2-python/Cargo.toml +++ b/snapatac2-python/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "snapatac2" -version = "2.6.4" +version = "2.6.5-dev0" edition = "2021" authors = ["Kai Zhang "] description = "Rust APIs" @@ -12,7 +12,7 @@ keywords = ["single-cell", "biology"] [dependencies] snapatac2-core = { path = "../snapatac2-core" } -anndata = "0.4.1" +anndata = "0.4.2" anndata-hdf5 = "0.3" pyanndata = "0.4.1" anyhow = "1.0" diff --git a/snapatac2-python/python/snapatac2/metrics/__init__.py b/snapatac2-python/python/snapatac2/metrics/__init__.py index ccd373aaa..2c32c0647 100644 --- a/snapatac2-python/python/snapatac2/metrics/__init__.py +++ b/snapatac2-python/python/snapatac2/metrics/__init__.py @@ -38,9 +38,11 @@ def tsse( Returns ------- - np.ndarray | list[np.ndarray] | None - If `inplace = True`, directly adds the results to `adata.obs['tsse']`. - Otherwise return the results. + tuple[np.ndarray, tuple[float, float]] | list[tuple[np.ndarray, tuple[float, float]]] | None + If `inplace = True`, cell-level TSSe scores are computed and stored in `adata.obs['tsse']`. + Library-level TSSe scores are stored in `adata.uns['library_tsse']`. + Fraction of fragments overlapping TSS are stored in `adata.uns['fraction_of_fragments_overlapping_TSS']`. + If `inplace = False`, return a tuple containing all these values. Examples -------- @@ -64,13 +66,16 @@ def tsse( n_jobs=n_jobs, ) else: - result = np.array(internal.tss_enrichment(adata, gene_anno, exclude_chroms)) + tsse, (library_tsse, fr_tss) = internal.tss_enrichment(adata, gene_anno, exclude_chroms) + tsse = np.array(tsse) if inplace: - adata.obs["tsse"] = result + adata.obs["tsse"] = tsse + adata.uns['library_tsse'] = library_tsse + adata.uns['fraction_of_fragments_overlapping_TSS'] = fr_tss if inplace: return None else: - return result + return tsse, (library_tsse, fr_tss) def frip( adata: internal.AnnData | list[internal.AnnData], diff --git a/snapatac2-python/python/snapatac2/preprocessing/_basic.py b/snapatac2-python/python/snapatac2/preprocessing/_basic.py index b1f479503..316e6f05d 100644 --- a/snapatac2-python/python/snapatac2/preprocessing/_basic.py +++ b/snapatac2-python/python/snapatac2/preprocessing/_basic.py @@ -29,7 +29,7 @@ def make_fragment_file( compression: Literal["gzip", "zstandard"] | None = None, compression_level: int | None = None, tempdir: Path | None = None, -) -> internal.PyFlagStat: +) -> dict[str, float]: """ Convert a BAM file to a fragment file. @@ -101,8 +101,25 @@ def make_fragment_file( Returns ------- - PyFlagStat - Various statistics. + dict[str, float] + A dictionary containing the following metrics: + - Sequenced_reads: number of reads in the input BAM file. + - Sequenced_read_pairs: number of read pairs in the input BAM file. + - Fraction_duplicates: Fraction of high-quality read pairs that are deemed + to be PCR duplicates. This metric is a measure of + sequencing saturation and is a function of library + complexity and sequencing depth. More specifically, + this is the fraction of high-quality fragments with a + valid barcode that align to the same genomic position + as another read pair in the library. + - Fraction_unmapped_reads: Fraction of sequenced reads that have + a valid barcode but could not be mapped to the genome. + - Fraction_unmapped_read_pairs: Fraction of sequenced read pairs that have + a valid barcode but could not be mapped to the genome. + - Raw_reads_per_cell: Total number of reads divided by the number of cell barcodes. + - Raw_read_pairs_per_cell: Total number of read pairs divided by the number of cell barcodes. + - Fraction_reads_in_cell: fraction of reads that are associated with a cell barcode. + - Fraction_read_pairs_in_cell: fraction of read pairs that are associated with a cell barcode. See Also -------- diff --git a/snapatac2-python/src/lib.rs b/snapatac2-python/src/lib.rs index bc58c72cc..45f4a83dd 100644 --- a/snapatac2-python/src/lib.rs +++ b/snapatac2-python/src/lib.rs @@ -37,7 +37,6 @@ fn _snapatac2(m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_function(wrap_pyfunction!(motif::read_motifs, m)?)?; // Preprocessing related functions - m.add_class::().unwrap(); m.add_function(wrap_pyfunction!(preprocessing::make_fragment_file, m)?)?; m.add_function(wrap_pyfunction!(preprocessing::import_fragments, m)?)?; m.add_function(wrap_pyfunction!(preprocessing::import_contacts, m)?)?; diff --git a/snapatac2-python/src/preprocessing.rs b/snapatac2-python/src/preprocessing.rs index 2497b4ee2..2415e6cc0 100644 --- a/snapatac2-python/src/preprocessing.rs +++ b/snapatac2-python/src/preprocessing.rs @@ -6,6 +6,7 @@ use anndata::Backend; use anndata_hdf5::H5; use snapatac2_core::preprocessing::count_data::TranscriptParserOptions; use snapatac2_core::preprocessing::count_data::CountingStrategy; +use std::collections::HashMap; use std::io::{BufRead, BufReader}; use std::path::PathBuf; use std::{str::FromStr, collections::BTreeMap, ops::Deref, collections::HashSet}; @@ -15,25 +16,10 @@ use pyanndata::PyAnnData; use anyhow::Result; use snapatac2_core::{ - preprocessing::{Fragment, Contact, FlagStat, SnapData}, + preprocessing::{Fragment, Contact, SnapData}, preprocessing, utils, }; -#[pyclass] -pub(crate) struct PyFlagStat(FlagStat); - -#[pymethods] -impl PyFlagStat { - #[getter] - fn num_reads(&self) -> u64 { self.0.read } - - fn __repr__(&self) -> String { - format!("{:?}", self.0) - } - - fn __str__(&self) -> String { self.__repr__() } -} - #[pyfunction] pub(crate) fn make_fragment_file( bam_file: PathBuf, @@ -50,7 +36,7 @@ pub(crate) fn make_fragment_file( compression: Option<&str>, compression_level: Option, temp_dir: Option, -) -> Result +) -> Result> { fn parse_tag(tag: &str) -> [u8; 2] { let tag_b = tag.as_bytes(); @@ -67,7 +53,7 @@ pub(crate) fn make_fragment_file( shift_left, shift_right, mapq, chunk_size, compression.map(|x| utils::Compression::from_str(x).unwrap()), compression_level, temp_dir, )?; - Ok(PyFlagStat(stat)) + Ok(stat.report()) } #[pyfunction] @@ -345,7 +331,7 @@ pub(crate) fn tss_enrichment( anndata: AnnDataLike, gtf_file: PathBuf, exclude_chroms: Option>, -) -> Result> +) -> Result<(Vec, (f64, f64))> { let exclude_chroms = match exclude_chroms { Some(chrs) => chrs.into_iter().collect(), @@ -354,7 +340,7 @@ pub(crate) fn tss_enrichment( let tss = preprocessing::read_tss(utils::open_file_for_read(gtf_file)) .unique() .filter(|(chr, _, _)| !exclude_chroms.contains(chr)); - let promoters = preprocessing::make_promoter_map(tss); + let promoters = preprocessing::TssRegions::new(tss, 2000); macro_rules! run { ($data:expr) => {