rename export_bed to export_fragments

kaizhang · Oct 23, 2023 · f842368 · f842368
1 parent f022a41
commit f842368
Show file tree

Hide file tree

Showing 12 changed files with 70 additions and 21 deletions.
diff --git a/docs/_static/images/func+import_data.svg b/docs/_static/images/func+import_data.svg
diff --git a/docs/api/export.rst b/docs/api/export.rst
@@ -6,5 +6,5 @@ Exporting: `ex`
 .. autosummary::
     :toctree: _autosummary
 
-    ex.export_bed
+    ex.export_fragments
     ex.export_bigwig
diff --git a/docs/changelog.md b/docs/changelog.md
@@ -3,6 +3,10 @@ Release Notes
 
 ## Development version (unreleased)
 
+### Function renaming:
+
+- `ex.export_bed` is renamed to `ex.export_fragments`.
+
 ### Features:
 
 - Add zstandard file support at various places.

diff --git a/snapatac2-core/Cargo.toml b/snapatac2-core/Cargo.toml
@@ -13,7 +13,7 @@ keywords = ["single-cell", "biology"]
 [dependencies]
 anndata = { git = "https://github.com/kaizhang/anndata-rs.git", rev = "730eea8495ba1884ea50308e5f03ab0cb60f9ae8" }
 anyhow = "1.0"
-bigtools = "0.2"
+bigtools = { version = "0.2", features = ["read", "write"] }
 bincode = "1.3"
 bed-utils = { git = "https://github.com/kaizhang/bed-utils.git", rev = "d06f2b4ff189c64bc0d3f7054f4ee8d0797368ca" }
 extsort = "0.4"

diff --git a/snapatac2-core/src/export.rs b/snapatac2-core/src/export.rs
@@ -1,4 +1,4 @@
-use crate::{preprocessing::count_data::{SnapData, GenomeCoverage, CoverageType}, utils::open_file_for_write};
+use crate::{preprocessing::{count_data::{SnapData, GenomeCoverage, CoverageType, ChromSizes}, Fragment}, utils::open_file_for_write};
 
 use anyhow::{Context, Result, ensure};
 use itertools::Itertools;
@@ -8,15 +8,15 @@ use std::{
     path::{Path, PathBuf}, collections::{BTreeMap, HashMap, HashSet},
 };
 use rayon::iter::{ParallelBridge, ParallelIterator};
-use bed_utils::bed::{BEDLike, BedGraph, merge_sorted_bed_with};
+use bed_utils::bed::{BEDLike, BedGraph, merge_sorted_bed_with, GenomicRange, tree::{GenomeRegions, SparseBinnedCoverage}};
 use bigtools::{bbi::bigwigwrite::BigWigWrite, bed::bedparser::BedParser};
 use futures::executor::ThreadPool;
 use indicatif::{ProgressIterator, style::ProgressStyle};
 
 impl<T> Exporter for T where T: SnapData {}
 
 pub trait Exporter: SnapData {
-    fn export_bed<P: AsRef<Path>>(
+    fn export_fragments<P: AsRef<Path>>(
         &self,
         barcodes: Option<&Vec<&str>>,
         group_by: &Vec<&str>,
@@ -181,8 +181,18 @@ where
     }).collect()
 }
 
-//fn create_bedgraph_from_fragments() {
-//}
+fn create_bedgraph_from_fragments<I>(fragments: I, chrom_sizes: &ChromSizes, bin_size: u64)
+where
+    I: Iterator<Item = Fragment>,
+{
+    let genome: GenomeRegions<GenomicRange> = chrom_sizes.into_iter()
+        .map(|(k, v)| GenomicRange::new(k, 0, *v)).collect();
+    let mut coverage = SparseBinnedCoverage::new(&genome, bin_size);
+    fragments.for_each(|frag| frag.to_reads().into_iter().for_each(|read|
+        coverage.insert(&read, 1.0)
+    ));
+    todo!()
+}
 
 /// Create a bigwig file from BedGraph records.
 fn create_bigwig_from_bedgraph<P: AsRef<Path>>(

diff --git a/snapatac2-core/src/preprocessing/count_data/genome.rs b/snapatac2-core/src/preprocessing/count_data/genome.rs
@@ -376,6 +376,15 @@ where
     }
 }
 
+impl<'a> IntoIterator for &'a ChromSizes {
+    type Item = (&'a String, &'a u64);
+    type IntoIter = indexmap::map::Iter<'a, String, u64>;
+
+    fn into_iter(self) -> Self::IntoIter {
+        self.0.iter()
+    }
+}
+
 impl IntoIterator for ChromSizes {
     type Item = (String, u64);
     type IntoIter = indexmap::map::IntoIter<String, u64>;

diff --git a/snapatac2-python/snapatac2/export/__init__.py b/snapatac2-python/snapatac2/export/__init__.py
@@ -6,7 +6,7 @@
 
 import snapatac2._snapatac2 as internal
 
-def export_bed(
+def export_fragments(
     adata: internal.AnnData | internal.AnnDataSet,
     groupby: str | list[str],
     selections: list[str] | None = None,
@@ -66,7 +66,7 @@ def export_bed(
         else:
             raise ValueError("Compression type must be: gzip or zstandard.")
 
-    return internal.export_bed(
+    return internal.export_fragments(
         adata, list(ids), list(groupby), out_dir, prefix, suffix, selections, compression, compression_level,
     )
 

diff --git a/snapatac2-python/snapatac2/preprocessing/_basic.py b/snapatac2-python/snapatac2/preprocessing/_basic.py
@@ -118,13 +118,35 @@ def import_data(
 ) -> internal.AnnData:
     """Import data fragment files and compute basic QC metrics.
 
-    A fragment is defined as the sequencing output corresponding to one location in the genome.
-    If single-ended sequencing is performed, one read is considered a fragment.
-    If paired-ended sequencing is performed, one pair of reads is considered a fragment.
-    This function takes input fragment files and computes basic QC metrics, including
-    TSSe, number of unique fragments, duplication rate, fraction of mitochondrial DNA.
-    The fragments will be stored in `.obsm['fragment_single']` if they are single-ended.
-    Otherwise, they will be stored in `.obsm['fragment_paired']`.
+    A fragment refers to the sequence data originating from a distinct location
+    in the genome. In single-ended sequencing, one read equates to a fragment.
+    However, in paired-ended sequencing, a fragment is defined by a pair of reads.
+    This function is designed to handle, store, and process input files with
+    fragment data, further yielding a range of basic Quality Control (QC) metrics.
+    These metrics include TSSe (Transcription Start Site enrichment), the total
+    number of unique fragments, duplication rates, and the percentage of
+    mitochondrial DNA detected.
+
+    How fragments are stored is dependent on the sequencing approach utilized.
+    For single-ended sequencing, fragments are found in `.obsm['fragment_single']`.
+    In contrast, for paired-ended sequencing, they are located in
+    `.obsm['fragment_paired']`.
+
+    Diving deeper technically, the fragments are internally structured within a
+    Compressed Sparse Row (CSR) matrix. In this configuration, each row signifies
+    a specific cell, while each column represents a unique genomic position.
+    Fragment starting positions dictate the column indices. Matrix values
+    capture the lengths of the fragments for paired-end reads or the lengths of
+    the reads for single-ended scenarios. It's important to note that for
+    single-ended reads, the values are signed, with the sign providing information
+    on the fragment's strand orientation. Additionally, it is worth noting that
+    cells may harbor duplicate fragments, leading to the presence of duplicate
+    column indices within the matrix. As a result, the matrix deviates from
+    the standard CSR format, and it is not advisable to use the matrix for linear
+    algebra operations.
+    
+    .. image:: /_static/images/func+import_data.svg
+        :align: center
 
     Note
     ----
@@ -195,6 +217,7 @@ def import_data(
     See Also
     --------
     make_fragment_file
+    :func:`~snapatac2.ex.export_fragments` 
 
     Examples
     --------

diff --git a/snapatac2-python/src/export.rs b/snapatac2-python/src/export.rs
@@ -9,7 +9,7 @@ use std::{collections::{HashSet, HashMap}, path::PathBuf};
 use anyhow::Result;
 
 #[pyfunction]
-pub fn export_bed(
+pub fn export_fragments(
     anndata: AnnDataLike,
     barcodes: Vec<&str>,
     group_by: Vec<&str>,
@@ -22,7 +22,7 @@ pub fn export_bed(
 ) -> Result<HashMap<String, PathBuf>> {
     macro_rules! run {
         ($data:expr) => {
-            $data.export_bed(
+            $data.export_fragments(
                 Some(&barcodes), &group_by, selections, dir, prefix, suffix, compression, compression_level
             )
         }

diff --git a/snapatac2-python/src/lib.rs b/snapatac2-python/src/lib.rs
@@ -49,7 +49,7 @@ fn _snapatac2(_py: Python, m: &PyModule) -> PyResult<()> {
     m.add_function(wrap_pyfunction!(preprocessing::add_frip, m)?)?;
     m.add_function(wrap_pyfunction!(preprocessing::fragment_size_distribution, m)?)?;
 
-    m.add_function(wrap_pyfunction!(export::export_bed, m)?)?;
+    m.add_function(wrap_pyfunction!(export::export_fragments, m)?)?;
     m.add_function(wrap_pyfunction!(export::export_bigwig, m)?)?;
 
     m.add_function(wrap_pyfunction!(call_peaks::export_tags, m)?)?;

diff --git a/snapatac2-python/tests/test_import.py b/snapatac2-python/tests/test_import.py
@@ -23,6 +23,6 @@ def test_in_memory():
     )
 
     data.obs['group'] = '1'
-    snap.ex.export_bed(data, groupby="group", suffix='.bed.gz')
+    snap.ex.export_fragments(data, groupby="group", suffix='.bed.gz')
 
     assert read_bed("1.bed.gz") == read_bed(fragment_file)
diff --git a/snapatac2-python/tests/test_pipeline.py b/snapatac2-python/tests/test_pipeline.py
@@ -48,7 +48,7 @@ def pipeline(data):
 
     snap.pp.make_gene_matrix(data, gene_anno=snap.genome.hg38)
 
-    snap.ex.export_bed(data, groupby="leiden")
+    snap.ex.export_fragments(data, groupby="leiden")
 
 def test_backed(tmp_path):
     fragment_file = snap.datasets.pbmc500(downsample=True)