Skip to content

Commit

Permalink
rename export_bed to export_fragments
Browse files Browse the repository at this point in the history
  • Loading branch information
kaizhang committed Oct 23, 2023
1 parent f022a41 commit f842368
Show file tree
Hide file tree
Showing 12 changed files with 70 additions and 21 deletions.
3 changes: 3 additions & 0 deletions docs/_static/images/func+import_data.svg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
2 changes: 1 addition & 1 deletion docs/api/export.rst
Original file line number Diff line number Diff line change
Expand Up @@ -6,5 +6,5 @@ Exporting: `ex`
.. autosummary::
:toctree: _autosummary

ex.export_bed
ex.export_fragments
ex.export_bigwig
4 changes: 4 additions & 0 deletions docs/changelog.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,10 @@ Release Notes

## Development version (unreleased)

### Function renaming:

- `ex.export_bed` is renamed to `ex.export_fragments`.

### Features:

- Add zstandard file support at various places.
Expand Down
2 changes: 1 addition & 1 deletion snapatac2-core/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ keywords = ["single-cell", "biology"]
[dependencies]
anndata = { git = "https://github.com/kaizhang/anndata-rs.git", rev = "730eea8495ba1884ea50308e5f03ab0cb60f9ae8" }
anyhow = "1.0"
bigtools = "0.2"
bigtools = { version = "0.2", features = ["read", "write"] }
bincode = "1.3"
bed-utils = { git = "https://github.com/kaizhang/bed-utils.git", rev = "d06f2b4ff189c64bc0d3f7054f4ee8d0797368ca" }
extsort = "0.4"
Expand Down
20 changes: 15 additions & 5 deletions snapatac2-core/src/export.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
use crate::{preprocessing::count_data::{SnapData, GenomeCoverage, CoverageType}, utils::open_file_for_write};
use crate::{preprocessing::{count_data::{SnapData, GenomeCoverage, CoverageType, ChromSizes}, Fragment}, utils::open_file_for_write};

use anyhow::{Context, Result, ensure};
use itertools::Itertools;
Expand All @@ -8,15 +8,15 @@ use std::{
path::{Path, PathBuf}, collections::{BTreeMap, HashMap, HashSet},
};
use rayon::iter::{ParallelBridge, ParallelIterator};
use bed_utils::bed::{BEDLike, BedGraph, merge_sorted_bed_with};
use bed_utils::bed::{BEDLike, BedGraph, merge_sorted_bed_with, GenomicRange, tree::{GenomeRegions, SparseBinnedCoverage}};
use bigtools::{bbi::bigwigwrite::BigWigWrite, bed::bedparser::BedParser};
use futures::executor::ThreadPool;
use indicatif::{ProgressIterator, style::ProgressStyle};

impl<T> Exporter for T where T: SnapData {}

pub trait Exporter: SnapData {
fn export_bed<P: AsRef<Path>>(
fn export_fragments<P: AsRef<Path>>(
&self,
barcodes: Option<&Vec<&str>>,
group_by: &Vec<&str>,
Expand Down Expand Up @@ -181,8 +181,18 @@ where
}).collect()
}

//fn create_bedgraph_from_fragments() {
//}
fn create_bedgraph_from_fragments<I>(fragments: I, chrom_sizes: &ChromSizes, bin_size: u64)
where
I: Iterator<Item = Fragment>,
{
let genome: GenomeRegions<GenomicRange> = chrom_sizes.into_iter()
.map(|(k, v)| GenomicRange::new(k, 0, *v)).collect();
let mut coverage = SparseBinnedCoverage::new(&genome, bin_size);
fragments.for_each(|frag| frag.to_reads().into_iter().for_each(|read|
coverage.insert(&read, 1.0)
));
todo!()
}

/// Create a bigwig file from BedGraph records.
fn create_bigwig_from_bedgraph<P: AsRef<Path>>(
Expand Down
9 changes: 9 additions & 0 deletions snapatac2-core/src/preprocessing/count_data/genome.rs
Original file line number Diff line number Diff line change
Expand Up @@ -376,6 +376,15 @@ where
}
}

impl<'a> IntoIterator for &'a ChromSizes {
type Item = (&'a String, &'a u64);
type IntoIter = indexmap::map::Iter<'a, String, u64>;

fn into_iter(self) -> Self::IntoIter {
self.0.iter()
}
}

impl IntoIterator for ChromSizes {
type Item = (String, u64);
type IntoIter = indexmap::map::IntoIter<String, u64>;
Expand Down
4 changes: 2 additions & 2 deletions snapatac2-python/snapatac2/export/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

import snapatac2._snapatac2 as internal

def export_bed(
def export_fragments(
adata: internal.AnnData | internal.AnnDataSet,
groupby: str | list[str],
selections: list[str] | None = None,
Expand Down Expand Up @@ -66,7 +66,7 @@ def export_bed(
else:
raise ValueError("Compression type must be: gzip or zstandard.")

return internal.export_bed(
return internal.export_fragments(
adata, list(ids), list(groupby), out_dir, prefix, suffix, selections, compression, compression_level,
)

Expand Down
37 changes: 30 additions & 7 deletions snapatac2-python/snapatac2/preprocessing/_basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,13 +118,35 @@ def import_data(
) -> internal.AnnData:
"""Import data fragment files and compute basic QC metrics.
A fragment is defined as the sequencing output corresponding to one location in the genome.
If single-ended sequencing is performed, one read is considered a fragment.
If paired-ended sequencing is performed, one pair of reads is considered a fragment.
This function takes input fragment files and computes basic QC metrics, including
TSSe, number of unique fragments, duplication rate, fraction of mitochondrial DNA.
The fragments will be stored in `.obsm['fragment_single']` if they are single-ended.
Otherwise, they will be stored in `.obsm['fragment_paired']`.
A fragment refers to the sequence data originating from a distinct location
in the genome. In single-ended sequencing, one read equates to a fragment.
However, in paired-ended sequencing, a fragment is defined by a pair of reads.
This function is designed to handle, store, and process input files with
fragment data, further yielding a range of basic Quality Control (QC) metrics.
These metrics include TSSe (Transcription Start Site enrichment), the total
number of unique fragments, duplication rates, and the percentage of
mitochondrial DNA detected.
How fragments are stored is dependent on the sequencing approach utilized.
For single-ended sequencing, fragments are found in `.obsm['fragment_single']`.
In contrast, for paired-ended sequencing, they are located in
`.obsm['fragment_paired']`.
Diving deeper technically, the fragments are internally structured within a
Compressed Sparse Row (CSR) matrix. In this configuration, each row signifies
a specific cell, while each column represents a unique genomic position.
Fragment starting positions dictate the column indices. Matrix values
capture the lengths of the fragments for paired-end reads or the lengths of
the reads for single-ended scenarios. It's important to note that for
single-ended reads, the values are signed, with the sign providing information
on the fragment's strand orientation. Additionally, it is worth noting that
cells may harbor duplicate fragments, leading to the presence of duplicate
column indices within the matrix. As a result, the matrix deviates from
the standard CSR format, and it is not advisable to use the matrix for linear
algebra operations.
.. image:: /_static/images/func+import_data.svg
:align: center
Note
----
Expand Down Expand Up @@ -195,6 +217,7 @@ def import_data(
See Also
--------
make_fragment_file
:func:`~snapatac2.ex.export_fragments`
Examples
--------
Expand Down
4 changes: 2 additions & 2 deletions snapatac2-python/src/export.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ use std::{collections::{HashSet, HashMap}, path::PathBuf};
use anyhow::Result;

#[pyfunction]
pub fn export_bed(
pub fn export_fragments(
anndata: AnnDataLike,
barcodes: Vec<&str>,
group_by: Vec<&str>,
Expand All @@ -22,7 +22,7 @@ pub fn export_bed(
) -> Result<HashMap<String, PathBuf>> {
macro_rules! run {
($data:expr) => {
$data.export_bed(
$data.export_fragments(
Some(&barcodes), &group_by, selections, dir, prefix, suffix, compression, compression_level
)
}
Expand Down
2 changes: 1 addition & 1 deletion snapatac2-python/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ fn _snapatac2(_py: Python, m: &PyModule) -> PyResult<()> {
m.add_function(wrap_pyfunction!(preprocessing::add_frip, m)?)?;
m.add_function(wrap_pyfunction!(preprocessing::fragment_size_distribution, m)?)?;

m.add_function(wrap_pyfunction!(export::export_bed, m)?)?;
m.add_function(wrap_pyfunction!(export::export_fragments, m)?)?;
m.add_function(wrap_pyfunction!(export::export_bigwig, m)?)?;

m.add_function(wrap_pyfunction!(call_peaks::export_tags, m)?)?;
Expand Down
2 changes: 1 addition & 1 deletion snapatac2-python/tests/test_import.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,6 @@ def test_in_memory():
)

data.obs['group'] = '1'
snap.ex.export_bed(data, groupby="group", suffix='.bed.gz')
snap.ex.export_fragments(data, groupby="group", suffix='.bed.gz')

assert read_bed("1.bed.gz") == read_bed(fragment_file)
2 changes: 1 addition & 1 deletion snapatac2-python/tests/test_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ def pipeline(data):

snap.pp.make_gene_matrix(data, gene_anno=snap.genome.hg38)

snap.ex.export_bed(data, groupby="leiden")
snap.ex.export_fragments(data, groupby="leiden")

def test_backed(tmp_path):
fragment_file = snap.datasets.pbmc500(downsample=True)
Expand Down

0 comments on commit f842368

Please sign in to comment.