Skip to content

Commit

Permalink
add import_contacts
Browse files Browse the repository at this point in the history
  • Loading branch information
kaizhang committed Aug 26, 2024
1 parent 69fd3c0 commit 83eb5be
Show file tree
Hide file tree
Showing 11 changed files with 56 additions and 73 deletions.
1 change: 1 addition & 0 deletions docs/api/preprocessing.rst
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ BAM/Fragment file processing

pp.make_fragment_file
pp.import_data
pp.import_contacts
pp.call_cells

Matrix operation
Expand Down
1 change: 1 addition & 0 deletions docs/changelog.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
- `pp.make_fragment_file` can now work with 10X BAM files.
- Add `pp.call_cells` to identify nonempty barcodes from raw data.
- Add `pp.recipe_10x_metrics` to compute 10X metrics.
- Add `pp.import_contacts` to process scHi-C data.

## Release 2.6.4 (released May 28, 2024)

Expand Down
3 changes: 3 additions & 0 deletions docs/tutorials/hic.ipynb
Git LFS file not shown
20 changes: 0 additions & 20 deletions snapatac2-core/src/preprocessing/count_data.rs
Original file line number Diff line number Diff line change
Expand Up @@ -49,8 +49,6 @@ pub trait SnapData: AnnDataOp {
fn get_count_iter(&self, chunk_size: usize) ->
Result<GenomeCount<Box<dyn ExactSizeIterator<Item = (FragmentType, usize, usize)>>>>;

fn contact_count_iter(&self, chunk_size: usize) -> Result<ContactMap<Self::CountIter>>;

/// Read counts stored in the `X` matrix.
fn read_chrom_values(
&self,
Expand Down Expand Up @@ -120,13 +118,6 @@ impl<B: Backend> SnapData for AnnData<B> {
Ok(GenomeCount::new(self.read_chrom_sizes()?, matrices))
}

fn contact_count_iter(&self, chunk_size: usize) -> Result<ContactMap<Self::CountIter>> {
Ok(ContactMap::new(
self.read_chrom_sizes()?,
self.obsm().get_item_iter("contact", chunk_size).unwrap(),
))
}

fn fragment_size_distribution(&self, max_size: usize) -> Result<Vec<usize>> {
if let Some(fragment) = self.obsm().get_item_iter("fragment_paired", 500) {
Ok(qc::fragment_size_distribution(fragment.map(|x| x.0), max_size))
Expand Down Expand Up @@ -155,17 +146,6 @@ impl<B: Backend> SnapData for AnnDataSet<B> {
Ok(GenomeCount::new(self.read_chrom_sizes()?, matrices))
}

fn contact_count_iter(&self, chunk_size: usize) -> Result<ContactMap<Self::CountIter>> {
Ok(ContactMap::new(
self.read_chrom_sizes()?,
self.adatas()
.inner()
.get_obsm()
.get_item_iter("contact", chunk_size)
.unwrap(),
))
}

fn fragment_size_distribution(&self, max_size: usize) -> Result<Vec<usize>> {
if let Some(fragment) = self.adatas().inner().get_obsm().get_item_iter("fragment_paired", 500) {
Ok(qc::fragment_size_distribution(fragment.map(|x| x.0), max_size))
Expand Down
14 changes: 7 additions & 7 deletions snapatac2-core/src/preprocessing/count_data/coverage.rs
Original file line number Diff line number Diff line change
Expand Up @@ -371,7 +371,7 @@ pub struct ContactMap<I> {

impl<I> ContactMap<I>
where
I: ExactSizeIterator<Item = (CsrMatrix<u8>, usize, usize)>,
I: Iterator<Item = CsrMatrix<u32>>,
{
pub fn new(chrom_sizes: ChromSizes, coverage: I) -> Self {
Self {
Expand All @@ -394,21 +394,21 @@ where
/// Output the raw coverage matrix.
pub fn into_values<T: Zero + FromPrimitive + AddAssign + Send>(
self,
) -> impl ExactSizeIterator<Item = (CsrMatrix<T>, usize, usize)> {
) -> impl Iterator<Item = CsrMatrix<T>> {
let index = self.get_gindex();
let ori_index = self.index;
let genome_size = ori_index.len();
let new_size = index.len();
self.coverage.map(move |(mat, i, j)| {
self.coverage.map(move |mat| {
let new_mat = if self.resolution <= 1 {
let (pattern, data) = mat.into_pattern_and_values();
let new_data = data
.into_iter()
.map(|x| T::from_u8(x).unwrap())
.map(|x| T::from_u32(x).unwrap())
.collect::<Vec<_>>();
CsrMatrix::try_from_pattern_and_values(pattern, new_data).unwrap()
} else {
let n = j - i;
let n = mat.nrows();
let vec = (0..n)
.into_par_iter()
.map(|k| {
Expand All @@ -425,7 +425,7 @@ where
let i1 = index.get_position_rev(locus1.chrom(), locus1.start());
let i2 = index.get_position_rev(locus2.chrom(), locus2.start());
let i = i1 * new_size + i2;
let val = T::from_u8(*val).unwrap();
let val = T::from_u32(*val).unwrap();
*count.entry(i).or_insert(Zero::zero()) += val;
});
count.into_iter().collect::<Vec<_>>()
Expand All @@ -434,7 +434,7 @@ where
let (r, c, offset, ind, data) = to_csr_data(vec, new_size * new_size);
CsrMatrix::try_from_csr_data(r,c,offset,ind, data).unwrap()
};
(new_mat, i, j)
new_mat
})
}
}
56 changes: 29 additions & 27 deletions snapatac2-core/src/preprocessing/count_data/import.rs
Original file line number Diff line number Diff line change
Expand Up @@ -193,6 +193,7 @@ pub fn import_contacts<A, B, I>(
anndata: &A,
contacts: I,
regions: &GenomeRegions<B>,
bin_size: usize,
chunk_size: usize,
) -> Result<()>
where
Expand All @@ -217,39 +218,40 @@ where
.unwrap(),
);
let mut scanned_barcodes = IndexSet::new();
anndata.obsm().add_iter(
"contact",
contacts
.chunk_by(|x| x.barcode.clone())
.into_iter()
.progress_with(spinner)
.chunks(chunk_size)
.into_iter()
.map(|chunk| {
let data: Vec<Vec<Contact>> = chunk.map(|(barcode, x)| {
if !scanned_barcodes.insert(barcode.clone()) {
panic!("Please sort fragment file by barcodes");
}
x.collect()
}).collect();
let binding = contacts.chunk_by(|x| x.barcode.clone());
let binding2 = binding.into_iter().progress_with(spinner).chunks(chunk_size);
let binding3 = binding2
.into_iter()
.map(|chunk| {
let data: Vec<Vec<Contact>> = chunk.map(|(barcode, x)| {
if !scanned_barcodes.insert(barcode.clone()) {
panic!("Please sort fragment file by barcodes");
}
x.collect()
}).collect();

let counts: Vec<_> = data
.into_par_iter()
.map(|x| {
let mut count = BTreeMap::new();
x.into_iter().for_each(|c| {
let counts: Vec<_> = data
.into_par_iter()
.map(|x| {
let mut count = BTreeMap::new();
x.into_iter().for_each(|c| {
if genome_index.contain_chrom(&c.chrom1) && genome_index.contain_chrom(&c.chrom2) {
let pos1 = genome_index.get_position_rev(&c.chrom1, c.start1);
let pos2 = genome_index.get_position_rev(&c.chrom2, c.start2);
let i = pos1 * genome_size + pos2;
count.entry(i).and_modify(|x| *x += c.count).or_insert(c.count);
});
count.into_iter().collect::<Vec<_>>()
}).collect();
}
});
count.into_iter().collect::<Vec<_>>()
}).collect();

let (r, c, offset, ind, data) = to_csr_data(counts, genome_size*genome_size);
CsrMatrix::try_from_csr_data(r, c, offset, ind, data).unwrap()
}),
)?;
let (r, c, offset, ind, data) = to_csr_data(counts, genome_size*genome_size);
CsrMatrix::try_from_csr_data(r, c, offset, ind, data).unwrap()
});
let contact_map = super::ContactMap::new(chrom_sizes, binding3).with_resolution(bin_size);

anndata.set_x_from_iter(contact_map.into_values::<u32>())?;
anndata.set_var_names(anndata.n_vars().into())?;

anndata.uns().add(
"reference_sequences",
Expand Down
2 changes: 1 addition & 1 deletion snapatac2-core/src/preprocessing/count_data/matrix.rs
Original file line number Diff line number Diff line change
Expand Up @@ -197,4 +197,4 @@ where
_ => panic!("id_type must be 'transcript' or 'gene'"),
}
Ok(())
}
}
6 changes: 4 additions & 2 deletions snapatac2-python/python/snapatac2/datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,8 @@ def datasets():

# Genome files
"gencode_v41_GRCh37.gff3.gz": "sha256:df96d3f0845127127cc87c729747ae39bc1f4c98de6180b112e71dda13592673",
"gencode_v41_GRCh37.fa.gz": "sha256:94330d402e53cf39a1fef6c132e2500121909c2dfdce95cc31d541404c0ed39e",
"gencode_v41_GRCh37.fa.gz": "sha256:ac73947d38df63ccb00724520a5c31d880c1ca423702ca7ccb7e6c2182a362d9",
#"gencode_v41_GRCh37.fa.gz": "sha256:94330d402e53cf39a1fef6c132e2500121909c2dfdce95cc31d541404c0ed39e",
"gencode_v41_GRCh38.gff3.gz": "sha256:b82a655bdb736ca0e463a8f5d00242bedf10fa88ce9d651a017f135c7c4e9285",
"gencode_v41_GRCh38.fa.gz": "sha256:4fac949d7021cbe11117ddab8ec1960004df423d672446cadfbc8cca8007e228",
"gencode_vM25_GRCm38.gff3.gz": "sha256:e8ed48bef6a44fdf0db7c10a551d4398aa341318d00fbd9efd69530593106846",
Expand Down Expand Up @@ -68,7 +69,8 @@ def datasets():
"Meuleman_2020.meme": "https://osf.io/download/6uet5/",

"gencode_v41_GRCh37.gff3.gz": "https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_41/GRCh37_mapping/gencode.v41lift37.basic.annotation.gff3.gz",
"gencode_v41_GRCh37.fa.gz": "https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_41/GRCh37_mapping/GRCh37.primary_assembly.genome.fa.gz",
"gencode_v41_GRCh37.fa.gz": "https://hgdownload.soe.ucsc.edu/goldenPath/hg19/bigZips/hg19.fa.gz",
#"gencode_v41_GRCh37.fa.gz": "https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_41/GRCh37_mapping/GRCh37.primary_assembly.genome.fa.gz",
"gencode_v41_GRCh38.gff3.gz": "https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_41/gencode.v41.basic.annotation.gff3.gz",
"gencode_v41_GRCh38.fa.gz": "https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_41/GRCh38.primary_assembly.genome.fa.gz",
"gencode_vM25_GRCm38.gff3.gz": "https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_mouse/release_M25/gencode.vM25.basic.annotation.gff3.gz",
Expand Down
9 changes: 6 additions & 3 deletions snapatac2-python/python/snapatac2/preprocessing/_basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
from snapatac2.genome import Genome

__all__ = ['make_fragment_file', 'import_data', 'import_contacts', 'add_tile_matrix',
'make_peak_matrix', 'call_cells', 'filter_cells', 'select_features', 'make_gene_matrix'
'make_peak_matrix', 'call_cells', 'filter_cells', 'select_features', 'make_gene_matrix',
]

def make_fragment_file(
Expand Down Expand Up @@ -346,7 +346,8 @@ def import_contacts(
genome: Genome | None = None,
chrom_size: dict[str, int] | None = None,
sorted_by_barcode: bool = True,
chunk_size: int = 2000,
bin_size: int = 500000,
chunk_size: int = 200,
tempdir: Path | None = None,
backend: Literal['hdf5'] = 'hdf5',
) -> internal.AnnData:
Expand Down Expand Up @@ -374,6 +375,8 @@ def import_contacts(
If `sorted_by_barcode == True`, this function makes use of small fixed amout of
memory. If `sorted_by_barcode == False` and `low_memory == False`,
all data will be kept in memory. See `low_memory` for more details.
bin_size
The size of consecutive genomic regions used to record the counts.
chunk_size
Increasing the chunk_size speeds up I/O but uses more memory.
tempdir
Expand All @@ -395,7 +398,7 @@ def import_contacts(

adata = AnnData() if file is None else internal.AnnData(filename=file, backend=backend)
internal.import_contacts(
adata, contact_file, chrom_size, sorted_by_barcode, chunk_size, tempdir
adata, contact_file, chrom_size, sorted_by_barcode, bin_size, chunk_size, tempdir
)
return adata

Expand Down
5 changes: 3 additions & 2 deletions snapatac2-python/src/preprocessing.rs
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,7 @@ pub(crate) fn import_contacts(
contact_file: PathBuf,
chrom_size: BTreeMap<String, u64>,
fragment_is_sorted_by_name: bool,
bin_size: usize,
chunk_size: usize,
tempdir: Option<PathBuf>,
) -> Result<()>
Expand Down Expand Up @@ -171,7 +172,7 @@ pub(crate) fn import_contacts(

macro_rules! run {
($data:expr) => {
preprocessing::import_contacts($data, sorted_contacts, &chrom_sizes, chunk_size)?
preprocessing::import_contacts($data, sorted_contacts, &chrom_sizes, bin_size, chunk_size)?
};
}

Expand Down Expand Up @@ -399,4 +400,4 @@ pub(crate) fn fragment_size_distribution(
}

crate::with_anndata!(&anndata, run)
}
}
12 changes: 1 addition & 11 deletions snapatac2-python/src/utils/anndata.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ use pyanndata::anndata::memory;
use pyanndata::{AnnData, AnnDataSet};
use pyo3::prelude::*;

use snapatac2_core::preprocessing::{qc, SnapData, GenomeCount, ContactMap, count_data::FragmentType};
use snapatac2_core::preprocessing::{qc, SnapData, GenomeCount, count_data::FragmentType};

pub struct PyAnnData<'py>(memory::PyAnnData<'py>);

Expand Down Expand Up @@ -157,16 +157,6 @@ impl<'py> SnapData for PyAnnData<'py> {
Ok(GenomeCount::new(self.read_chrom_sizes()?, matrices))
}

fn contact_count_iter(
&self, chunk_size: usize
) -> Result<ContactMap<Self::CountIter>>
{
Ok(ContactMap::new(
self.read_chrom_sizes()?,
self.obsm().get_item_iter("contact", chunk_size).expect("'contact' not found in obsm"),
))
}

fn fragment_size_distribution(&self, max_size: usize) -> Result<Vec<usize>> {
if let Some(fragment) = self.obsm().get_item_iter("fragment_paired", 500) {
Ok(qc::fragment_size_distribution(fragment.map(|x| x.0), max_size))
Expand Down

0 comments on commit 83eb5be

Please sign in to comment.