diff --git a/docs/api/preprocessing.rst b/docs/api/preprocessing.rst index cbcec1ab..555790cb 100644 --- a/docs/api/preprocessing.rst +++ b/docs/api/preprocessing.rst @@ -11,6 +11,7 @@ BAM/Fragment file processing pp.make_fragment_file pp.import_data + pp.import_contacts pp.call_cells Matrix operation diff --git a/docs/changelog.md b/docs/changelog.md index 53b202e9..0f660d46 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -9,6 +9,7 @@ - `pp.make_fragment_file` can now work with 10X BAM files. - Add `pp.call_cells` to identify nonempty barcodes from raw data. - Add `pp.recipe_10x_metrics` to compute 10X metrics. + - Add `pp.import_contacts` to process scHi-C data. ## Release 2.6.4 (released May 28, 2024) diff --git a/docs/tutorials/hic.ipynb b/docs/tutorials/hic.ipynb new file mode 100644 index 00000000..f19d2b41 --- /dev/null +++ b/docs/tutorials/hic.ipynb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:548f2cf0db9e248448524ee84c11cf681c4c22889d6f40130294bd8e3b32906f +size 455424 diff --git a/snapatac2-core/src/preprocessing/count_data.rs b/snapatac2-core/src/preprocessing/count_data.rs index 5dd953c4..6dcc25cb 100644 --- a/snapatac2-core/src/preprocessing/count_data.rs +++ b/snapatac2-core/src/preprocessing/count_data.rs @@ -49,8 +49,6 @@ pub trait SnapData: AnnDataOp { fn get_count_iter(&self, chunk_size: usize) -> Result>>>; - fn contact_count_iter(&self, chunk_size: usize) -> Result>; - /// Read counts stored in the `X` matrix. fn read_chrom_values( &self, @@ -120,13 +118,6 @@ impl SnapData for AnnData { Ok(GenomeCount::new(self.read_chrom_sizes()?, matrices)) } - fn contact_count_iter(&self, chunk_size: usize) -> Result> { - Ok(ContactMap::new( - self.read_chrom_sizes()?, - self.obsm().get_item_iter("contact", chunk_size).unwrap(), - )) - } - fn fragment_size_distribution(&self, max_size: usize) -> Result> { if let Some(fragment) = self.obsm().get_item_iter("fragment_paired", 500) { Ok(qc::fragment_size_distribution(fragment.map(|x| x.0), max_size)) @@ -155,17 +146,6 @@ impl SnapData for AnnDataSet { Ok(GenomeCount::new(self.read_chrom_sizes()?, matrices)) } - fn contact_count_iter(&self, chunk_size: usize) -> Result> { - Ok(ContactMap::new( - self.read_chrom_sizes()?, - self.adatas() - .inner() - .get_obsm() - .get_item_iter("contact", chunk_size) - .unwrap(), - )) - } - fn fragment_size_distribution(&self, max_size: usize) -> Result> { if let Some(fragment) = self.adatas().inner().get_obsm().get_item_iter("fragment_paired", 500) { Ok(qc::fragment_size_distribution(fragment.map(|x| x.0), max_size)) diff --git a/snapatac2-core/src/preprocessing/count_data/coverage.rs b/snapatac2-core/src/preprocessing/count_data/coverage.rs index 1d12fd4a..c1e19d9c 100644 --- a/snapatac2-core/src/preprocessing/count_data/coverage.rs +++ b/snapatac2-core/src/preprocessing/count_data/coverage.rs @@ -371,7 +371,7 @@ pub struct ContactMap { impl ContactMap where - I: ExactSizeIterator, usize, usize)>, + I: Iterator>, { pub fn new(chrom_sizes: ChromSizes, coverage: I) -> Self { Self { @@ -394,21 +394,21 @@ where /// Output the raw coverage matrix. pub fn into_values( self, - ) -> impl ExactSizeIterator, usize, usize)> { + ) -> impl Iterator> { let index = self.get_gindex(); let ori_index = self.index; let genome_size = ori_index.len(); let new_size = index.len(); - self.coverage.map(move |(mat, i, j)| { + self.coverage.map(move |mat| { let new_mat = if self.resolution <= 1 { let (pattern, data) = mat.into_pattern_and_values(); let new_data = data .into_iter() - .map(|x| T::from_u8(x).unwrap()) + .map(|x| T::from_u32(x).unwrap()) .collect::>(); CsrMatrix::try_from_pattern_and_values(pattern, new_data).unwrap() } else { - let n = j - i; + let n = mat.nrows(); let vec = (0..n) .into_par_iter() .map(|k| { @@ -425,7 +425,7 @@ where let i1 = index.get_position_rev(locus1.chrom(), locus1.start()); let i2 = index.get_position_rev(locus2.chrom(), locus2.start()); let i = i1 * new_size + i2; - let val = T::from_u8(*val).unwrap(); + let val = T::from_u32(*val).unwrap(); *count.entry(i).or_insert(Zero::zero()) += val; }); count.into_iter().collect::>() @@ -434,7 +434,7 @@ where let (r, c, offset, ind, data) = to_csr_data(vec, new_size * new_size); CsrMatrix::try_from_csr_data(r,c,offset,ind, data).unwrap() }; - (new_mat, i, j) + new_mat }) } } diff --git a/snapatac2-core/src/preprocessing/count_data/import.rs b/snapatac2-core/src/preprocessing/count_data/import.rs index 55f08786..a296f0bd 100644 --- a/snapatac2-core/src/preprocessing/count_data/import.rs +++ b/snapatac2-core/src/preprocessing/count_data/import.rs @@ -193,6 +193,7 @@ pub fn import_contacts( anndata: &A, contacts: I, regions: &GenomeRegions, + bin_size: usize, chunk_size: usize, ) -> Result<()> where @@ -217,39 +218,40 @@ where .unwrap(), ); let mut scanned_barcodes = IndexSet::new(); - anndata.obsm().add_iter( - "contact", - contacts - .chunk_by(|x| x.barcode.clone()) - .into_iter() - .progress_with(spinner) - .chunks(chunk_size) - .into_iter() - .map(|chunk| { - let data: Vec> = chunk.map(|(barcode, x)| { - if !scanned_barcodes.insert(barcode.clone()) { - panic!("Please sort fragment file by barcodes"); - } - x.collect() - }).collect(); + let binding = contacts.chunk_by(|x| x.barcode.clone()); + let binding2 = binding.into_iter().progress_with(spinner).chunks(chunk_size); + let binding3 = binding2 + .into_iter() + .map(|chunk| { + let data: Vec> = chunk.map(|(barcode, x)| { + if !scanned_barcodes.insert(barcode.clone()) { + panic!("Please sort fragment file by barcodes"); + } + x.collect() + }).collect(); - let counts: Vec<_> = data - .into_par_iter() - .map(|x| { - let mut count = BTreeMap::new(); - x.into_iter().for_each(|c| { + let counts: Vec<_> = data + .into_par_iter() + .map(|x| { + let mut count = BTreeMap::new(); + x.into_iter().for_each(|c| { + if genome_index.contain_chrom(&c.chrom1) && genome_index.contain_chrom(&c.chrom2) { let pos1 = genome_index.get_position_rev(&c.chrom1, c.start1); let pos2 = genome_index.get_position_rev(&c.chrom2, c.start2); let i = pos1 * genome_size + pos2; count.entry(i).and_modify(|x| *x += c.count).or_insert(c.count); - }); - count.into_iter().collect::>() - }).collect(); + } + }); + count.into_iter().collect::>() + }).collect(); - let (r, c, offset, ind, data) = to_csr_data(counts, genome_size*genome_size); - CsrMatrix::try_from_csr_data(r, c, offset, ind, data).unwrap() - }), - )?; + let (r, c, offset, ind, data) = to_csr_data(counts, genome_size*genome_size); + CsrMatrix::try_from_csr_data(r, c, offset, ind, data).unwrap() + }); + let contact_map = super::ContactMap::new(chrom_sizes, binding3).with_resolution(bin_size); + + anndata.set_x_from_iter(contact_map.into_values::())?; + anndata.set_var_names(anndata.n_vars().into())?; anndata.uns().add( "reference_sequences", diff --git a/snapatac2-core/src/preprocessing/count_data/matrix.rs b/snapatac2-core/src/preprocessing/count_data/matrix.rs index 6475604e..25657e5c 100644 --- a/snapatac2-core/src/preprocessing/count_data/matrix.rs +++ b/snapatac2-core/src/preprocessing/count_data/matrix.rs @@ -197,4 +197,4 @@ where _ => panic!("id_type must be 'transcript' or 'gene'"), } Ok(()) -} +} \ No newline at end of file diff --git a/snapatac2-python/python/snapatac2/datasets.py b/snapatac2-python/python/snapatac2/datasets.py index 67fdf3b7..873b4ab2 100644 --- a/snapatac2-python/python/snapatac2/datasets.py +++ b/snapatac2-python/python/snapatac2/datasets.py @@ -40,7 +40,8 @@ def datasets(): # Genome files "gencode_v41_GRCh37.gff3.gz": "sha256:df96d3f0845127127cc87c729747ae39bc1f4c98de6180b112e71dda13592673", - "gencode_v41_GRCh37.fa.gz": "sha256:94330d402e53cf39a1fef6c132e2500121909c2dfdce95cc31d541404c0ed39e", + "gencode_v41_GRCh37.fa.gz": "sha256:ac73947d38df63ccb00724520a5c31d880c1ca423702ca7ccb7e6c2182a362d9", + #"gencode_v41_GRCh37.fa.gz": "sha256:94330d402e53cf39a1fef6c132e2500121909c2dfdce95cc31d541404c0ed39e", "gencode_v41_GRCh38.gff3.gz": "sha256:b82a655bdb736ca0e463a8f5d00242bedf10fa88ce9d651a017f135c7c4e9285", "gencode_v41_GRCh38.fa.gz": "sha256:4fac949d7021cbe11117ddab8ec1960004df423d672446cadfbc8cca8007e228", "gencode_vM25_GRCm38.gff3.gz": "sha256:e8ed48bef6a44fdf0db7c10a551d4398aa341318d00fbd9efd69530593106846", @@ -68,7 +69,8 @@ def datasets(): "Meuleman_2020.meme": "https://osf.io/download/6uet5/", "gencode_v41_GRCh37.gff3.gz": "https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_41/GRCh37_mapping/gencode.v41lift37.basic.annotation.gff3.gz", - "gencode_v41_GRCh37.fa.gz": "https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_41/GRCh37_mapping/GRCh37.primary_assembly.genome.fa.gz", + "gencode_v41_GRCh37.fa.gz": "https://hgdownload.soe.ucsc.edu/goldenPath/hg19/bigZips/hg19.fa.gz", + #"gencode_v41_GRCh37.fa.gz": "https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_41/GRCh37_mapping/GRCh37.primary_assembly.genome.fa.gz", "gencode_v41_GRCh38.gff3.gz": "https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_41/gencode.v41.basic.annotation.gff3.gz", "gencode_v41_GRCh38.fa.gz": "https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_41/GRCh38.primary_assembly.genome.fa.gz", "gencode_vM25_GRCm38.gff3.gz": "https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_mouse/release_M25/gencode.vM25.basic.annotation.gff3.gz", diff --git a/snapatac2-python/python/snapatac2/preprocessing/_basic.py b/snapatac2-python/python/snapatac2/preprocessing/_basic.py index 9b2eb5c7..6569de0d 100644 --- a/snapatac2-python/python/snapatac2/preprocessing/_basic.py +++ b/snapatac2-python/python/snapatac2/preprocessing/_basic.py @@ -13,7 +13,7 @@ from snapatac2.genome import Genome __all__ = ['make_fragment_file', 'import_data', 'import_contacts', 'add_tile_matrix', - 'make_peak_matrix', 'call_cells', 'filter_cells', 'select_features', 'make_gene_matrix' + 'make_peak_matrix', 'call_cells', 'filter_cells', 'select_features', 'make_gene_matrix', ] def make_fragment_file( @@ -346,7 +346,8 @@ def import_contacts( genome: Genome | None = None, chrom_size: dict[str, int] | None = None, sorted_by_barcode: bool = True, - chunk_size: int = 2000, + bin_size: int = 500000, + chunk_size: int = 200, tempdir: Path | None = None, backend: Literal['hdf5'] = 'hdf5', ) -> internal.AnnData: @@ -374,6 +375,8 @@ def import_contacts( If `sorted_by_barcode == True`, this function makes use of small fixed amout of memory. If `sorted_by_barcode == False` and `low_memory == False`, all data will be kept in memory. See `low_memory` for more details. + bin_size + The size of consecutive genomic regions used to record the counts. chunk_size Increasing the chunk_size speeds up I/O but uses more memory. tempdir @@ -395,7 +398,7 @@ def import_contacts( adata = AnnData() if file is None else internal.AnnData(filename=file, backend=backend) internal.import_contacts( - adata, contact_file, chrom_size, sorted_by_barcode, chunk_size, tempdir + adata, contact_file, chrom_size, sorted_by_barcode, bin_size, chunk_size, tempdir ) return adata diff --git a/snapatac2-python/src/preprocessing.rs b/snapatac2-python/src/preprocessing.rs index af6b7dd7..c8ac6a66 100644 --- a/snapatac2-python/src/preprocessing.rs +++ b/snapatac2-python/src/preprocessing.rs @@ -144,6 +144,7 @@ pub(crate) fn import_contacts( contact_file: PathBuf, chrom_size: BTreeMap, fragment_is_sorted_by_name: bool, + bin_size: usize, chunk_size: usize, tempdir: Option, ) -> Result<()> @@ -171,7 +172,7 @@ pub(crate) fn import_contacts( macro_rules! run { ($data:expr) => { - preprocessing::import_contacts($data, sorted_contacts, &chrom_sizes, chunk_size)? + preprocessing::import_contacts($data, sorted_contacts, &chrom_sizes, bin_size, chunk_size)? }; } @@ -399,4 +400,4 @@ pub(crate) fn fragment_size_distribution( } crate::with_anndata!(&anndata, run) -} +} \ No newline at end of file diff --git a/snapatac2-python/src/utils/anndata.rs b/snapatac2-python/src/utils/anndata.rs index af106253..b306b473 100644 --- a/snapatac2-python/src/utils/anndata.rs +++ b/snapatac2-python/src/utils/anndata.rs @@ -10,7 +10,7 @@ use pyanndata::anndata::memory; use pyanndata::{AnnData, AnnDataSet}; use pyo3::prelude::*; -use snapatac2_core::preprocessing::{qc, SnapData, GenomeCount, ContactMap, count_data::FragmentType}; +use snapatac2_core::preprocessing::{qc, SnapData, GenomeCount, count_data::FragmentType}; pub struct PyAnnData<'py>(memory::PyAnnData<'py>); @@ -157,16 +157,6 @@ impl<'py> SnapData for PyAnnData<'py> { Ok(GenomeCount::new(self.read_chrom_sizes()?, matrices)) } - fn contact_count_iter( - &self, chunk_size: usize - ) -> Result> - { - Ok(ContactMap::new( - self.read_chrom_sizes()?, - self.obsm().get_item_iter("contact", chunk_size).expect("'contact' not found in obsm"), - )) - } - fn fragment_size_distribution(&self, max_size: usize) -> Result> { if let Some(fragment) = self.obsm().get_item_iter("fragment_paired", 500) { Ok(qc::fragment_size_distribution(fragment.map(|x| x.0), max_size))