add import_contacts

kaizhang · Aug 26, 2024 · 83eb5be · 83eb5be
1 parent 69fd3c0
commit 83eb5be
Show file tree

Hide file tree

Showing 11 changed files with 56 additions and 73 deletions.
diff --git a/docs/api/preprocessing.rst b/docs/api/preprocessing.rst
@@ -11,6 +11,7 @@ BAM/Fragment file processing
 
     pp.make_fragment_file
     pp.import_data
+    pp.import_contacts
     pp.call_cells
 
 Matrix operation

diff --git a/docs/changelog.md b/docs/changelog.md
@@ -9,6 +9,7 @@
   - `pp.make_fragment_file` can now work with 10X BAM files.
   - Add `pp.call_cells` to identify nonempty barcodes from raw data.
   - Add `pp.recipe_10x_metrics` to compute 10X metrics.
+  - Add `pp.import_contacts` to process scHi-C data.
 
 ## Release 2.6.4 (released May 28, 2024)
 

diff --git a/docs/tutorials/hic.ipynb b/docs/tutorials/hic.ipynb
diff --git a/snapatac2-core/src/preprocessing/count_data.rs b/snapatac2-core/src/preprocessing/count_data.rs
@@ -49,8 +49,6 @@ pub trait SnapData: AnnDataOp {
     fn get_count_iter(&self, chunk_size: usize) ->
         Result<GenomeCount<Box<dyn ExactSizeIterator<Item = (FragmentType, usize, usize)>>>>;
 
-    fn contact_count_iter(&self, chunk_size: usize) -> Result<ContactMap<Self::CountIter>>;
-
     /// Read counts stored in the `X` matrix.
     fn read_chrom_values(
         &self,
@@ -120,13 +118,6 @@ impl<B: Backend> SnapData for AnnData<B> {
         Ok(GenomeCount::new(self.read_chrom_sizes()?, matrices))
     }
 
-    fn contact_count_iter(&self, chunk_size: usize) -> Result<ContactMap<Self::CountIter>> {
-        Ok(ContactMap::new(
-            self.read_chrom_sizes()?,
-            self.obsm().get_item_iter("contact", chunk_size).unwrap(),
-        ))
-    }
-
     fn fragment_size_distribution(&self, max_size: usize) -> Result<Vec<usize>> {
         if let Some(fragment) = self.obsm().get_item_iter("fragment_paired", 500) {
             Ok(qc::fragment_size_distribution(fragment.map(|x| x.0), max_size))
@@ -155,17 +146,6 @@ impl<B: Backend> SnapData for AnnDataSet<B> {
         Ok(GenomeCount::new(self.read_chrom_sizes()?, matrices))
     }
 
-    fn contact_count_iter(&self, chunk_size: usize) -> Result<ContactMap<Self::CountIter>> {
-        Ok(ContactMap::new(
-            self.read_chrom_sizes()?,
-            self.adatas()
-                .inner()
-                .get_obsm()
-                .get_item_iter("contact", chunk_size)
-                .unwrap(),
-        ))
-    }
-
     fn fragment_size_distribution(&self, max_size: usize) -> Result<Vec<usize>> {
         if let Some(fragment) = self.adatas().inner().get_obsm().get_item_iter("fragment_paired", 500) {
             Ok(qc::fragment_size_distribution(fragment.map(|x| x.0), max_size))

diff --git a/snapatac2-core/src/preprocessing/count_data/coverage.rs b/snapatac2-core/src/preprocessing/count_data/coverage.rs
@@ -371,7 +371,7 @@ pub struct ContactMap<I> {
 
 impl<I> ContactMap<I>
 where
-    I: ExactSizeIterator<Item = (CsrMatrix<u8>, usize, usize)>,
+    I: Iterator<Item = CsrMatrix<u32>>,
 {
     pub fn new(chrom_sizes: ChromSizes, coverage: I) -> Self {
         Self {
@@ -394,21 +394,21 @@ where
     /// Output the raw coverage matrix.
     pub fn into_values<T: Zero + FromPrimitive + AddAssign + Send>(
         self,
-    ) -> impl ExactSizeIterator<Item = (CsrMatrix<T>, usize, usize)> {
+    ) -> impl Iterator<Item = CsrMatrix<T>> {
         let index = self.get_gindex();
         let ori_index = self.index;
         let genome_size = ori_index.len();
         let new_size = index.len();
-        self.coverage.map(move |(mat, i, j)| {
+        self.coverage.map(move |mat| {
             let new_mat = if self.resolution <= 1 {
                 let (pattern, data) = mat.into_pattern_and_values();
                 let new_data = data
                     .into_iter()
-                    .map(|x| T::from_u8(x).unwrap())
+                    .map(|x| T::from_u32(x).unwrap())
                     .collect::<Vec<_>>();
                 CsrMatrix::try_from_pattern_and_values(pattern, new_data).unwrap()
             } else {
-                let n = j - i;
+                let n = mat.nrows();
                 let vec = (0..n)
                     .into_par_iter()
                     .map(|k| {
@@ -425,7 +425,7 @@ where
                                 let i1 = index.get_position_rev(locus1.chrom(), locus1.start());
                                 let i2 = index.get_position_rev(locus2.chrom(), locus2.start());
                                 let i = i1 * new_size + i2;
-                                let val = T::from_u8(*val).unwrap();
+                                let val = T::from_u32(*val).unwrap();
                                 *count.entry(i).or_insert(Zero::zero()) += val;
                             });
                         count.into_iter().collect::<Vec<_>>()
@@ -434,7 +434,7 @@ where
                 let (r, c, offset, ind, data) = to_csr_data(vec, new_size * new_size);
                 CsrMatrix::try_from_csr_data(r,c,offset,ind, data).unwrap()
             };
-            (new_mat, i, j)
+            new_mat
         })
     }
 }
diff --git a/snapatac2-core/src/preprocessing/count_data/import.rs b/snapatac2-core/src/preprocessing/count_data/import.rs
@@ -193,6 +193,7 @@ pub fn import_contacts<A, B, I>(
     anndata: &A,
     contacts: I,
     regions: &GenomeRegions<B>,
+    bin_size: usize,
     chunk_size: usize,
 ) -> Result<()>
 where
@@ -217,39 +218,40 @@ where
             .unwrap(),
         );
     let mut scanned_barcodes = IndexSet::new();
-    anndata.obsm().add_iter(
-        "contact",
-        contacts
-            .chunk_by(|x| x.barcode.clone())
-            .into_iter()
-            .progress_with(spinner)
-            .chunks(chunk_size)
-            .into_iter()
-            .map(|chunk| {
-                let data: Vec<Vec<Contact>> = chunk.map(|(barcode, x)| {
-                    if !scanned_barcodes.insert(barcode.clone()) {
-                        panic!("Please sort fragment file by barcodes");
-                    }
-                    x.collect()
-                }).collect();
+    let binding = contacts.chunk_by(|x| x.barcode.clone());
+    let binding2 = binding.into_iter().progress_with(spinner).chunks(chunk_size);
+    let binding3 = binding2
+        .into_iter()
+        .map(|chunk| {
+            let data: Vec<Vec<Contact>> = chunk.map(|(barcode, x)| {
+                if !scanned_barcodes.insert(barcode.clone()) {
+                    panic!("Please sort fragment file by barcodes");
+                }
+                x.collect()
+            }).collect();
 
-                let counts: Vec<_> = data
-                    .into_par_iter()
-                    .map(|x| {
-                        let mut count = BTreeMap::new();
-                        x.into_iter().for_each(|c| {
+            let counts: Vec<_> = data
+                .into_par_iter()
+                .map(|x| {
+                    let mut count = BTreeMap::new();
+                    x.into_iter().for_each(|c| {
+                        if genome_index.contain_chrom(&c.chrom1) && genome_index.contain_chrom(&c.chrom2) {
                             let pos1 = genome_index.get_position_rev(&c.chrom1, c.start1);
                             let pos2 = genome_index.get_position_rev(&c.chrom2, c.start2);
                             let i = pos1 * genome_size + pos2; 
                             count.entry(i).and_modify(|x| *x += c.count).or_insert(c.count);
-                        });
-                        count.into_iter().collect::<Vec<_>>()
-                    }).collect();
+                        }
+                    });
+                    count.into_iter().collect::<Vec<_>>()
+                }).collect();
 
-                let (r, c, offset, ind, data) = to_csr_data(counts, genome_size*genome_size);
-                CsrMatrix::try_from_csr_data(r, c, offset, ind, data).unwrap()
-            }),
-    )?;
+            let (r, c, offset, ind, data) = to_csr_data(counts, genome_size*genome_size);
+            CsrMatrix::try_from_csr_data(r, c, offset, ind, data).unwrap()
+        });
+    let contact_map = super::ContactMap::new(chrom_sizes, binding3).with_resolution(bin_size);
+
+    anndata.set_x_from_iter(contact_map.into_values::<u32>())?;
+    anndata.set_var_names(anndata.n_vars().into())?;
 
     anndata.uns().add(
         "reference_sequences",

diff --git a/snapatac2-core/src/preprocessing/count_data/matrix.rs b/snapatac2-core/src/preprocessing/count_data/matrix.rs
@@ -197,4 +197,4 @@ where
         _ => panic!("id_type must be 'transcript' or 'gene'"),
     }
     Ok(())
-}
+}
diff --git a/snapatac2-python/python/snapatac2/datasets.py b/snapatac2-python/python/snapatac2/datasets.py
@@ -40,7 +40,8 @@ def datasets():
 
                 # Genome files
                 "gencode_v41_GRCh37.gff3.gz": "sha256:df96d3f0845127127cc87c729747ae39bc1f4c98de6180b112e71dda13592673",
-                "gencode_v41_GRCh37.fa.gz": "sha256:94330d402e53cf39a1fef6c132e2500121909c2dfdce95cc31d541404c0ed39e",
+                "gencode_v41_GRCh37.fa.gz": "sha256:ac73947d38df63ccb00724520a5c31d880c1ca423702ca7ccb7e6c2182a362d9",
+                #"gencode_v41_GRCh37.fa.gz": "sha256:94330d402e53cf39a1fef6c132e2500121909c2dfdce95cc31d541404c0ed39e",
                 "gencode_v41_GRCh38.gff3.gz": "sha256:b82a655bdb736ca0e463a8f5d00242bedf10fa88ce9d651a017f135c7c4e9285",
                 "gencode_v41_GRCh38.fa.gz": "sha256:4fac949d7021cbe11117ddab8ec1960004df423d672446cadfbc8cca8007e228",
                 "gencode_vM25_GRCm38.gff3.gz": "sha256:e8ed48bef6a44fdf0db7c10a551d4398aa341318d00fbd9efd69530593106846",
@@ -68,7 +69,8 @@ def datasets():
                 "Meuleman_2020.meme": "https://osf.io/download/6uet5/",
 
                 "gencode_v41_GRCh37.gff3.gz": "https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_41/GRCh37_mapping/gencode.v41lift37.basic.annotation.gff3.gz",
-                "gencode_v41_GRCh37.fa.gz": "https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_41/GRCh37_mapping/GRCh37.primary_assembly.genome.fa.gz",
+                "gencode_v41_GRCh37.fa.gz": "https://hgdownload.soe.ucsc.edu/goldenPath/hg19/bigZips/hg19.fa.gz",
+                #"gencode_v41_GRCh37.fa.gz": "https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_41/GRCh37_mapping/GRCh37.primary_assembly.genome.fa.gz",
                 "gencode_v41_GRCh38.gff3.gz": "https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_41/gencode.v41.basic.annotation.gff3.gz",
                 "gencode_v41_GRCh38.fa.gz": "https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_41/GRCh38.primary_assembly.genome.fa.gz",
                 "gencode_vM25_GRCm38.gff3.gz": "https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_mouse/release_M25/gencode.vM25.basic.annotation.gff3.gz",

diff --git a/snapatac2-python/python/snapatac2/preprocessing/_basic.py b/snapatac2-python/python/snapatac2/preprocessing/_basic.py
@@ -13,7 +13,7 @@
 from snapatac2.genome import Genome
 
 __all__ = ['make_fragment_file', 'import_data', 'import_contacts', 'add_tile_matrix',
-           'make_peak_matrix', 'call_cells', 'filter_cells', 'select_features', 'make_gene_matrix'
+           'make_peak_matrix', 'call_cells', 'filter_cells', 'select_features', 'make_gene_matrix',
 ]
 
 def make_fragment_file(
@@ -346,7 +346,8 @@ def import_contacts(
     genome: Genome | None = None,
     chrom_size: dict[str, int] | None = None,
     sorted_by_barcode: bool = True,
-    chunk_size: int = 2000,
+    bin_size: int = 500000,
+    chunk_size: int = 200,
     tempdir: Path | None = None,
     backend: Literal['hdf5'] = 'hdf5',
 ) -> internal.AnnData:
@@ -374,6 +375,8 @@ def import_contacts(
         If `sorted_by_barcode == True`, this function makes use of small fixed amout of 
         memory. If `sorted_by_barcode == False` and `low_memory == False`,
         all data will be kept in memory. See `low_memory` for more details.
+    bin_size
+        The size of consecutive genomic regions used to record the counts.
     chunk_size
         Increasing the chunk_size speeds up I/O but uses more memory.
     tempdir
@@ -395,7 +398,7 @@ def import_contacts(
 
     adata = AnnData() if file is None else internal.AnnData(filename=file, backend=backend)
     internal.import_contacts(
-        adata, contact_file, chrom_size, sorted_by_barcode, chunk_size, tempdir
+        adata, contact_file, chrom_size, sorted_by_barcode, bin_size, chunk_size, tempdir
     )
     return adata
 

diff --git a/snapatac2-python/src/preprocessing.rs b/snapatac2-python/src/preprocessing.rs
@@ -144,6 +144,7 @@ pub(crate) fn import_contacts(
     contact_file: PathBuf,
     chrom_size: BTreeMap<String, u64>,
     fragment_is_sorted_by_name: bool,
+    bin_size: usize,
     chunk_size: usize,
     tempdir: Option<PathBuf>,
 ) -> Result<()>
@@ -171,7 +172,7 @@ pub(crate) fn import_contacts(
 
     macro_rules! run {
         ($data:expr) => {
-            preprocessing::import_contacts($data, sorted_contacts, &chrom_sizes, chunk_size)?
+            preprocessing::import_contacts($data, sorted_contacts, &chrom_sizes, bin_size, chunk_size)?
         };
     }
 
@@ -399,4 +400,4 @@ pub(crate) fn fragment_size_distribution(
     }
 
     crate::with_anndata!(&anndata, run)
-}
+}
diff --git a/snapatac2-python/src/utils/anndata.rs b/snapatac2-python/src/utils/anndata.rs
@@ -10,7 +10,7 @@ use pyanndata::anndata::memory;
 use pyanndata::{AnnData, AnnDataSet};
 use pyo3::prelude::*;
 
-use snapatac2_core::preprocessing::{qc, SnapData, GenomeCount, ContactMap, count_data::FragmentType};
+use snapatac2_core::preprocessing::{qc, SnapData, GenomeCount, count_data::FragmentType};
 
 pub struct PyAnnData<'py>(memory::PyAnnData<'py>);
 
@@ -157,16 +157,6 @@ impl<'py> SnapData for PyAnnData<'py> {
         Ok(GenomeCount::new(self.read_chrom_sizes()?, matrices))
     }
 
-    fn contact_count_iter(
-        &self, chunk_size: usize
-    ) -> Result<ContactMap<Self::CountIter>>
-    {
-        Ok(ContactMap::new(
-            self.read_chrom_sizes()?,
-            self.obsm().get_item_iter("contact", chunk_size).expect("'contact' not found in obsm"),
-        ))
-    }
-
     fn fragment_size_distribution(&self, max_size: usize) -> Result<Vec<usize>> {
         if let Some(fragment) = self.obsm().get_item_iter("fragment_paired", 500) {
             Ok(qc::fragment_size_distribution(fragment.map(|x| x.0), max_size))
-Original file line number
+Diff line change
@@ Expand Up / @@ -197,4 +197,4 @@ where @@
             _ => panic!("id_type must be 'transcript' or 'gene'"),
         }
         Ok(())
-    }
+    }