work on the Genome object

kaizhang · Nov 4, 2023 · 6455ab3 · 6455ab3
1 parent dc41174
commit 6455ab3
Show file tree

Hide file tree

Showing 8 changed files with 145 additions and 136 deletions.
diff --git a/docs/api/datasets.rst b/docs/api/datasets.rst
@@ -11,8 +11,8 @@ in the SnapATAC2 package.
     You can change the data cache directory by setting the `SNAP_DATA_DIR` environmental
     variable.
 
-Genome
-~~~~~~
+Genomes
+~~~~~~~
 
 .. autosummary::
     :toctree: _autosummary

diff --git a/docs/changelog.md b/docs/changelog.md
@@ -5,6 +5,7 @@
 ### Features:
 
 - Support anndata v0.10.
+- Make it easier to build custom genome objects.
 
 ### Bugs fixed:
 

diff --git a/snapatac2-python/setup.py b/snapatac2-python/setup.py
@@ -47,7 +47,7 @@
         "igraph>=0.10.3",
         "pynndescent",
         "pyarrow",
-        "pyfaidx",
+        "pyfaidx>=0.7.0, <0.8.0",
         "rustworkx",
         "scipy>=1.4, <2.0.0",
         "scikit-learn>=1.0, <2.0.0",

diff --git a/snapatac2-python/snapatac2/genome.py b/snapatac2-python/snapatac2/genome.py
@@ -1,135 +1,145 @@
+from __future__ import annotations
+from collections.abc import Callable
+
 from snapatac2.datasets import datasets
+from pathlib import Path
 from pooch import Decompress
-import shutil
 
 class Genome:
-    def __init__(self, chrom_sizes, annotation_filename, fasta = None) -> None:
-        self.chrom_sizes = chrom_sizes
-        self.annotation_filename = annotation_filename
-        self.fasta_filename = fasta
-
-    #custom pooch downloader for local files
-    def local_cp(self,url, output_file, pooch):
-        shutil.copy(url,output_file)
-
-    def is_gz_file(self,filepath):
-        if os.path.exists(filepath):
-            return False
-        with open(filepath, 'rb') as test_f:
-            return test_f.read(2) == b'\x1f\x8b'
-
-    #You need to run this to make your custom assembly accessible downstream
-    #Provide a prefix if you are using annotations from cellranger
-    #where the files are named unidentifiably (genome.fa,genes.gtf.gz, etc)
-    #Adds the file to the pooch registry and gives it a new name if prefix is provided
-    def add_custom_genome(self, genome_prefix='',annotation_prefix=''):
-        self.fasta_cache_filename=genome_prefix+os.path.basename(self.fasta_filename)
-        self.annotation_cache_filename=annotation_prefix+os.path.basename(self.annotation_filename)
-        snap.datasets.datasets().registry[self.fasta_cache_filename]='sha256:'+pooch.file_hash(self.fasta_filename,alg='sha256')
-        snap.datasets.datasets().urls[self.fasta_cache_filename]=self.fasta_filename
-        snap.datasets.datasets().registry[self.annotation_cache_filename]='sha256:'+pooch.file_hash(self.annotation_filename,alg='sha256')
-        snap.datasets.datasets().urls[self.annotation_cache_filename]=self.annotation_filename
-
-    #Works by tricking pooch into using local_cp if the file isn't in the registry
-    def fetch_file(self,key_name,file_name):
-        if self.is_gz_file(file_name):
-            decomp=Decompress(method="gzip")
+    """
+    A class that encapsulates information about a genome, including its FASTA sequence,
+    its annotation, and chromosome sizes.
+
+    Attributes
+    ----------
+    fasta
+        The path to the FASTA file.
+    annotation
+        The path to the annotation file.
+    chrom_sizes
+        A dictionary containing chromosome names and sizes.
+
+    Raises
+    ------
+    ValueError
+        If `fasta` or `annotation` are not a Path, a string, or a callable.
+    """
+
+    def __init__(
+        self,
+        *,
+        fasta: Path | Callable[[], Path], 
+        annotation: Path | Callable[[], Path],
+        chrom_sizes: dict[str, int] | None = None,
+    ):
+        """
+        Initializes the Genome object with paths or callables for FASTA and annotation files,
+        and optionally, chromosome sizes.
+
+        Parameters
+        ----------
+        fasta
+            A Path or callable that returns a Path to the FASTA file.
+        annotation
+            A Path or callable that returns a Path to the annotation file.
+        chrom_sizes
+            Optional chromosome sizes. If not provided, chromosome sizes will
+            be inferred from the FASTA file.
+        """
+        if callable(fasta):
+            self._fetch_fasta = fasta
+            self._fasta = None
+        elif isinstance(fasta, Path) or isinstance(fasta, str):
+            self._fasta = Path(fasta)
+            self._fetch_fasta = None
         else:
-            decomp=None
-
-        try:
-            fetched = datasets().fetch(
-                key_name,
-                processor=decomp,
-                progressbar=True
-            )
-        except ValueError:
-            try:
-                fetched = datasets().fetch(
-                    key_name,
-                    processor=decomp,
-                    progressbar=True,
-                    downloader= self.local_cp
-                )
-            except Exception as e:
-                raise RuntimeError("The file can't be obtained.") from e
-        return fetched
-
-    def fetch_annotations(self):
-        return self.fetch_file(self.annotation_cache_filename,self.annotation_filename)
-
-    def fetch_fasta(self):
-        return self.fetch_file(self.fasta_cache_filename,self.fasta_filename)
+            raise ValueError("fasta must be a Path or Callable")
 
-GRCh37 = Genome(
-    {
-        "chr1": 249250621,
-        "chr2": 243199373,
-        "chr3": 198022430,
-        "chr4": 191154276,
-        "chr5": 180915260,
-        "chr6": 171115067,
-        "chr7": 159138663,
-        "chr8": 146364022,
-        "chr9": 141213431,
-        "chr10": 135534747,
-        "chr11": 135006516,
-        "chr12": 133851895,
-        "chr13": 115169878,
-        "chr14": 107349540,
-        "chr15": 102531392,
-        "chr16": 90354753,
-        "chr17": 81195210,
-        "chr18": 78077248,
-        "chr19": 59128983,
-        "chr20": 63025520,
-        "chr21": 48129895,
-        "chr22": 51304566,
-        "chrX": 155270560,
-        "chrY": 59373566,
-        "chrM": 16571,
-    },
-    "gencode_v41_GRCh37.gff3.gz",
-    "gencode_v41_GRCh37.fa.gz",
-)
+        if callable(annotation):
+            self._fetch_annotation = annotation
+            self._annotation = None
+        elif isinstance(annotation, Path) or isinstance(annotation, str):
+            self._annotation = Path(annotation)
+            self._fetch_annotation = None
+        else:
+            raise ValueError("annotation must be a Path or Callable")
+
+        self._chrom_sizes = chrom_sizes
+
+    @property
+    def fasta(self):
+        """
+        The Path to the FASTA file. 
 
+        Returns
+        -------
+        Path
+            The path to the FASTA file.
+        """
+        if self._fasta is None:
+            self._fasta = Path(self._fetch_fasta())
+        return self._fasta
+
+    @property
+    def annotation(self):
+        """
+        The Path to the annotation file.
+
+        Returns
+        -------
+        Path
+            The path to the annotation file.
+        """
+        if self._annotation is None:
+            self._annotation = Path(self._fetch_annotation())
+        return self._annotation
+
+    @property
+    def chrom_sizes(self):
+        """
+        A dictionary with chromosome names as keys and their lengths as values.
+
+        Returns
+        -------
+        dict[str, int]
+            A dictionary of chromosome sizes.
+        """
+        if self._chrom_sizes is None:
+            from pyfaidx import Fasta
+            fasta = Fasta(self.fasta)
+            self._chrom_sizes = {chr: len(fasta[chr]) for chr in fasta.keys()}
+        return self._chrom_sizes
+
+GRCh37 = Genome(
+    fasta=lambda : datasets().fetch(
+        "gencode_v41_GRCh37.fa.gz", processor=Decompress(method = "gzip"), progressbar=True),
+    annotation=lambda : datasets().fetch(
+        "gencode_v41_GRCh37.gff3.gz", progressbar=True),
+    )
 hg19 = GRCh37
 
 GRCh38 = Genome(
-    {
-        "chr1": 248956422,
-        "chr2": 242193529,
-        "chr3": 198295559,
-        "chr4": 190214555,
-        "chr5": 181538259,
-        "chr6": 170805979,
-        "chr7": 159345973,
-        "chr8": 145138636,
-        "chr9": 138394717,
-        "chr10": 133797422,
-        "chr11": 135086622,
-        "chr12": 133275309,
-        "chr13": 114364328,
-        "chr14": 107043718,
-        "chr15": 101991189,
-        "chr16": 90338345,
-        "chr17": 83257441,
-        "chr18": 80373285,
-        "chr19": 58617616,
-        "chr20": 64444167,
-        "chr21": 46709983,
-        "chr22": 50818468,
-        "chrX": 156040895,
-        "chrY": 57227415
-    },
-    "gencode_v41_GRCh38.gff3.gz",
-    "gencode_v41_GRCh38.fa.gz",
-)
-
+    fasta=lambda :  datasets().fetch(
+        "gencode_v41_GRCh38.fa.gz", processor=Decompress(method = "gzip"), progressbar=True),
+    annotation=lambda : datasets().fetch(
+        "gencode_v41_GRCh38.gff3.gz", progressbar=True),
+    chrom_sizes= {"chr1": 248956422, "chr2": 242193529, "chr3": 198295559,
+                  "chr4": 190214555, "chr5": 181538259, "chr6": 170805979,
+                  "chr7": 159345973, "chr8": 145138636, "chr9": 138394717,
+                  "chr10": 133797422, "chr11": 135086622, "chr12": 133275309,
+                  "chr13": 114364328, "chr14": 107043718, "chr15": 101991189,
+                  "chr16": 90338345, "chr17": 83257441, "chr18": 80373285,
+                  "chr19": 58617616, "chr20": 64444167, "chr21": 46709983,
+                  "chr22": 50818468, "chrX": 156040895, "chrY": 57227415},
+    )
 hg38 = GRCh38
 
 GRCm39 = Genome(
-    {
+    fasta=lambda : datasets().fetch(
+        "gencode_vM30_GRCm39.fa.gz", processor=Decompress(method = "gzip"), progressbar=True),
+    annotation=lambda : datasets().fetch(
+        "gencode_vM30_GRCm39.gff3.gz", progressbar=True),
+    chrom_sizes={
         "chr1": 195154279,
         "chr2": 181755017,
         "chr3": 159745316,
@@ -152,14 +162,15 @@ def fetch_fasta(self):
         "chrX": 169476592,
         "chrY": 91455967,
     },
-    "gencode_vM30_GRCm39.gff3.gz",
-    "gencode_vM30_GRCm39.fa.gz",
-)
-
+    )
 mm39 = GRCm39
 
 GRCm38 = Genome(
-    {
+    fasta=lambda : datasets().fetch(
+        "gencode_vM25_GRCm38.fa.gz", processor=Decompress(method = "gzip"), progressbar=True),
+    annotation=lambda : datasets().fetch(
+        "gencode_vM25_GRCm38.gff3.gz", progressbar=True),
+    chrom_sizes={
         "chr1": 195471971,
         "chr2": 182113224,
         "chr3": 160039680,
@@ -182,8 +193,5 @@ def fetch_fasta(self):
         "chrX": 171031299,
         "chrY": 91744698,
     },
-    "gencode_vM25_GRCm38.gff3.gz",
-    "gencode_vM25_GRCm38.fa.gz",
-)
-
+    )
 mm10 = GRCm38
diff --git a/snapatac2-python/snapatac2/metrics/__init__.py b/snapatac2-python/snapatac2/metrics/__init__.py
@@ -52,7 +52,7 @@ def tsse(
     AAATGAGAGTCCCGCA-1    33.264463
     Name: tsse, dtype: float64
     """
-    gene_anno = gene_anno.fetch_annotations() if isinstance(gene_anno, Genome) else gene_anno
+    gene_anno = gene_anno.annotation if isinstance(gene_anno, Genome) else gene_anno
 
     if isinstance(adata, list):
         result = snapatac2._utils.anndata_par(

diff --git a/snapatac2-python/snapatac2/preprocessing/_basic.py b/snapatac2-python/snapatac2/preprocessing/_basic.py
@@ -594,7 +594,7 @@ def make_gene_matrix(
         obs: 'n_fragment', 'frac_dup', 'frac_mito'
     """
     if isinstance(gene_anno, Genome):
-        gene_anno = gene_anno.fetch_annotations()
+        gene_anno = gene_anno.annotation
 
     if inplace:
         internal.mk_gene_matrix(adata, gene_anno, chunk_size, use_x, id_type, None)

diff --git a/snapatac2-python/snapatac2/tools/_motif.py b/snapatac2-python/snapatac2/tools/_motif.py
@@ -61,7 +61,7 @@ def count_occurrence(query, idx_map, bound):
     region_to_idx = dict(map(lambda x: (x[1], x[0]), enumerate(all_regions)))
 
     logging.info("Fetching {} sequences ...".format(len(all_regions)))
-    genome = genome_fasta.fetch_fasta() if isinstance(genome_fasta, Genome) else str(genome_fasta)
+    genome = genome_fasta.fasta if isinstance(genome_fasta, Genome) else str(genome_fasta)
     genome = Fasta(genome, one_based_attributes=False)
     sequences = [fetch_seq(genome, region) for region in all_regions]
 

diff --git a/snapatac2-python/snapatac2/tools/_network.py b/snapatac2-python/snapatac2/tools/_network.py
@@ -76,7 +76,7 @@ def init_network_from_annotation(
         regulatory domains.
     """
     if isinstance(anno_file, Genome):
-        anno_file = anno_file.fetch_annotations()
+        anno_file = anno_file.annotation
 
     region_added = {}
     graph = rx.PyDiGraph()
@@ -252,7 +252,7 @@ def add_tf_binding(
 
     regions = [(i, network[i].id) for i in network.node_indices() if network[i].type == "region"]
     logging.info("Fetching {} sequences ...".format(len(regions)))
-    genome = genome_fasta.fetch_fasta() if isinstance(genome_fasta, Genome) else str(genome_fasta)
+    genome = genome_fasta.fasta if isinstance(genome_fasta, Genome) else str(genome_fasta)
     genome = Fasta(genome, one_based_attributes=False)
     sequences = [fetch_seq(genome, region) for _, region in regions]