Skip to content

Commit

Permalink
work on the Genome object
Browse files Browse the repository at this point in the history
  • Loading branch information
kaizhang committed Nov 4, 2023
1 parent dc41174 commit 6455ab3
Show file tree
Hide file tree
Showing 8 changed files with 145 additions and 136 deletions.
4 changes: 2 additions & 2 deletions docs/api/datasets.rst
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,8 @@ in the SnapATAC2 package.
You can change the data cache directory by setting the `SNAP_DATA_DIR` environmental
variable.

Genome
~~~~~~
Genomes
~~~~~~~

.. autosummary::
:toctree: _autosummary
Expand Down
1 change: 1 addition & 0 deletions docs/changelog.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
### Features:

- Support anndata v0.10.
- Make it easier to build custom genome objects.

### Bugs fixed:

Expand Down
2 changes: 1 addition & 1 deletion snapatac2-python/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@
"igraph>=0.10.3",
"pynndescent",
"pyarrow",
"pyfaidx",
"pyfaidx>=0.7.0, <0.8.0",
"rustworkx",
"scipy>=1.4, <2.0.0",
"scikit-learn>=1.0, <2.0.0",
Expand Down
264 changes: 136 additions & 128 deletions snapatac2-python/snapatac2/genome.py
Original file line number Diff line number Diff line change
@@ -1,135 +1,145 @@
from __future__ import annotations
from collections.abc import Callable

from snapatac2.datasets import datasets
from pathlib import Path
from pooch import Decompress
import shutil

class Genome:
def __init__(self, chrom_sizes, annotation_filename, fasta = None) -> None:
self.chrom_sizes = chrom_sizes
self.annotation_filename = annotation_filename
self.fasta_filename = fasta

#custom pooch downloader for local files
def local_cp(self,url, output_file, pooch):
shutil.copy(url,output_file)

def is_gz_file(self,filepath):
if os.path.exists(filepath):
return False
with open(filepath, 'rb') as test_f:
return test_f.read(2) == b'\x1f\x8b'

#You need to run this to make your custom assembly accessible downstream
#Provide a prefix if you are using annotations from cellranger
#where the files are named unidentifiably (genome.fa,genes.gtf.gz, etc)
#Adds the file to the pooch registry and gives it a new name if prefix is provided
def add_custom_genome(self, genome_prefix='',annotation_prefix=''):
self.fasta_cache_filename=genome_prefix+os.path.basename(self.fasta_filename)
self.annotation_cache_filename=annotation_prefix+os.path.basename(self.annotation_filename)
snap.datasets.datasets().registry[self.fasta_cache_filename]='sha256:'+pooch.file_hash(self.fasta_filename,alg='sha256')
snap.datasets.datasets().urls[self.fasta_cache_filename]=self.fasta_filename
snap.datasets.datasets().registry[self.annotation_cache_filename]='sha256:'+pooch.file_hash(self.annotation_filename,alg='sha256')
snap.datasets.datasets().urls[self.annotation_cache_filename]=self.annotation_filename

#Works by tricking pooch into using local_cp if the file isn't in the registry
def fetch_file(self,key_name,file_name):
if self.is_gz_file(file_name):
decomp=Decompress(method="gzip")
"""
A class that encapsulates information about a genome, including its FASTA sequence,
its annotation, and chromosome sizes.
Attributes
----------
fasta
The path to the FASTA file.
annotation
The path to the annotation file.
chrom_sizes
A dictionary containing chromosome names and sizes.
Raises
------
ValueError
If `fasta` or `annotation` are not a Path, a string, or a callable.
"""

def __init__(
self,
*,
fasta: Path | Callable[[], Path],
annotation: Path | Callable[[], Path],
chrom_sizes: dict[str, int] | None = None,
):
"""
Initializes the Genome object with paths or callables for FASTA and annotation files,
and optionally, chromosome sizes.
Parameters
----------
fasta
A Path or callable that returns a Path to the FASTA file.
annotation
A Path or callable that returns a Path to the annotation file.
chrom_sizes
Optional chromosome sizes. If not provided, chromosome sizes will
be inferred from the FASTA file.
"""
if callable(fasta):
self._fetch_fasta = fasta
self._fasta = None
elif isinstance(fasta, Path) or isinstance(fasta, str):
self._fasta = Path(fasta)
self._fetch_fasta = None
else:
decomp=None

try:
fetched = datasets().fetch(
key_name,
processor=decomp,
progressbar=True
)
except ValueError:
try:
fetched = datasets().fetch(
key_name,
processor=decomp,
progressbar=True,
downloader= self.local_cp
)
except Exception as e:
raise RuntimeError("The file can't be obtained.") from e
return fetched

def fetch_annotations(self):
return self.fetch_file(self.annotation_cache_filename,self.annotation_filename)

def fetch_fasta(self):
return self.fetch_file(self.fasta_cache_filename,self.fasta_filename)
raise ValueError("fasta must be a Path or Callable")

GRCh37 = Genome(
{
"chr1": 249250621,
"chr2": 243199373,
"chr3": 198022430,
"chr4": 191154276,
"chr5": 180915260,
"chr6": 171115067,
"chr7": 159138663,
"chr8": 146364022,
"chr9": 141213431,
"chr10": 135534747,
"chr11": 135006516,
"chr12": 133851895,
"chr13": 115169878,
"chr14": 107349540,
"chr15": 102531392,
"chr16": 90354753,
"chr17": 81195210,
"chr18": 78077248,
"chr19": 59128983,
"chr20": 63025520,
"chr21": 48129895,
"chr22": 51304566,
"chrX": 155270560,
"chrY": 59373566,
"chrM": 16571,
},
"gencode_v41_GRCh37.gff3.gz",
"gencode_v41_GRCh37.fa.gz",
)
if callable(annotation):
self._fetch_annotation = annotation
self._annotation = None
elif isinstance(annotation, Path) or isinstance(annotation, str):
self._annotation = Path(annotation)
self._fetch_annotation = None
else:
raise ValueError("annotation must be a Path or Callable")

self._chrom_sizes = chrom_sizes

@property
def fasta(self):
"""
The Path to the FASTA file.
Returns
-------
Path
The path to the FASTA file.
"""
if self._fasta is None:
self._fasta = Path(self._fetch_fasta())
return self._fasta

@property
def annotation(self):
"""
The Path to the annotation file.
Returns
-------
Path
The path to the annotation file.
"""
if self._annotation is None:
self._annotation = Path(self._fetch_annotation())
return self._annotation

@property
def chrom_sizes(self):
"""
A dictionary with chromosome names as keys and their lengths as values.
Returns
-------
dict[str, int]
A dictionary of chromosome sizes.
"""
if self._chrom_sizes is None:
from pyfaidx import Fasta
fasta = Fasta(self.fasta)
self._chrom_sizes = {chr: len(fasta[chr]) for chr in fasta.keys()}
return self._chrom_sizes

GRCh37 = Genome(
fasta=lambda : datasets().fetch(
"gencode_v41_GRCh37.fa.gz", processor=Decompress(method = "gzip"), progressbar=True),
annotation=lambda : datasets().fetch(
"gencode_v41_GRCh37.gff3.gz", progressbar=True),
)
hg19 = GRCh37

GRCh38 = Genome(
{
"chr1": 248956422,
"chr2": 242193529,
"chr3": 198295559,
"chr4": 190214555,
"chr5": 181538259,
"chr6": 170805979,
"chr7": 159345973,
"chr8": 145138636,
"chr9": 138394717,
"chr10": 133797422,
"chr11": 135086622,
"chr12": 133275309,
"chr13": 114364328,
"chr14": 107043718,
"chr15": 101991189,
"chr16": 90338345,
"chr17": 83257441,
"chr18": 80373285,
"chr19": 58617616,
"chr20": 64444167,
"chr21": 46709983,
"chr22": 50818468,
"chrX": 156040895,
"chrY": 57227415
},
"gencode_v41_GRCh38.gff3.gz",
"gencode_v41_GRCh38.fa.gz",
)

fasta=lambda : datasets().fetch(
"gencode_v41_GRCh38.fa.gz", processor=Decompress(method = "gzip"), progressbar=True),
annotation=lambda : datasets().fetch(
"gencode_v41_GRCh38.gff3.gz", progressbar=True),
chrom_sizes= {"chr1": 248956422, "chr2": 242193529, "chr3": 198295559,
"chr4": 190214555, "chr5": 181538259, "chr6": 170805979,
"chr7": 159345973, "chr8": 145138636, "chr9": 138394717,
"chr10": 133797422, "chr11": 135086622, "chr12": 133275309,
"chr13": 114364328, "chr14": 107043718, "chr15": 101991189,
"chr16": 90338345, "chr17": 83257441, "chr18": 80373285,
"chr19": 58617616, "chr20": 64444167, "chr21": 46709983,
"chr22": 50818468, "chrX": 156040895, "chrY": 57227415},
)
hg38 = GRCh38

GRCm39 = Genome(
{
fasta=lambda : datasets().fetch(
"gencode_vM30_GRCm39.fa.gz", processor=Decompress(method = "gzip"), progressbar=True),
annotation=lambda : datasets().fetch(
"gencode_vM30_GRCm39.gff3.gz", progressbar=True),
chrom_sizes={
"chr1": 195154279,
"chr2": 181755017,
"chr3": 159745316,
Expand All @@ -152,14 +162,15 @@ def fetch_fasta(self):
"chrX": 169476592,
"chrY": 91455967,
},
"gencode_vM30_GRCm39.gff3.gz",
"gencode_vM30_GRCm39.fa.gz",
)

)
mm39 = GRCm39

GRCm38 = Genome(
{
fasta=lambda : datasets().fetch(
"gencode_vM25_GRCm38.fa.gz", processor=Decompress(method = "gzip"), progressbar=True),
annotation=lambda : datasets().fetch(
"gencode_vM25_GRCm38.gff3.gz", progressbar=True),
chrom_sizes={
"chr1": 195471971,
"chr2": 182113224,
"chr3": 160039680,
Expand All @@ -182,8 +193,5 @@ def fetch_fasta(self):
"chrX": 171031299,
"chrY": 91744698,
},
"gencode_vM25_GRCm38.gff3.gz",
"gencode_vM25_GRCm38.fa.gz",
)

)
mm10 = GRCm38
2 changes: 1 addition & 1 deletion snapatac2-python/snapatac2/metrics/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ def tsse(
AAATGAGAGTCCCGCA-1 33.264463
Name: tsse, dtype: float64
"""
gene_anno = gene_anno.fetch_annotations() if isinstance(gene_anno, Genome) else gene_anno
gene_anno = gene_anno.annotation if isinstance(gene_anno, Genome) else gene_anno

if isinstance(adata, list):
result = snapatac2._utils.anndata_par(
Expand Down
2 changes: 1 addition & 1 deletion snapatac2-python/snapatac2/preprocessing/_basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -594,7 +594,7 @@ def make_gene_matrix(
obs: 'n_fragment', 'frac_dup', 'frac_mito'
"""
if isinstance(gene_anno, Genome):
gene_anno = gene_anno.fetch_annotations()
gene_anno = gene_anno.annotation

if inplace:
internal.mk_gene_matrix(adata, gene_anno, chunk_size, use_x, id_type, None)
Expand Down
2 changes: 1 addition & 1 deletion snapatac2-python/snapatac2/tools/_motif.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ def count_occurrence(query, idx_map, bound):
region_to_idx = dict(map(lambda x: (x[1], x[0]), enumerate(all_regions)))

logging.info("Fetching {} sequences ...".format(len(all_regions)))
genome = genome_fasta.fetch_fasta() if isinstance(genome_fasta, Genome) else str(genome_fasta)
genome = genome_fasta.fasta if isinstance(genome_fasta, Genome) else str(genome_fasta)
genome = Fasta(genome, one_based_attributes=False)
sequences = [fetch_seq(genome, region) for region in all_regions]

Expand Down
4 changes: 2 additions & 2 deletions snapatac2-python/snapatac2/tools/_network.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ def init_network_from_annotation(
regulatory domains.
"""
if isinstance(anno_file, Genome):
anno_file = anno_file.fetch_annotations()
anno_file = anno_file.annotation

region_added = {}
graph = rx.PyDiGraph()
Expand Down Expand Up @@ -252,7 +252,7 @@ def add_tf_binding(

regions = [(i, network[i].id) for i in network.node_indices() if network[i].type == "region"]
logging.info("Fetching {} sequences ...".format(len(regions)))
genome = genome_fasta.fetch_fasta() if isinstance(genome_fasta, Genome) else str(genome_fasta)
genome = genome_fasta.fasta if isinstance(genome_fasta, Genome) else str(genome_fasta)
genome = Fasta(genome, one_based_attributes=False)
sequences = [fetch_seq(genome, region) for _, region in regions]

Expand Down

0 comments on commit 6455ab3

Please sign in to comment.