Skip to content

Commit

Permalink
Modify genome class to allow for custom genome annotations (#185)
Browse files Browse the repository at this point in the history
Co-authored-by: Matthew Schmitz <[email protected]>
  • Loading branch information
mtvector and Matthew Schmitz authored Nov 4, 2023
1 parent dfeff6b commit dc41174
Showing 1 changed file with 53 additions and 8 deletions.
61 changes: 53 additions & 8 deletions snapatac2-python/snapatac2/genome.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,65 @@
from snapatac2.datasets import datasets
from pooch import Decompress
import shutil

class Genome:
def __init__(self, chrom_sizes, annotation_filename, fasta = None) -> None:
self.chrom_sizes = chrom_sizes
self._annotation_filename = annotation_filename
self._fasta_filename = fasta
self.annotation_filename = annotation_filename
self.fasta_filename = fasta

def fetch_annotations(self):
return datasets().fetch(self._annotation_filename, progressbar=True)
#custom pooch downloader for local files
def local_cp(self,url, output_file, pooch):
shutil.copy(url,output_file)

def is_gz_file(self,filepath):
if os.path.exists(filepath):
return False
with open(filepath, 'rb') as test_f:
return test_f.read(2) == b'\x1f\x8b'

#You need to run this to make your custom assembly accessible downstream
#Provide a prefix if you are using annotations from cellranger
#where the files are named unidentifiably (genome.fa,genes.gtf.gz, etc)
#Adds the file to the pooch registry and gives it a new name if prefix is provided
def add_custom_genome(self, genome_prefix='',annotation_prefix=''):
self.fasta_cache_filename=genome_prefix+os.path.basename(self.fasta_filename)
self.annotation_cache_filename=annotation_prefix+os.path.basename(self.annotation_filename)
snap.datasets.datasets().registry[self.fasta_cache_filename]='sha256:'+pooch.file_hash(self.fasta_filename,alg='sha256')
snap.datasets.datasets().urls[self.fasta_cache_filename]=self.fasta_filename
snap.datasets.datasets().registry[self.annotation_cache_filename]='sha256:'+pooch.file_hash(self.annotation_filename,alg='sha256')
snap.datasets.datasets().urls[self.annotation_cache_filename]=self.annotation_filename

#Works by tricking pooch into using local_cp if the file isn't in the registry
def fetch_file(self,key_name,file_name):
if self.is_gz_file(file_name):
decomp=Decompress(method="gzip")
else:
decomp=None

try:
fetched = datasets().fetch(
key_name,
processor=decomp,
progressbar=True
)
except ValueError:
try:
fetched = datasets().fetch(
key_name,
processor=decomp,
progressbar=True,
downloader= self.local_cp
)
except Exception as e:
raise RuntimeError("The file can't be obtained.") from e
return fetched

def fetch_annotations(self):
return self.fetch_file(self.annotation_cache_filename,self.annotation_filename)

def fetch_fasta(self):
return datasets().fetch(
self._fasta_filename,
processor=Decompress(method = "gzip"),
progressbar=True)
return self.fetch_file(self.fasta_cache_filename,self.fasta_filename)

GRCh37 = Genome(
{
Expand Down

0 comments on commit dc41174

Please sign in to comment.