Skip to content

Commit

Permalink
Nicer imports
Browse files Browse the repository at this point in the history
  • Loading branch information
andrewdalpino committed Oct 16, 2024
1 parent 599e6cd commit cdbc251
Show file tree
Hide file tree
Showing 6 changed files with 19 additions and 16 deletions.
7 changes: 4 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# DNA Hash

A datastructure and tokenization library for counting short DNA sequences for use in Bioinformatics. DNA Hash stores k-mer sequence counts by their up2bit encoding - a two-way hash that works with variable-length sequences. As such, DNA Hash uses considerably less memory than a lookup table that stores sequences in plaintext. In addition, DNA Hash's novel autoscaling Bloom filter eliminates the need to explicitly store counts for sequences that have only been seen once.
A specialized data structure and tokenization library for counting short DNA sequences for use in Bioinformatics. DNA Hash stores k-mer sequence counts by their up2bit encoding - a two-way hash that works with variable-length sequences. As such, DNA Hash uses considerably less memory than a lookup table that stores sequences in plaintext. In addition, DNA Hash's novel autoscaling Bloom filter eliminates the need to explicitly store counts for sequences that have only been seen once.

- **Variable** sequence lengths
- **Ultra-low** memory footprint
Expand All @@ -21,14 +21,15 @@ pip install dnahash
## Example Usage

```python
from dna_hash import DNAHash, tokenizers
from dna_hash import DNAHash
from dna_hash.tokenizers import Kmer, Canonical

from Bio import SeqIO
from matplotlib import pyplot as plt

hash_table = DNAHash(max_false_positive_rate=0.001)

tokenizer = tokenizers.Canonical(tokenizers.Kmer(6))
tokenizer = Canonical(Kmer(6))

with open('covid-19-virus.fasta', 'r') as file:
for record in SeqIO.parse(file, 'fasta'):
Expand Down
5 changes: 3 additions & 2 deletions examples/histogram.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
from dna_hash import DNAHash, tokenizers
from dna_hash import DNAHash
from dna_hash.tokenizers import Kmer, Canonical

from Bio import SeqIO
from matplotlib import pyplot as plt

hash_table = DNAHash(max_false_positive_rate=0.001)

tokenizer = tokenizers.Canonical(tokenizers.Kmer(6))
tokenizer = Canonical(Kmer(6))

with open('covid-19-virus.fasta', 'r') as file:
for record in SeqIO.parse(file, 'fasta'):
Expand Down
5 changes: 3 additions & 2 deletions examples/top_k.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
from dna_hash import DNAHash, tokenizers
from dna_hash import DNAHash
from dna_hash.tokenizers import Kmer, Canonical

from Bio import SeqIO

hash_table = DNAHash(max_false_positive_rate=0.001)

tokenizer = tokenizers.Canonical(tokenizers.Kmer(6))
tokenizer = Canonical(Kmer(6))

with open('covid-19-virus.fasta', 'r') as file:
for record in SeqIO.parse(file, 'fasta'):
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

[project]
name = "DNAHash"
description = "A datastructure and tokenization library for counting short DNA sequences for use in Bioinformatics."
description = "A specialized data structure and tokenization library for counting short DNA sequences for use in Bioinformatics."
version = "0.0.1"
requires-python = ">= 3.10"
dependencies = [
Expand Down
8 changes: 4 additions & 4 deletions tests/test_dna_hash.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import unittest
import random

import dna_hash
from dna_hash import DNAHash

class TestDNAHash(unittest.TestCase):
BASES = ['A', 'C', 'T', 'G']
Expand All @@ -11,7 +11,7 @@ def random_read(cls, k: int) -> str:
return ''.join(cls.BASES[random.randint(0, 3)] for i in range(0, k))

def test_increment(self):
hash_table = dna_hash.DNAHash()
hash_table = DNAHash()

self.assertEqual(hash_table.num_singletons, 0)
self.assertEqual(hash_table.num_sequences, 0)
Expand All @@ -35,7 +35,7 @@ def test_increment(self):
self.assertEqual(hash_table.argmax(), 'ACTG')

def test_top_k(self):
hash_table = dna_hash.DNAHash()
hash_table = DNAHash()

hash_table['CTGA'] = 1
hash_table['ACTG'] = 10
Expand All @@ -54,7 +54,7 @@ def test_top_k(self):
def test_advanced(self):
random.seed(1)

hash_table = dna_hash.DNAHash()
hash_table = DNAHash()

for i in range(0, 100000):
hash_table.increment(self.random_read(8))
Expand Down
8 changes: 4 additions & 4 deletions tests/test_tokenizers.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
import unittest

from dna_hash import tokenizers
from dna_hash.tokenizers import Kmer, Canonical, Fragment

class TestKmer(unittest.TestCase):
def test_tokenize(self):
tokenizer = tokenizers.Kmer(k=6)
tokenizer = Kmer(k=6)

tokens = tokenizer.tokenize('CGGTTCAGCANG')

Expand All @@ -17,7 +17,7 @@ def test_tokenize(self):

class TestCanonical(unittest.TestCase):
def test_tokenize(self):
tokenizer = tokenizers.Canonical(tokenizers.Kmer(k=6))
tokenizer = Canonical(Kmer(k=6))

tokens = tokenizer.tokenize('CGGTTCAGCANG')

Expand All @@ -28,7 +28,7 @@ def test_tokenize(self):

class TestFragment(unittest.TestCase):
def test_tokenize(self):
tokenizer = tokenizers.Fragment(n=4)
tokenizer = Fragment(n=4)

tokens = tokenizer.tokenize('CGGTTCAGCANGTAAT')

Expand Down

0 comments on commit cdbc251

Please sign in to comment.