Skip to content

Commit

Permalink
Nicer imports
Browse files Browse the repository at this point in the history
  • Loading branch information
andrewdalpino committed Oct 16, 2024
1 parent 599e6cd commit cdbc251
Show file tree
Hide file tree
Showing 6 changed files with 19 additions and 16 deletions.
7 changes: 4 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# DNA Hash

A datastructure and tokenization library for counting short DNA sequences for use in Bioinformatics. DNA Hash stores k-mer sequence counts by their up2bit encoding - a two-way hash that works with variable-length sequences. As such, DNA Hash uses considerably less memory than a lookup table that stores sequences in plaintext. In addition, DNA Hash's novel autoscaling Bloom filter eliminates the need to explicitly store counts for sequences that have only been seen once.
A specialized data structure and tokenization library for counting short DNA sequences for use in Bioinformatics. DNA Hash stores k-mer sequence counts by their up2bit encoding - a two-way hash that works with variable-length sequences. As such, DNA Hash uses considerably less memory than a lookup table that stores sequences in plaintext. In addition, DNA Hash's novel autoscaling Bloom filter eliminates the need to explicitly store counts for sequences that have only been seen once.

- **Variable** sequence lengths
- **Ultra-low** memory footprint
Expand All @@ -21,14 +21,15 @@ pip install dnahash
## Example Usage

```python
from dna_hash import DNAHash, tokenizers
from dna_hash import DNAHash
from dna_hash.tokenizers import Kmer, Canonical

from Bio import SeqIO
from matplotlib import pyplot as plt

hash_table = DNAHash(max_false_positive_rate=0.001)

tokenizer = tokenizers.Canonical(tokenizers.Kmer(6))
tokenizer = Canonical(Kmer(6))

with open('covid-19-virus.fasta', 'r') as file:
for record in SeqIO.parse(file, 'fasta'):
Expand Down
5 changes: 3 additions & 2 deletions examples/histogram.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
from dna_hash import DNAHash, tokenizers
from dna_hash import DNAHash
from dna_hash.tokenizers import Kmer, Canonical

from Bio import SeqIO
from matplotlib import pyplot as plt

hash_table = DNAHash(max_false_positive_rate=0.001)

tokenizer = tokenizers.Canonical(tokenizers.Kmer(6))
tokenizer = Canonical(Kmer(6))

with open('covid-19-virus.fasta', 'r') as file:
for record in SeqIO.parse(file, 'fasta'):
Expand Down
5 changes: 3 additions & 2 deletions examples/top_k.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
from dna_hash import DNAHash, tokenizers
from dna_hash import DNAHash
from dna_hash.tokenizers import Kmer, Canonical

from Bio import SeqIO

hash_table = DNAHash(max_false_positive_rate=0.001)

tokenizer = tokenizers.Canonical(tokenizers.Kmer(6))
tokenizer = Canonical(Kmer(6))

with open('covid-19-virus.fasta', 'r') as file:
for record in SeqIO.parse(file, 'fasta'):
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

[project]
name = "DNAHash"
description = "A datastructure and tokenization library for counting short DNA sequences for use in Bioinformatics."
description = "A specialized data structure and tokenization library for counting short DNA sequences for use in Bioinformatics."
version = "0.0.1"
requires-python = ">= 3.10"
dependencies = [
Expand Down
8 changes: 4 additions & 4 deletions tests/test_dna_hash.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import unittest
import random

import dna_hash
from dna_hash import DNAHash

class TestDNAHash(unittest.TestCase):
BASES = ['A', 'C', 'T', 'G']
Expand All @@ -11,7 +11,7 @@ def random_read(cls, k: int) -> str:
return ''.join(cls.BASES[random.randint(0, 3)] for i in range(0, k))

def test_increment(self):
hash_table = dna_hash.DNAHash()
hash_table = DNAHash()

self.assertEqual(hash_table.num_singletons, 0)
self.assertEqual(hash_table.num_sequences, 0)
Expand All @@ -35,7 +35,7 @@ def test_increment(self):
self.assertEqual(hash_table.argmax(), 'ACTG')

def test_top_k(self):
hash_table = dna_hash.DNAHash()
hash_table = DNAHash()

hash_table['CTGA'] = 1
hash_table['ACTG'] = 10
Expand All @@ -54,7 +54,7 @@ def test_top_k(self):
def test_advanced(self):
random.seed(1)

hash_table = dna_hash.DNAHash()
hash_table = DNAHash()

for i in range(0, 100000):
hash_table.increment(self.random_read(8))
Expand Down
8 changes: 4 additions & 4 deletions tests/test_tokenizers.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
import unittest

from dna_hash import tokenizers
from dna_hash.tokenizers import Kmer, Canonical, Fragment

class TestKmer(unittest.TestCase):
def test_tokenize(self):
tokenizer = tokenizers.Kmer(k=6)
tokenizer = Kmer(k=6)

tokens = tokenizer.tokenize('CGGTTCAGCANG')

Expand All @@ -17,7 +17,7 @@ def test_tokenize(self):

class TestCanonical(unittest.TestCase):
def test_tokenize(self):
tokenizer = tokenizers.Canonical(tokenizers.Kmer(k=6))
tokenizer = Canonical(Kmer(k=6))

tokens = tokenizer.tokenize('CGGTTCAGCANG')

Expand All @@ -28,7 +28,7 @@ def test_tokenize(self):

class TestFragment(unittest.TestCase):
def test_tokenize(self):
tokenizer = tokenizers.Fragment(n=4)
tokenizer = Fragment(n=4)

tokens = tokenizer.tokenize('CGGTTCAGCANGTAAT')

Expand Down

0 comments on commit cdbc251

Please sign in to comment.