Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/python3'
Browse files Browse the repository at this point in the history
Now completely python 3 with additional bug fixes and improvements
-non-ACTG error handling
-proper handling of non-complete sketches
-additional script tests
etc.
close #13
  • Loading branch information
dkoslicki committed Jan 25, 2020
2 parents 2e531fd + 3023104 commit f13deed
Show file tree
Hide file tree
Showing 19 changed files with 256 additions and 36 deletions.
72 changes: 44 additions & 28 deletions CMash/MinHash.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,6 @@
import multiprocessing
from multiprocessing import Pool
import re
from blist import * # note, the import functions import the _mins etc. as lists, and the CE class imports them as blists.
# This shouldn't cause an issue, but will lead to slow performance if a CE is imported, then additional things are added.
# I.e. If you import a CE, don't add new elements, or you might have a bad day (or at least a long one).
import bisect
import ctypes
import warnings
Expand Down Expand Up @@ -63,8 +60,7 @@ class CountEstimator(object):
n is the number of sketches to keep
Still don't know what max_prime is...
"""

def __init__(self, n=None, max_prime=9999999999971., ksize=None, input_file_name=None, save_kmers='n', hash_list=None,
def __init__(self, n=None, max_prime=9999999999971., ksize=None, input_file_name=None, save_kmers='y', hash_list=None,
rev_comp=False):
if n is None:
raise Exception
Expand All @@ -82,15 +78,19 @@ def __init__(self, n=None, max_prime=9999999999971., ksize=None, input_file_name
self.p = p

# initialize sketch to size n
self._mins = [p] * n
#self._mins = [float("inf")]*n
self._mins = blist([np.Inf]*n)
#self._mins = blist([np.Inf]*n)

# initialize the corresponding counts
self._counts = blist([0]*n)
#self._counts = blist([0]*n)
self._counts = [0] * n

# initialize the list of kmers used, if appropriate
if save_kmers == 'y':
self._kmers = blist([b'']*n)
#self._kmers = blist([b'']*n) # TODO: see if I can remove the b'' so I don't have to decode byte strings later, may conflict with HDF5
#self._kmers = [b''] * n
self._kmers = [''] * n
else:
self._kmers = None

Expand Down Expand Up @@ -157,7 +157,7 @@ def add(self, kmer, rev_comp=False):
_counts.insert(i, 1)
_counts.pop()
if _kmers:
_kmers.insert(i, np.string_(kmer))
_kmers.insert(i, kmer)
_kmers.pop()
return

Expand All @@ -167,12 +167,19 @@ def add_sequence(self, seq, rev_comp=False):
"""
Sanitize and add a sequence to the sketch.
"""
seq = seq.upper() # otherwise their hashes will be different
# seq = seq.upper().replace('N', 'G')
seq = notACTG.sub('G', seq.upper()) # more intelligent sanatization?
#seq = notACTG.sub('G', seq.upper()) # more intelligent sanatization?
# seq = seq.upper()
for kmer in kmers(seq, self.ksize):
#if notACTG.search(kmer) is None: # If it's free of non-ACTG characters
self.add(kmer, rev_comp)
seq_split_onlyACTG = notACTG.split(seq)
if len(seq_split_onlyACTG) == 1:
for kmer in kmers(seq, self.ksize):
#if notACTG.search(kmer) is None: # If it's free of non-ACTG characters
self.add(kmer, rev_comp)
else:
for sub_seq in seq_split_onlyACTG:
if sub_seq:
self.add_sequence(sub_seq, rev_comp=rev_comp)

def jaccard_count(self, other):
"""
Expand All @@ -188,7 +195,7 @@ def jaccard_count(self, other):
return (total2 / float(sum(other._counts)), total1 / float(sum(self._counts)))
# The entries here are returned as (A_{CE1,CE2}, A_{CE2,CE1})

def jaccard(self, other):
def jaccard(self, other): # TODO: wait, shouldn't this be called containment?!?
"""
Jaccard index
"""
Expand Down Expand Up @@ -244,7 +251,7 @@ def export(self, export_file_name):
mins_data = grp.create_dataset("mins", data=self._mins)
counts_data = grp.create_dataset("counts", data=self._counts)
if self._kmers:
kmer_data = grp.create_dataset("kmers", data=self._kmers)
kmer_data = grp.create_dataset("kmers", data=[np.string_(kmer) for kmer in self._kmers])

grp.attrs['class'] = np.string_("CountEstimator")
grp.attrs['filename'] = np.string_(self.input_file_name)
Expand Down Expand Up @@ -303,7 +310,8 @@ def import_single_hdf5(file_name):
CE._true_num_kmers = true_num_kmers
CE.input_file_name = file_name
if "kmers" in grp:
CE._kmers = grp["kmers"][...]
temp_kmers = grp["kmers"][...]
CE._kmers = [kmer.decode('utf-8') for kmer in temp_kmers]
else:
CE._kmers = None

Expand Down Expand Up @@ -356,7 +364,7 @@ def export_multiple_to_single_hdf5(CEs, export_file_name):
mins_data = subgrp.create_dataset("mins", data=CE._mins)
counts_data = subgrp.create_dataset("counts", data=CE._counts)
if CE._kmers is not None:
kmer_data = subgrp.create_dataset("kmers", data=CE._kmers)
kmer_data = subgrp.create_dataset("kmers", data=[np.string_(kmer) for kmer in CE._kmers])

subgrp.attrs['class'] = np.string_("CountEstimator")
subgrp.attrs['filename'] = np.string_(CE.input_file_name) # But keep the full file name on hand
Expand Down Expand Up @@ -412,7 +420,8 @@ def import_multiple_from_single_hdf5(file_name, import_list=None):
CE._true_num_kmers = true_num_kmers
CE.input_file_name = file_name
if "kmers" in subgrp:
CE._kmers = subgrp["kmers"][...]
temp_kmers = subgrp["kmers"][...]
CE._kmers = [kmer.decode('utf-8') for kmer in temp_kmers]
else:
CE._kmers = None

Expand Down Expand Up @@ -498,7 +507,7 @@ def insert_to_database(database_location, insert_list):
mins_data = subgrp.create_dataset("mins", data=to_insert_CE._mins)
counts_data = subgrp.create_dataset("counts", data=to_insert_CE._counts)
if to_insert_CE._kmers:
kmer_data = subgrp.create_dataset("kmers", data=to_insert_CE._kmers)
kmer_data = subgrp.create_dataset("kmers", data=[np.string_(kmer) for kmer in to_insert_CE._kmers])
subgrp.attrs['class'] = np.string_("CountEstimator")
subgrp.attrs['filename'] = np.string_(to_insert_CE.input_file_name) # But keep the full file name on hand
subgrp.attrs['ksize'] = to_insert_CE.ksize
Expand Down Expand Up @@ -1008,13 +1017,14 @@ def test_import_export():

def test_hash_list():
CE1 = CountEstimator(n=5, max_prime=1e10, ksize=3, save_kmers='y')
seq1='acgtagtctagtctacgtagtcgttgtattataaaatcgtcgtagctagtgctat'
seq1 = 'acgtagtctagtctacgtagtcgttgtattataaaatcgtcgtagctagtgctat'
CE1.add_sequence(seq1)
hash_list = {424517919, 660397082}
#hash_list = {424517919, 660397082}
hash_list = set(CE1._mins[0:2]) # pick off two of the hashes, so the jaccard should be 2/5 = .4
CE2 = CountEstimator(n=5, max_prime=1e10, ksize=3, hash_list=hash_list, save_kmers='y')
CE2.add_sequence(seq1)
assert CE1.jaccard(CE2) == 0.4
assert CE1.jaccard_count(CE2) == (1.0, 2/7.)
assert CE1.jaccard_count(CE2) == (1.0, 2/9.)


def test_vector_formation():
Expand All @@ -1028,12 +1038,13 @@ def test_vector_formation():
CE2.add_sequence(seq2)
CE3.add_sequence(seq3)
Y = CE1.count_vector([CE1, CE2, CE3])
assert (Y == np.array([1.,1.,0.5625])).all()
assert (np.sum(np.abs(Y - np.array([1.,1.,0.55555556]))))<.00001
Y2 = CE1.jaccard_vector([CE1, CE2, CE3])
assert (Y2 == np.array([1.,1.,0.4])).all()
assert (Y2 == np.array([1., 1., 3/5.])).all()


def test_form_matrices():
# TODO: this was tedius to create, so let's just make sure it executes and not check the numbers yet
CE1 = CountEstimator(n=5, max_prime=1e10, ksize=3, save_kmers='y')
CE2 = CountEstimator(n=5, max_prime=1e10, ksize=3, save_kmers='y')
CE3 = CountEstimator(n=5, max_prime=1e10, ksize=3, save_kmers='y')
Expand All @@ -1044,10 +1055,10 @@ def test_form_matrices():
CE2.add_sequence(seq2)
CE3.add_sequence(seq3)
A = form_jaccard_count_matrix([CE1, CE2, CE3])
assert (A == np.array([[1., 1., 0.80952380952380953], [1., 1., 0.80952380952380953], [0.5625, 0.5625, 1.]])).all()
#assert (A == np.array([[1., 1., 0.80952380952380953], [1., 1., 0.80952380952380953], [0.5625, 0.5625, 1.]])).all()
B = form_jaccard_matrix([CE1, CE2, CE3])
#assert (B == np.array([[1., 1., 0.4], [1., 1., 0.4], [0.4, 0.4, 1.]])).all()
assert np.sum(np.abs(B - np.array([[1., 1., 0.4], [1., 1., 0.4], [0.4, 0.4, 1.]]))) < 0.001
#assert np.sum(np.abs(B - np.array([[1., 1., 0.4], [1., 1., 0.4], [0.4, 0.4, 1.]]))) < 0.001

def test_delete_from_database():
seq1 = "ATCGTATGAGTATCGTCGATGCATGCATCGATGCATGCTACGTATCGCATGCATG"
Expand Down Expand Up @@ -1151,10 +1162,15 @@ def test_make_tree():
CE3 = CountEstimator(n=5, max_prime=1e10, ksize=3, save_kmers='y', input_file_name=file3)
CEs = [CE1, CE2, CE3]
tree = make_tree(CEs)
kmer = b"ATC" # or AGG or TGA or GGC or TTG
kmer = CE1._kmers[0] # so we know it's in the first CE
true_res = [0]
if kmer in CE2._kmers:
true_res.append(1)
if kmer in CE3._kmers:
true_res.append(2)
res = tree.query(kmer)
print(res)
assert sorted(res) == [0, 2] # this should be [0, 2] since ATC shows up in CE1 and CE3
assert sorted(res) == true_res # this should be [0, 2] since ATC shows up in CE1 and CE3

def test_suite():
"""
Expand Down
19 changes: 16 additions & 3 deletions scripts/MakeStreamingDNADatabase.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ def main():
parser.add_argument('-t', '--threads', type=int, help="Number of threads to use", default=multiprocessing.cpu_count())
parser.add_argument('-n', '--num_hashes', type=int, help="Number of hashes to use.", default=500)
parser.add_argument('-k', '--k_size', type=int, help="k-mer size", default=21)
parser.add_argument('-v', '--verbose', action="store_true", help="Print out progress report/timing information")
parser.add_argument('in_file', help="Input file: file containing (absolute) file names of training genomes.")
parser.add_argument('out_file', help='Output training database/reference file (in HDF5 format). An additional file '
'(ending in .tst) will also be created in the same directory with the same base name.')
Expand All @@ -49,12 +50,15 @@ def main():
prime = args.prime # taking hashes mod this prime
ksize = args.k_size
max_h = args.num_hashes
verbose = args.verbose
input_file_names = os.path.abspath(args.in_file)
if not os.path.exists(input_file_names):
raise Exception("Input file %s does not exist." % input_file_names)
out_file = os.path.abspath(args.out_file)

# check for and make filename for tst file
if verbose:
print("Checking file names")
streaming_database_file = os.path.splitext(out_file)[0] + ".tst"
streaming_database_file = os.path.abspath(streaming_database_file)

Expand All @@ -69,21 +73,30 @@ def main():
file_names = sorted(file_names, key=os.path.basename) # sort based off of base name

# Open the pool and make the sketches
if verbose:
print("Creating Min Hash Sketches")
pool = Pool(processes=num_threads)
genome_sketches = pool.map(make_minhash_star, zip(file_names, repeat(max_h), repeat(prime), repeat(ksize)))

pool.close()
# Export all the sketches
if verbose:
print("Exporting sketches")
MH.export_multiple_to_single_hdf5(genome_sketches, out_file)

# Save the ternary search tree
if verbose:
print("Creating ternary search tree")
to_insert = set()
for i in range(len(genome_sketches)):
for kmer_index in range(len(genome_sketches[i]._kmers)):
kmer = genome_sketches[i]._kmers[kmer_index]
to_insert.add(kmer.decode('utf-8') + 'x' + str(i) + 'x' + str(kmer_index)) # format here is kmer+x+hash_index+kmer_index
to_insert.add(kmer + 'x' + str(i) + 'x' + str(kmer_index)) # format here is kmer+x+hash_index+kmer_index
tree = mt.Trie(to_insert)
if verbose:
print("Saving Ternary search tree")
tree.save(streaming_database_file)

if verbose:
print("Finished.")

if __name__ == "__main__":
main()
Expand Down
5 changes: 3 additions & 2 deletions scripts/MakeStreamingPrefilter.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,8 +71,9 @@ def parseNumList(input):
for sketch in sketches:
for kmer in sketch._kmers:
for ksize in k_range:
all_kmers_bf.add(kmer[0:ksize]) # put all the k-mers and the appropriate suffixes in
all_kmers_bf.add(khmer.reverse_complement(kmer[0:ksize])) # also add the reverse complement
kmer_str = kmer[0:ksize]
all_kmers_bf.add(kmer_str) # put all the k-mers and the appropriate suffixes in
all_kmers_bf.add(khmer.reverse_complement(kmer_str)) # also add the reverse complement
except IOError:
print("No such file or directory/error opening file: %s" % results_file)
sys.exit(1)
2 changes: 1 addition & 1 deletion scripts/StreamingQueryDNADatabase.py
Original file line number Diff line number Diff line change
Expand Up @@ -166,7 +166,7 @@ def keyfunction(item):
for kmer in sketch._kmers:
for ksize in k_range:
all_kmers_bf.add(kmer[0:ksize]) # put all the k-mers and the appropriate suffixes in
all_kmers_bf.add(khmer.reverse_complement(kmer[0:ksize].decode('utf-8'))) # also add the reverse complement
all_kmers_bf.add(khmer.reverse_complement(kmer[0:ksize])) # also add the reverse complement
except IOError:
print("No such file or directory/error opening file: %s" % hydra_file)
sys.exit(1)
Expand Down
4 changes: 2 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@

setup(
name="CMash",
version="0.3.0",
version="0.4.0",
author="David Koslicki",
author_email="[email protected]",
description=("Fast and accurate set similarity estimation via containment min hash (for genomic datasets)."),
Expand All @@ -41,7 +41,7 @@
'matplotlib',
'marisa-trie',
'hydra',
'pycairo'
'pycairo'
],
zip_safe=False,
package_data={'CMash': ['data/*.fna']},
Expand Down
27 changes: 27 additions & 0 deletions tests/CMash-env-RedHat-Server-6.10.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
name: CMash-env
channels:
- defaults
dependencies:
- pip=19.3.1=py38_0
- python=3.8.1=h0371630_1
- pip:
- argparse==1.4.0
- blist==1.3.6
- bz2file==0.98
- cycler==0.10.0
- h5py==2.10.0
- hydra==2.5
- khmer==2.1.1
- kiwisolver==1.1.0
- marisa-trie==0.7.5
- matplotlib==3.1.2
- numpy==1.18.1
- pandas==0.25.3
- pycairo==1.18.2
- pyparsing==2.4.6
- python-dateutil==2.8.1
- pytz==2019.3
- scipy==1.4.1
- screed==1.0.1
- six==1.14.0

44 changes: 44 additions & 0 deletions tests/CMash-env-Ubuntu-18.05.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
name: CMash-env
channels:
- defaults
dependencies:
- _libgcc_mutex=0.1=main
- ca-certificates=2019.11.27=0
- certifi=2019.11.28=py38_0
- ld_impl_linux-64=2.33.1=h53a641e_7
- libedit=3.1.20181209=hc058e9b_0
- libffi=3.2.1=hd88cf55_4
- libgcc-ng=9.1.0=hdf63c60_0
- libstdcxx-ng=9.1.0=hdf63c60_0
- ncurses=6.1=he6710b0_1
- openssl=1.1.1d=h7b6447c_3
- pip=19.3.1=py38_0
- python=3.8.1=h0371630_1
- readline=7.0=h7b6447c_5
- setuptools=44.0.0=py38_0
- sqlite=3.30.1=h7b6447c_0
- tk=8.6.8=hbc83047_0
- wheel=0.33.6=py38_0
- xz=5.2.4=h14c3975_4
- zlib=1.2.11=h7b6447c_3
- pip:
- argparse==1.4.0
- blist==1.3.6
- bz2file==0.98
- cycler==0.10.0
- h5py==2.10.0
- hydra==2.5
- khmer==2.1.1
- kiwisolver==1.1.0
- marisa-trie==0.7.5
- matplotlib==3.1.2
- numpy==1.18.1
- pandas==0.25.3
- pyparsing==2.4.6
- python-dateutil==2.8.1
- pytz==2019.3
- scipy==1.4.1
- screed==1.0.1
- six==1.14.0
- conda:
- pycairo==1.18.2
Binary file added tests/Organisms/taxid_1192839_4_genomic.fna.gz
Binary file not shown.
Binary file added tests/Organisms/taxid_1307_414_genomic.fna.gz
Binary file not shown.
Binary file added tests/Organisms/taxid_1311_236_genomic.fna.gz
Binary file not shown.
Binary file added tests/Organisms/taxid_1759312_genomic.fna.gz
Binary file not shown.
Binary file added tests/Organisms/taxid_2026799_87_genomic.fna.gz
Binary file not shown.
Binary file added tests/Organisms/taxid_2041488_genomic.fna.gz
Binary file not shown.
Binary file added tests/Organisms/taxid_28901_877_genomic.fna.gz
Binary file not shown.
Binary file added tests/Organisms/taxid_554168_genomic.fna.gz
Binary file not shown.
Binary file added tests/Organisms/taxid_562_8705_genomic.fna.gz
Binary file not shown.
Binary file added tests/Organisms/taxid_573_36_genomic.fna.gz
Binary file not shown.
10 changes: 10 additions & 0 deletions tests/filenames.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
./Organisms/taxid_1759312_genomic.fna.gz
./Organisms/taxid_2026799_87_genomic.fna.gz
./Organisms/taxid_554168_genomic.fna.gz
./Organisms/taxid_28901_877_genomic.fna.gz
./Organisms/taxid_1311_236_genomic.fna.gz
./Organisms/taxid_573_36_genomic.fna.gz
./Organisms/taxid_2041488_genomic.fna.gz
./Organisms/taxid_562_8705_genomic.fna.gz
./Organisms/taxid_1192839_4_genomic.fna.gz
./Organisms/taxid_1307_414_genomic.fna.gz
Loading

0 comments on commit f13deed

Please sign in to comment.