From 699820306e0b80d004d92b2757e2169ecca26632 Mon Sep 17 00:00:00 2001 From: James Gilbert Date: Tue, 29 Oct 2024 16:52:37 +0000 Subject: [PATCH] FASTA indexing --- src/tola/fasta/__init__.py | 0 src/tola/fasta/index.py | 158 ++++++++ tests/fasta/test.fa | 740 +++++++++++++++++++++++++++++++++++++ tests/fasta/test.fa.fai | 100 +++++ tests/fasta_test.py | 24 ++ tests/fragment_test.py | 1 + 6 files changed, 1023 insertions(+) create mode 100644 src/tola/fasta/__init__.py create mode 100644 src/tola/fasta/index.py create mode 100644 tests/fasta/test.fa create mode 100644 tests/fasta/test.fa.fai create mode 100644 tests/fasta_test.py diff --git a/src/tola/fasta/__init__.py b/src/tola/fasta/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/tola/fasta/index.py b/src/tola/fasta/index.py new file mode 100644 index 0000000..e20408a --- /dev/null +++ b/src/tola/fasta/index.py @@ -0,0 +1,158 @@ +#!/usr/bin/env python3 + +import io +import re +import sys +from pathlib import Path + + +class FastaInfo: + __slots__ = ( + "name", + "length", + "file_offset", + "residues_per_line", + "max_line_length", + "seq_regions", + ) + + def __init__( + self, + name, + length, + file_offset, + residues_per_line, + max_line_length, + seq_regions=None, + ): + self.name = name + self.length = length + self.file_offset = file_offset + self.residues_per_line = residues_per_line + self.max_line_length = max_line_length + self.seq_regions = seq_regions + + def fai_row(self): + """Returns a row for a Fasta Index (.fai) file.""" + numbers = "\t".join( + str(x) + for x in ( + self.length, + self.file_offset, + self.residues_per_line, + self.max_line_length, + ) + ) + return f"{self.name}\t{numbers}\n" + + def regions(self): + s = io.StringIO() + for start, end in self.seq_regions: + s.write(f"{end - start + 1:14,d} {self.name}:{start}-{end}\n") + + return s.getvalue() + + +def index_fasta_bytes(file: Path, buffer_size: int = 10e6): + name = None + seq_length = None + file_offset 
= None + residues_per_line = None + region_start = None + region_end = None + seq_regions = None + line_end_bytes = None + seq_buffer = io.BytesIO() + + info = [] + + # Opening the file in bytes mode means that Windows ("\r\n") or UNIX + # ("\n") line endings are preserved. It is also about 10% faster than + # decoding to UTF-8. + with file.open("rb") as fh: + for line in fh: + # ord(">") == 62 + if line[0] == 62: + # If this isn't the first sequence in the file, store the + # accumulated data from the previous sequence. + if name: + if region_end: + seq_regions.append((region_start + 1, region_end)) + info.append( + FastaInfo( + name, + seq_length, + file_offset, + residues_per_line, + residues_per_line + line_end_bytes, + seq_regions, + ) + ) + + # Get new name by splitting on whitespace beyond the first + # character and taking the first element of the array. This + # also allows space characters following the ">" character of + # the header. + name = line[1:].split()[0].decode("utf8") + if not name: + msg = f"Failed to parse sequence name from line:\n{line}" + raise ValueError(msg) + + # Reset variables for new sequence + seq_length = 0 + residues_per_line = 0 + region_start = 0 + region_end = None + seq_regions = [] + + + # The first residue of the sequence will be where the file + # pointer now is. + file_offset = fh.tell() + + # We assume each sequence entry will have the same line + # endings. Check for Windows "\r\n" line ending where the + # second to last byte will be ord("\r") == 13 + line_end_bytes = 2 if line[-2] == 13 else 1 + else: + residues = len(line) - line_end_bytes + if residues > residues_per_line: + residues_per_line = residues + + # Treat any non-ACGT character as an "N" (i.e. 
gap) + for m in re.finditer(rb"[ACGTacgt]+", line[:-line_end_bytes]): + start = seq_length + m.start() + end = seq_length + m.end() + if start == region_end: + region_end = end + else: + if region_end: + seq_regions.append((region_start + 1, region_end)) + region_start = start + region_end = end + + seq_length += residues + if name: + if region_end: + seq_regions.append((region_start + 1, region_end)) + info.append( + FastaInfo( + name, + seq_length, + file_offset, + residues_per_line, + residues_per_line + line_end_bytes, + seq_regions, + ) + ) + + return info + + +if __name__ == "__main__": + for file in sys.argv[1:]: + info = index_fasta_bytes(Path(file)) + for fst in info: + sys.stdout.write("\n") + sys.stdout.write(fst.fai_row()) + sys.stdout.write(fst.regions()) diff --git a/tests/fasta/test.fa b/tests/fasta/test.fa new file mode 100644 index 0000000..740b9b2 --- /dev/null +++ b/tests/fasta/test.fa @@ -0,0 +1,740 @@ +>RAND-001 +nnntgccgttacttctcacggtgacaggctctcaggatccgcccaccctctgctcccaga +gagcaagttacttgaccccacaggcagcgtacacagctaagagagatgatgccagaaaca +cgttccacagggaagcgcaaatcgtcaggagactgtgttcgggcacggagaaannn +>RAND-002 +cgctatcatcttggcacgaaagtatgctagtcacttaagggcgacccgagactattgccc +gatgaaatgtagtntatttacttcgaatacgttacgtccctccttactgggcctatcgnn +nntgcccgataattcctgtagagaacctcagtaattaagggacctttcgaagacccgggt +acagctacgccncgacagncaaggattgcaaacgataagcataatgacaatagtgacctc +ctacccctccatgctttacgattatctgggaactgccatatatcggacctaattaggcgg +tagaagcacatgctacagtccctcacccggaatgaaacccgttgggggaagctat +>RAND-003 +gcggcccggg +atcttgagca +ctgaaaacct +tctctttccg +acccagacaa +g +>RAND-004 +gcgtcgcagtgttctggccaagatcaggcggtaaagtcagtcgcggactc +cagatccaagttcaanacactgtacacagtggctcgctagctttccatgt +agtagaattgctacacaatcatggcgactcaagccctgttgagccctggt +gagacgtactgcccttgtcataagtgccctcgttgcaaatttaaacaagc +agggggcttagaataagccgcatatgtaattatatgttacgtacatggac +tcgtttacgaaattaaaggttaaaagtcttaattcaaaacgcccaagctg +gtcaacaatagggaggatgaagcgtcaaattggcaatgttgcacccccta 
+accttctgcgtgggcatgcacgaagggcaatcgactgaagccagtcggta +gtg +>RAND-005 +tagggatttttagtgagcgttaataatttgttcatggcccgcnactacgc +attctcccatttangaccgaagtatctgcctgggagcttcgcgcattgta +gtatgggtagtctagaaccggtcgtcccaagtctcag +>RAND-006 +gcgaaaacctactcccttcattagatccgtaccagcgacatatcttaaga +ctaacgaataagaccctaggggagcagcactttttcatga +>RAND-007 +gaacctggacgtggcgtaacccagcnccggctacataccgtagtcggacgaccatcacaa +cctcccagaagctgataaagcatctttccntaggcccggtgangtaacanttacgggtgc +gcaagcgcaaaggctgttagtagtgctggtatgagagcggacccaattacctgggagggc +gtgggaagatctagttggaccatttgtctctgagtgctctggctagcttaagtgtcaaga +aag +>RAND-008 +tgattatctggcacgacggcatgtgtgtgggttcccgaatggagtaagctagacatagac +canggcattttaaanggagcaaggcttccaaccaagccactaagtatttgtttaggggaa +ggagcgaacttccaattcatctagatcctagctagtgcgtcgactgatgaaacctctttc +tcttgtgattcatcaagacacgaacctcgctataaagttagcagtggctcagaacgtcgc +ttttatgtgctggatccattttcgccatcttgctagtgctacgactgaagaatgcgtaga +taaacgagcatccagacaaagatcatcaagccaaacgctttcacgcggcgctggtcctct +gcgggc +>RAND-009 +cgccgcgtaaggttaaaaaatcggaaccccgcgtagttgaagcagcncgc +cttggcatgatactcgaaattggagggcttctgatacaaccagtaacctn +tgctttccttaacaattcctgcaggacgcacatcgtttaccgaacttccc +attttgcctagattgatcctgccggcacaaagggagaaaagcacggcggg +gagtcagatgcagaagaacctagccaaactgagtaacctttcgatttngt +cccacactcctggnatactctagatccnagtaagttaaacgtatttacga +atatcggacacaacgtgctgaattatacgttcatatgtcgtctagcttat +tgactgctacaggaaactgtaacaccgggtgcattccggtcagt +>RAND-010 +acagcaaccaagccgttgcactgctctggaatttagacataccatgcctaaccaagtccc +aggtcgacgcttaaatatccacctaggctaaagcgaggcttcgattacttctaaaacggg +cgcacaagtatccncgaaaaatcacgttcgtcaggatagcgggcgctactaaagctctat +tgtttctccggattacataagtctttactgtacagtataccacatcggcgctgagtcttt +aggctaggatccctttatggtccttcttgagtcgttgccctagcggagggatgttgggct +gggataaactgcactcgtctggggc +>RAND-011 +atgataagtnactgcatgctaaatcacatacggggtgtttgtcctgcccg +ccccctccagcctactactagacagtcggtctccgccaacacaggcggcc +tccgagtcccgngcaccactcccggtacagttcaaaaaccgctctaaaaa +cgggttgactcaggaaaacttccttcttaatctcgaagcgaaccgtttta 
+ggaaacttgtaggatttcgtagcaatcatcgtggaaccgatttaacccta +cggcacatccatttgatagtgacgggtcagcgttaagttggacacgaata +gggacggataagaggcccactttgaatctggaagctacggggaccgaagg +caaatcagccatggacaagtttgcgccagcacacntcacctcaggatgaa +gtccaaaaagttta +>RAND-012 +atcatcaaag +gccgcccctt +taataacccg +ttttgtccga +gggagtacgt +ggcacgagtg +cgggaa +>RAND-013 +ananctccactcgaagactgtggaggcgggctcgcccgccgcganaactttgtnagaatc +caaataagaacacagctgccacccatttgtgggtgatttgtggaagggtcgaaccgacca +tcgacttctgttgtctgggaatcttatttcaacggatcacatcggaacactacctccaac +ccgccacaatgatatgcttcaaaggcttcgcgcaatctcgagaagaactaggtcaatgaa +tgnattctcatacgatgcgttgaaacaatactcggctgaaaaccatccggagcatatttc +acgcggtgcaatagggttaagattgaaatttcaagcaaatctctcaatctcggcaacccc +gcgactaccgagaacaaggccctaccggcaaactacgaaactcgagcgccaaacgctgcg +caagtactaggcttcacctgtacccgttctaagctgtcacttcatctcgcacaattattt +ttacgctccaccccactacgcataccctacgnctaattcggctgctttttcgaagacaga +cggctatagatgtctctcggtggcactaggaatattcgacggttcgctatgttcatca +>RAND-014 +attcagtatcgtattcccaactataatttctgacgccggccnccattcgactgtctgtag +taatgcgaaggagaacacctgagagcacacgcaattatcataccaaagattacgctcaag +ggggaggtttctggctgttaggatggacgcgcactcttgagtgcttctgcgtaagagggg +tatctaacggcgtgttgcctagggaattctaaggggtctcgactcgtatcaacacggcgg +aagagtctagcgttagattcacccgttttcaaccgggccntacggcaggcgttcctatgg +cgaatccaggggaaagcacagatgatgcatcatttgtacagatacgccaatatcgcttcg +ggtaga +>RAND-015 +ccctccctat +ggacggtaca +ggcactcaaa +atatgcgagg +acggctgacc +gaggtctgaa +tcaaagaatg +tagccttgac +ctatggt +>RAND-016 +caccctttaaggacctctcagccctttcattcgatcctgctgcggtcgtgctaagctgtc +tacatcccgtcntccgcaggctcactatgnttgtcatctattgacatggacgggctgaga +>RAND-017 +tcagatgccgactggcgagtcaaaagattaccccttaatgcgcgctacggttttactaat +gactccgtggaacctncgccgaatcactcccatatcgtgtggggccgcaccacaggcctg +tgccaaatgcgacgtaaaggcgtcagctctgttgttcgggtttccagagcttctctcctg +gactatcgcacaattatctcgatagtagagtacaccgtccaaaccgttgtgagngctcca +gaaaggatctacatgtaccgcaaccctgcttcctactgccgctatttggcgctcagtggt +cagacaagtcatccatcctgcgcacgacgaggnctccgcngatgaagattcctcgtaatt 
+ctagttactcatacgtgcggactgagctgtcggtccaggntgtattgccaaccactcttg +gactatgtgcagggcggagtctgatgcatactactttctcagaaaacccctggcgtgaag +tccgatgcggtgccggtatgcatacccgancacctctgaagaaaacactacgtcgaggtc +aagtgcacattgagtgaattcctcgattgggat +>RAND-018 +tgaaatttat +aaggctgtgt +ctccttgact +gttgcantgg +cctaaattat +agt +>RAND-019 +gatgataggc +tgtccaacgc +gattcctctg +acattgcat +>RAND-020 +cggattgactccgataggctgccccagcgaaagtcctcctcctatagagccggcgtccgc +cccgctgggcggatgatacggagggtggtaagctggtgcccgttgcaattggccanaagc +gcattattcatcttgagagtatacgcacaaatacctcatgacagccggacggtgagtgaa +gcgcagttccaaatatccatcgtgctcatctttcagcgccgtccgggtaaacccatggaa +ccgagattcgaaccctgccaccattctccccacgttgtgtgcagtttcgtggcggtattg +ggggctgaaataggcacgcacagcatttctaaaggcaaggaatcggacgttccgcgacat +ataccgggtaaatccttaggaacatcagattaaattcggagaggacacctcatcaaaaac +gcttacgcagcctctgacagatctatgcctacacacacgataaactgctatgacataant +ttgcctagggtcccgcttctnaatttacaagcac +>RAND-021 +cactctctgc +cttcgctca +>RAND-022 +ctagttttgg +tcgacccgat +gctacgcgg +>RAND-023 +cgtgacgatcctacagtttttggtgcagtagtagaacttgagctttgtac +ccgagtacccctgtcacatgagtcacagccggctaacgaagcaatcatca +tatacttccaacgccactcaacgtcagcttctgtgaggaaagatgaccaa +tccccgccacttgcatacacctccggtgttccgaactaatggagctctcn +gtgaaccagcgatgtcttctgaagatctatactcatccaacagtaagaaa +gtcgtcaatgacccataggtgttttcccgtcagtactaggaaccgtggtg +acnaaaccgaccgagctctttagtgcggaccaggaaccttgcccagtaca +ctatcaagtactgtactattcgtcganccgtgacacagactttgattctt +gcgagtatggg +>RAND-024 +acctatttgc +aactttgtgn +tactgatcgg +gccgggggaa +cg +>RAND-025 +atgtacaaaaggaaataaatatgngacatggctgttgcctcagttagccaccccaatgtt +cgaactaaaagatgccctccggtaataggcggangttgaaaggaaggcccatcatactat +aggggcgtggcggggcttttataacccacgacccaagtgggtcgcagggagagctcccac +catcgttcttcttcgcggcagaacgagacgccgttcaattccttgcatctccanacgatc +ccagattctaccttgtacagactacgtcgagcggcccggggctcaccgatggacacatta +ttactaagcgaaattgaccaacctgtgcggctctgatcaagcggatcggcctttaagtca +attgaactgcgtgaatcggccaaggtctgnatcgtactcatactagagggactgccatcc +gaagaacacacgcgtccatgtaacacaggaaatgccgcagcggaggcccactgcttggga 
+aaagcgcggtagaagttgtggagangtctgaatgaagcgagtcatcgatagttcgttgag +ccgctnccgccatcggttgttgtcggaggggcataagatggc +>RAND-026 +gagcatccgt +gtcatctaca +ggtacgtcac +attggacctc +ttggctcgcc +actgccaacg +tgagcaacac +caatatgtgg +tcgctaatca +gcccc +>RAND-027 +accacttagagnatcatatcgagtttgaatccccccaccggtctggtggtgttcagcgtc +gacgcatactctatctcctcttcatcagctcagtaagcgtgtgctactcaagctgggcgg +taaatcgttgggggtgactgtgcttncagagggggtccactccgttttgcctacagtccc +ttacagaaacggtgctacgcccaaagacttccaccgactgggcgttgttgacttttacag +caggtctctggggtgtctaggaggccaattccctagttagtaggttttagcgttcgcttg +aattcgtccgattcggggagatgccaaccgcgacgccaccacgtcaagagccacaatgtc +cgattacgtctatagtttntcgatacgaagcccaggttacacgcatgcgtacttnacctt +attttactgcattaatcgaacacggagtgccctgtgttctaaccagcttcggcgggtact +atatgtattatacgatagcataagacaggatgaataactccgccgtcctcgcggcaaaac +ttgggggccagcggctaatcctagcaacaatgaggggataattcca +>RAND-028 +ttgttcagacactgctgagctctcgcggnatgatactgaggcatagctttagtgttataa +ctagttgcctacaccccggggctgctgacaatgaacnctctaatccgcttcgatcgggcg +ctaggtcaggtttctagtttgaaacaggactcgaatgttcacttcatgtagtgtagcgga +cagtcttgaatctgtctccacactacggacgaaaactaggtcacctaaactcatagcccn +cttgagttgttagggcgacattaaacagcctgnccagggcgncgttctcgtcatcaggat +agctgncaaatctacagaagaagagtatgttgtgttcccataatttntcactgatcactg +ctattcgcgtcccggtaagggagaaattggcaaccgcggcacaaaaattaatagaaagct +tgttactgattcgtttgttactctcatgncgctcantgccgcactcaaaattgcattcct +catctatcaccacacagccacaatttaaagctttacgtcggctgtcccattctctcactg +ncgacctttcatctcgagccaggtccttactgcatctgagctcccgtagg +>RAND-029 +tctggcgttgattagcgcaacatctgggtgctagaagtgtaaccgattctttaccggtag +cgtgcttgagangtaagcaggatc +>RAND-030 +aaccggttcttgcttaagacgtgctctccttgcgaacgcaaaaagacgttcaaatgcatc +tatgtaggggtcttctagctactcatcatactccctcatgatcccaggggacggtttggc +ctgagaagggtcggcagaaagaatatatgcccacattcgtcccgtctgcggcagggccta +aacgaacgacgccttaatgcgcaagatgaaagccccgcgngctcaggtcggttgtatatg +ttgtaagtgaagttgacaatagattgtaacgtctgttatctgcagcttttacctagggtt +cgaggattggagctgctgacaagcgtcttngatgaatggcgtcataatacacagactctc 
+acgagaggaatcttagggtcgaccgactgtcggcgggaccaccggcgcaggacttgta +>RAND-031 +acgggagcatctgctaggtctatcacggtcccacacaatttgtatacgac +gcattgtcgaaagtctgcnggtaagtcctgtcacntgatcctccgcaatg +ccacaatcggctacacaggctagggaagccaatgtgacttgggctaccgt +atggaaaatcggttcctacagcaccagctgacatcccagtcacatataca +cccttaggaaatggtgtgataacgcgcacggccagtgccgtgtcctc +>RAND-032 +taaggtagnggtggcctcttctacccccggagtgagcccgcatgcggaatgatgctggtg +ctggaaccataatggtctatgcaaggctgaaacaacgcaatgggcgccataactcagcac +atcaatagcgagggttttgaaacaccaccgagcggaccaggccacacgatgccatatcgt +gtatgataactctttcactacttgaaccacgcgcgtatgacaaanccccatcctaccagt +gcgaattgcgacacattctctcgcgatcggatacgttgcgcaatagggaaggttagtccg +caaggcgcagcgcactagaccggggtgaatgcctcgacgcacgagacgtntcgtgagcgg +ccttcgagaggtgccnagtactagcgatgaaggcctacacaggtgtctngatgatggtgg +gacagagcgtcgtggctatgcgggtgcttaatgacgggcgactagccaccatcccggttc +gaagcaaannttcaccctcccatcaaccc +>RAND-033 +taatgcctcacagctcacaaatgaaggccatatacatgctcgacggatatgcntggtttt +cacgataacctctgagtcagcatcaagtagtccatagtttcttgtgtacaaacgaaagtc +aggcccctacgctgcaagttagccgctctcgaggtcgccagttcaagtggtgtgtgcggg +tgcttggtgacccttcagccacgctcacctgcaaacacccgctcagtctcctacgcgata +tagataaagattcataagggacaacggccagtgaaatacgagtncacaagtcttcaattg +cgtactataaaccggacgaaggccgccnttatactaatgtcggttcaatttttggtcctc +gtctttgattcatggcggcacgaccctcntaacctttccccgactacgaacgtatccctc +agctagtatgaccccctntgagatgactcgagattaaggacttcccgaccaatataccct +atgctagcgtagatgctacttcang +>RAND-034 +gagtggcggatcccgcgggctccgtttgtccattaaatactctactcccataggtgntca +cggccagacnaaggaagggcgcaacctaccgcgttccaggttatctacccgtgggggtaa +aataacgcccgtgttcggaggcactgcgatatcggaatacaaactgaggcttagtctggt +gtgacaggtatcgtggtccaccgtgtggaagcgggttccactgttagctacatacgactc +gtaatgcgcaagctggcgttctcatctgtgtccccatcagtcngtagctaaagagaaatg +agaatatggcgcgttggtggaggaccnattcctctcagacgctgatgagaggctttggca +tctaataaaggagtcgtagtgcggtgggatggaagctacggggctataactgtgcataag +caccctagctgggcgctaatacattgtctaggtgctctatcagtatagtccgagaagggg +tggctcagagcggtctatactacgaacacaaaaggtcatcagtggagctcgatatccgca 
+tagcgaccgttagggtgcgcaagactagtcnggtcaggatacc +>RAND-035 +gtagtgcggctactagagtccggcacataaattgtcatcgaaaagtcagtttcnattcga +ggagaaggcgatgccggtgttgctgccattgtattggtagacnggagaatcccgcaaagg +atccggactgaatatttctcagttcggg +>RAND-036 +gaagtcttgttttaacgaagtttgaatagaggtactggtgggttagctga +caaagttta +>RAND-037 +cctgacacatggcctnagtcactaaccaaccgacgatcctaagtttcacagtcagtttat +cgctctgggccaaacgggatcttacgaaaggccgttcacgatagcacangtgagtaaccg +gttagatcataggtaagtacaa +>RAND-038 +tcgccaactgcgcccagagctacaagataatttaaagagttccctctccagcgaaccatt +cgcgatccatggatgtgacactgcccacaagacactggctctcgttactacctngtcgtt +agcatgtaccatggaatcgttatggccagcttggcacgcattgatccgttnaccgcatcg +gcaagagaacagctttcttatgttgccgagagaataatctaaaccgccttcccgaggatg +gtaaccatccctaagggagctttacatcccgagaaagtcttagcgaccctcaggtgccac +agtaaaaattaaaacggcgtcataggctatcaatggaaaaacttaaaacattcgcccgat +tattcatccccctagtcgcttttaccacaccgacagcgaagacgcgcgcctacctcccgg +ccactgaaggatgtacact +>RAND-039 +tcgagtccttgcgccacgcacatgttttggctgccgctagacagcggtagttcactgcac +acgcaagtggcagtaggcaagattccacattgctgttttcggctcttattccgtaacatc +gcactgtaggtagatacggtactaccacccatctttatatncggttgcaaaagcgggaca +aactagaancctaggtattgtgcggttacccctggccaatataagtatagccaagagtga +cttggccctatagtttaagactgctggataggtaaatgcgtggtaacctgatgccgtttg +caaggccaacgggaggnttgtcaagcattaaccaaatcggctgtaacaactgggaaattc +cgtccaacatagggagcaggggattgcaacatgtggtatcaaatatgtnacactagacac +cgttggcccaccacttatataataaactcgagcgatgactcaacctcttgtactgatgag +tgccttttatcttctcgcgtaactgac +>RAND-040 +ggtagcggtc +catgctccat +tccataagta +ggaatgccat +cccgcggcac +ccgctgggtc +acc +>RAND-041 +ccccctcttntcgcaaagttaaacgtttccgtgcagcagtcattattatg +gtgttgccgcatagaaggacggggggcaatcgtaagcaaattctgagatt +tacgaccacggatctaataccctgtcttctagcccccagtggtttgactg +ncttaaacacttggggcagacacggntgtgtgattctcntaatagaactt +ggctccctccgtggcttcaaaaaatttcgcgggtaatggtaatatcagcg +agcattttatcgaagcctttgtggcaattnctcccatatccgacccagcc +cattaacntgcaacacatcattaagcggattgtccgcggttcatgtgtat +gggctcaccggcaataaacg +>RAND-042 
+gtggcatagcacgagngntaaaaatagaagacgcatatgcacttaagagatggtagccat +acctcaattccttcagccggttggaagtggcngctgagcgcacgggcccgcgaactcagg +agtcggtcttgctcaaggtaccccggtccatttcgaagtacggcagacgcacgtacgtta +taggggcatctttgcgcgcacatactacgtcacgtattaacaggaaataatagagggttg +agtaaagacgttaccatcatcaccacggtcgtagaccgaagcaatccagagcagtgcatc +accggnagtctttggagccgctttattgtgcctaccagtacaaggtgtcccatacgggat +tgaatacccgtgcagttatttgctatctttagctcattctggggagcggtcctcattttg +attccacagagatagtgccc +>RAND-043 +gaatnactttagcaactacatgtttacgagagtcaagcgatgactaagtgagcccggaca +agggatatgaactaggggagttacccggcatataatcctgacctcgcgtgtgcctgtccc +gttgaggggaataactgtctttgttgggatttttggcgctgcatccggagtatctcccgc +atcctgtgtacgttcttaacctttataagcgccaaacagctagcgcacctctttgggata +aaacatgtgcacattggttcgttatctggtcggcgctcgctgggcgcaaacttttaacag +cgcagctttatggacaggagtatgtgtttcggttgatagttcggcctaaccctgagcttc +tggactccggaagtacctcagtgattataggaagctcccgtaggtttccggcacccggaa +cagagagaaatgtttgtactaatttcgctgtatgtaacaatgttacggctatcttnaccg +cttttggagaatgaacagtcccctcataacaagtatgcaagcgctggaagtaaccggttg +agtaaacttgacgaaggcagc +>RAND-044 +attccnggggtgccgtcgatcgattagtttctaagctggtcacgtacatc +gacagaccgtgttgataaaccatataatccgtgaaaagaaccgaacacgt +gggggactagttctggacaaaagggcctgaattcctttgtcttaaatcgc +tctagaaatacgcatccctcttccttgcgcaagtacgtctaactagatga +tgtgtatcttctatcgtcgcttgtgcccctccactctcaccacggagagg +cttaccgactccctataacttcggccagccgaaccggga +>RAND-045 +tcagtatagccactaacttcccaggtgcgcgtactggtgtatgcctagctgagcactgag +gattaactggctgcgtttgctgggtaaacatacgatctaagcgaggnatgcaacgtgtg +>RAND-046 +ggtgcaacgt +ggtgagat +>RAND-047 +caggtgacac +cccaac +>RAND-048 +cgactatccggcagagcctgagtacttttctaaaagaaaccaatagatcgattaaaatac +ccgcattttccctttctgggagagtttaacggttangtagtccgtgtgggatngatcaca +ccatcccatgatccagagcaattccgcggtgatcacggaccccaaccaatacctgtagtc +catataattaccatcgaaacatcccttaggcggatttcctaagnacctgattagccggac +ctagc +>RAND-049 +gcgacaggtc +gtgcatggac +gcgttccagt +cacacaatga +ancctgcggc +acgagcgtcg +catgtctctc +>RAND-050 +gtagtattta +acggtgccaa +tgaagagggc +ctagat +>RAND-051 
+ttcgcctcttggcgctatcggcagacacgacacttacttagaacttcgat +cgtgatgatacggattgattgacggtaacatgccctattgccttccgtac +cgtaggtgttgtctacgntcntggacagggccaacggcaggaacacaaat +aacgatnattaatatgatatgacatgtcattttaaagagtaaaaaagccc +ctggccggccgagtattctntattaatatttataaccagcgcggttaagc +ggccgttttggtgttcaggttaaaatttcggaacagcggacaacgtaaag +tcctatgagagaantcngggagagacgttcgattaatgcctccagnaatg +catttgggagcctttcgaatcgccatagtcaatgttaattacccacgggt +cgtgcagcccaagggcgc +>RAND-052 +tatattgggatacaaagcccacacaggagcaagccctgaatggtcaaatg +agacgcagtcgaatccaatgccgatcctttgctgcgttttccgtaaaatt +gtcgcccatatcgtcttactgaatctccgaaagtatggtacgaggttgtg +agaagttttagtctcgccaccgacgcgtgatataagtgaaggagctaacg +aatggcgaccgaaggaatctccacagcgcgttccgcacaatgtagttggg +acttgctaagacgaagttgtggtgagttctcccaaactaattaccactat +gtgcgaacgtaactgacccagagcaagaacgcttggtcctagtaaaaggt +tcacaatgtttcagcctgatcctctcggcatacagtgagatggccaggat +caaaggtctccggactaggtgacgagcacgcgaatacatgtagctcttag +taccaangaga +>RAND-053 +actaacgtga +tagcgatgtg +tctcacaaaa +actcgtggcg +tcgcgcc +>RAND-054 +caataactgtatatgacaacgcctccacagggtcctcatgtctacgaggtctagttcgtg +cggtctaaacttaagccacacgcattacggaagaaagacctcctcactaccacgtctgcc +gcggtcagggtactttccccacttctacgaagtatcctactgggcagaagtaactctcgg +catggaggcaagacttccgttcggcttttatggttaatgggatagaattgctattattgg +gcgctctccttgtatgtaaagggaggagcttgacggccacagcatagaggtccctcgcaa +gaatcacggtattngcacaacgcggaacnccccgaaatcgactactttcttgtctcccta +gnccggccggtcgagatgtaaattcgcaagacgttgttgcaaatccccctgatgagaggt +gagcgactgcaaaacaaagcacgccaagtcncnc +>RAND-055 +ctgggccggataagcggatatgcccacacacggagcccgcgcctgataggaatgtcgcta +gtctagactgtgcataggacatgtaccgaaatggctagcatccctctaagcggggcgtca +gttgctttaacgaggtatccgagggaaaccacctgcgtcgcgcgaaagggatttcngttt +tagacgcgacatagaagaggcatagaggcgtcctggatatctggccgactcaggcgcccg +cgctaggactagactatagctaagcgtgggtaaacatgtaccaactctagtctgaaagcc +ttgtggtgacganattatgaacctgagactcttacgctagaaagtttcangccggganac +cntgagaggaattctggcgcgcctacttgatcaacaactgccntactgctcgtagaacct +atgggttcctcacacggaccgacagtcttatcactgacttctatacgtntactgccttca 
+caaagcctccgtaatagaatcctnaagactgtaat +>RAND-056 +catcgggcattttggtgcattgacagacgggtcaccggtacgtttcatgtgtaagtgttt +aggttgtacgcggaactttcgcaagaacagtcgatcgtaaataagcgtatcgtagttacc +cattatccactgaccgcgtgtagggtatgttcgtggtaanccttaagtattctcaagtta +atgccggcagtccgggatttcctcccactcggcaccacggcagctaccgttctagatgac +ggggggcgcgtcaaacttgctacccaccgtgagtgggtgatgagtccatgtacacccctg +aagttaggattaagatgaagctgaaagccgaagccngccccttccnaaactttcccaacg +gcaatgctggtggatgtggtgtcacggtgtgatactacaaacacgctctgggccacgctt +ctacgcggcctggagaaagctatcgg +>RAND-057 +acaccgtaatcaagtttaaggtcgggcagaagaattctttgacctcgaacaaggtgatgc +gagtttcgtgtgataatgaaattgctcacggcgcacgacgtggcgtaacagagtctaatg +acgngcatggaccggtgccaagctagaactgtgataaaattagnaccgcttccggttgcg +gacgggcccgatatatatttaagttgtgtctggacccgacaacgtcggcttgtcttgata +attttgccctaacccgccagggacggatttctataggcagttccaactttaacctattac +ntaacaatgttaggcactagacaa +>RAND-058 +ggtcggcgag +ccttttgtgc +ctcctaa +>RAND-059 +aggttctttggcctgcgagttgcgagtaacacgtgatctctattgttcacccccgtgttc +ttcagtcggagaaatcacctcgaaagaagatagccgccaagtatgggtctagagccctac +ctgggaaccttataaagtaagagactcancacggtctaatgggcgattgtacacaattat +cgattgggccgttactgttgttgatgtgagccacgccggggctagtcgctggtgaaatgt +agtttggagctggtcgatcgaacgcacagaggtccacggcgtggatttttgtgaaag +>RAND-060 +tgactccctc +gatgcgnaag +acatgcctcc +tagcttcctt +tcattccacg +cgattatttc +cgttctctcg +gccgcacctc +gcatccgtgc +agntgacact +>RAND-061 +ttaaaaccaa +>RAND-062 +tgcagcttga +ctt +>RAND-063 +aaccatattn +gaacgcgtgg +ctcggactaa +tgacaggaat +ggcccttgcg +ccgcataact +cggacgtgcc +cgagattnac +cctgcggtct +ctttncc +>RAND-064 +gccaaagagtgttgtggtctcagttctggtagcatttgacggatcagtctcagtgnacat +gcagcacttcgacagagtgtggtctgggtctgttagtgaaccgtgtcgcaggcattcgac +cgcgtgggacttgcagtggtggtccctaagcagaactttccaaactaagcacagttgtct +gagcagccggggcctntgttaatgagancggacgttggagagggcgcgtcgacaa +>RAND-065 +ccagcggcgt +tatntgaagn +cggctccagt +gacagtggga +atggatcgtg +taagcntaac +ttaagacccg +gcgcgtcctc +>RAND-066 +actatttcat +cggccaanac +acatgctgat +tccacagaac +gactcntcag +gcccctaaat +ggttatatcc +taaggataca 
+tgttttagta +tg +>RAND-067 +cgcgtactacattagggaagtgatcggcaacaagatacttagtccgctggggtccctcat +aacgacggtgatagacgtgaccctcaggccggccctgcggggtgtcatcgcagggaaaca +gtaatacattaacccatgaaggtactaaacggagtgcaattgtctaaaccgctaattaca +tactactcgaagcaaataaaggttatgagatctagtgtccttgtaacacgagttatgacc +acgccgtcgtccaagattctgctattctatct +>RAND-068 +ccccccatac +aagagacgga +tgtcacccta +tttcgtactc +tgtttacgat +caaaaaggcg +gcgtggtcgt +cggtagccat +aaagctgccg +act +>RAND-069 +gttacacaccctatgtatgagggcgcgtgcagccgggggtaccctcattc +atcgccttcttttgcatgctggaagtgacttacggctctagtcgtttgtg +cgcgagcgcgacagatcaattaccgtaggtngaggatgagattcattggt +cggtttgcctagtgagaatcctcaggtggagccgtactaatgcacgtgct +c +>RAND-070 +gtcaatctac +gaacctgggg +ccaggaccca +ctaggtggtn +gataaattta +aaacaattaa +ttaacccagt +ctgtcgcctg +cttcaaag +>RAND-071 +taaaactatt +ccggccgcaa +aaaccggtcc +aaccgatggt +aa +>RAND-072 +tcgcggtcgagcagcttctcctgtatcagggaatacatgacattcgtcgg +gaattgtcaagctacatcgctaacgacatcacgagatccagcctaccgaa +ccttactctggaaataatgaataataatcttcagtgctcaggcctcgata +ataagagattcacggcacgtgagcgagctagctctttgtcagtaggtggc +atctatgatgctttcgcatccagaaactcgacgtaactcgccacgtccta +gaacgctcttccaaattggtttgtttcgtaacggccagatcagattagga +gcgcatcacagtattcacactggacccaaccctcacttctggaaacctag +acttgcgttaattagtcctgagtccgatgtgnaaccaaactcaacagcca +aagcttcagcgatggcgctgataagcgttgcatgaagacggtagctatgg +aacactncacat +>RAND-073 +aacggtccagtttgtagcattggcctaagtccggcgactagataggtcca +gttatctccattgctaactatcagggcnnaggtatcttggtgaacgcaaa +gcgccacaatctggaacggaaatccaattgacccccatcaggcccagagt +ccgcactccccgggaggcgcactcttgacaaccgatgtaaaaattcctca +cgtccgcaagcgacgcgcgacgcctttttgctgctgccgtacaacgccac +agttggaagattaggcaccgatctcactacattcctgcacacccctcaag +ggtcggaacgcgaagcctcgtaattnaggatattagnacnttctaagctc +tacccaccatancgnattgtgggtccaagatgctcggcacttaaaggaac +tggcagatttggccgagcgaagttgccagaggtaacttagcatatggtgg +ctagaaactgccaagcagactnaccggcaaaagtcagtatgtgcgatagc +>RAND-074 +aaggccacctagaacctagggtaccttgtggttctcaaaaatgcgcgaattttcaaagac +acggacgagccttggcagtatgccatcagtaccttttgacgaattgtgggggtccctagg 
+ctgtttctctcagtacactcgaaatcgcagctctgtatacggcacgcccgaanacatgtt +tgttaagtagcgtctggccaggtgcggacgcctcg +>RAND-075 +catcgtgttaggccgcaagggaagataggtgcgtaantacggagacccgttcggtaatgc +gtttgcaagcgaatacaagaagagaagtcaaagctatccgtggagtccccaataccaacc +tacaggggtcatacgaagacacaaccactatggcttcctacgtctgcatgaatgacccgg +aacgatcccacactcatgtccctcactaggcgacggaaacanacaaatactccggatggt +tcaggcttctgctgcctgcctgccgttcgaattcatactttgacgagggatgcagggata +tcgcngggctcgccggcgtcataaaccttgccgcctgcggcctcttcactaagaggtaag +ctccgaccatgctcattcggttgttattctgtcgtagcagggag +>RAND-076 +ggtccttatactgttgccctccacgcaccctgagatgctgacacatggag +taagaagcctacacattggaagactgctcaatcactcataacgactaaaa +agggactcatggaagtagtacttttagtttgtcctttgcgggtccangat +ctagcgtgagggcagtgcatgcgacgttacagaacaattancctggtccc +attaggggaaaagttctcatgtataaacngaccgtacaccttcgcactaa +ggaattagcaggaactnaaaacggtgggatctcgagctgtgcgtgccact +atccgcac +>RAND-077 +gtttaaggtt +tngccctatc +tgatcttacc +ggaatcagcc +gacattagaa +ccatatgaac +acttcacggg +a +>RAND-078 +ccccgcagagcgttgtaaccctttctcaattggcctcgaccgcgaggctcctctctcata +gacaacctcataacagtacgnaaaagagacgaaatatacactacggtacgccccagcatc +tgttccacttccagaagggcatcgttggacaatattacattcttgccgagcacaaat +>RAND-079 +ttaagacgaaaccnttaaccagaagtatcccctaggcgcccgggttgaaa +tttaccaggaagcattgcggagggagtttctgtagttctcttcaaaaccg +tacaactggtacccaatcgctgagaagtcctgccggtgtacccagacgtc +aactgtgcggatttacgttgctgccgaagatttctatacgagtgagtcac +gaagtactcgacgcatgatagtagactgtgctcgttggtaccgaaaaagc +aatgaaatcaaaggcgacgagcgctacatgataaatcacagggatttggt +atccacccnatagggtcccccggtgaaatccgccgggtgtccctagatgc +accggggactaatgtgagcgactcagatgagagcaagattagtactcgcg +cgccgcttatgcataaatgcatactttcatacgctaacactggccaactc +cccgaaa +>RAND-080 +tggcaatgataaagatcccgattataaggggcacgtgtggcgtcgtctagtgaatagttc +cggggcaatcaagtagtttggggtcgtggtcacctatcgacccgcatagagtctgatccc +agcttgacgacgtaaggtcctcctcttagatacaattacctaatctcccgatggcatgtc +tgaactatggacggtaggagacgaccggtgaacgtctcaaggggggagcaggctttctgt +tactggagctatgtccaatgctggtaaatgcntagggtgcctgnccccgcgctctcta +>RAND-081 
+tttttctggacatgtttcatgctaattgcatcggtatatttcataanctt +taaacctcgaacgcgtacnccgtcgaccaggtgaaaggggatgaagaacg +gctnaccatgcgtanaggggcactgacgctcacctcgaaatgttgtcctt +ttgtcataagacggctcagtttgttgaggaacttaacacctaaattgtat +ggttcttccgcaggttattccacggncgtggtatgccnctggcacgaggt +ccccgtgttcagtgcgtacgatgagcacatcgaaatcggggcgttccagt +atttatgccacgcccgcaggatggccatccctttaagcatccaaagtaca +gagcaggacagtttgggcangtaatctgtcatgcatgacattccagtgcg +actgtagtgacatgaga +>RAND-082 +agatgcgccattacgcccttctacggtagtccagacaagggtccggaatgatatacttcg +atcatagaagaataagcgggtaagcggatgtccacctagcaaaatcgtcacatcgcatgt +cctgttgactttggcagaggcgtccgacgggaagcgagccatatatgagtgtcacccata +gtgccagtgctagacagatggcgctccgganaaagcgcttccatgcctgtgttgtcgcgc +ctagggctacgtgacaattttttccatacattcgaccatggtctgtttagcctcctctct +agtcataaaagtagcccaacgcattacc +>RAND-083 +aacataaattgaggacgatccttacaaaccactgaaggcagggccacttggtgccgagaa +ggcgccatcatttctgctcgagaccctctccggaaccgggaccatcacaaggtggacggg +ccaagcagttgggcattcaagagtcccagggggaggggatctcttacaccgcggaatagg +tgacactcaaaactggctgaatatttgaatggcggaatgcaaattgatcgctatagtctt +cacgacctaaaactacgtatctgcgcatcgatcgacaacgaacccaaactcgagaaatat +ggggtttttgggccgactcatccccgcnctgatcaggttaatagacgatcttgagcggaa +gaataagccaagaggntccaggcatgggc +>RAND-084 +gggcaatccagagtgcacgccgtcgggaagcctgatactgagaccatcggtagtactctt +cacagtagtgagcacacctacttaaaaaaccctgtgtgaagttgagcccgacgtgaggcn +agcacgcctaacc +>RAND-085 +gaaccatcgttaaggatatttaagtccacgtccatgatctcgagcatgct +acagccccacctaggagcgataggattttggagtgggcacacgtaagtgt +aaatagaatatagaaatctggcgataccggggacctacctgttccgggnc +gcatgggccggcccatagagagactaacaattgccttccagctttacggg +gtaccggcctggcgtcactagggatcaagggctccttgactcttangggt +attcatccgagaggcacttttatcgagcagctgaagcctcatgggcaagc +tacgcgagttgcccatcactacagtatctgcct +>RAND-086 +ccgctgagggcganaatcgtcgatacggatatcacttattacccttaggcatctctatgc +ttagcccccgctatattttttgccgttcccagcgtacgaattactgcggcgtttatcgtt +tagcaaggcgagtatgattctgagcgggtccttcgacncactcgtccggaccaaacgtcg +tagttaacgcccaaactacgtatcacagcaaggttcgcaccagtgtacctagtgcttata +atcaaggtcgtcgtcgngc +>RAND-087 
+gggcgttaattctgtacgaactaggccccatgtaaatgggaggctattcttctgcacgag +gggccgcgaccagctcaacctagagaatctccaaagacgcattcccgttggcctatcaac +tcaggatacccatggtgccccacttagtatagtgangtagtatcgtcacgtacgctggaa +gtagtatctgcagggctcgtctagaatttgcttggccgggcccatacttatttagagcct +tgaaacgaggccgagcgggcatgcccgaccacatatagnaaggtgcgccgtggtcatgta +cggtgaaaagagatgcgtataggtgncagtgggggaggctagggctgagctcatgtct +>RAND-088 +aagatnctgg +tactttatta +gaatnagggt +taggatctca +ctgtttgcaa +acgttctctc +gctttcggat +acaagtcggg +cgtactncaa +ccagatgaca +>RAND-089 +ctaatgccaggacaagttactttngttgttgaaacaaancgtaatatcca +gctagagtacngacttaaaggcactttgccggtgacacttcagatgcatc +gttttatacgtnctttacataagagttggattgaagaagacctcctgtat +acaagcggaccttggcgccattaagactagatgcatcc +>RAND-090 +gataccacatgattncaggcttggcagaacccgtttccttgacatggttggcaggcaact +attcagtcgtnttaatggttgattcaggggtgtatcaggaccctaaactcatcggaatcc +tctagagttaccttctgcgttgcgcgagagttccaatcttgaccagcaaggggagaaccg +agcgggcattcgcacgcttaggtgcgaatactgtgatnttacttgacatc +>RAND-091 +acgcaagaatggaagtgcggtaattatagtatgacacgttcgcggtgacg +aaagatagactcactttagnatcgggagctagcgatcgaggacatctctg +aatttagacaggggccactgctatgacggctaaaatgtgagggggctaga +aattatgctatcat +>RAND-092 +tgccggagtaggtaacgtctacctatatcctggtctccatagttaggcga +gaaccaacccctatttgacgcttacgggcgccgtctcggcaacctgacat +atctactctttttccctgtactctgggctaatgcgggctccacccatgac +gcgtgctcaatggacaactacgcctcccgggcgaagctaaagcctaattt +atgttggaacccttcgcaaccaacattagaatcatgcccgcgcgccattc +agtgcctgcacagagcctctccccggtagggcaagccagcattgagtaaa +agaganatagagtcgaatcgaccgccatcgcgagccagaggtccctaggg +ataggtagggcgagtttctagtccggggtcacaacgctacttgatcgatc +gaacta +>RAND-093 +ttaatttgaaatacngaaattttgatacccccggggcgagcaggcggcgaatgggctgcc +aggctagagtctgtgcgagcacggatcttgctaagctttacaaagtattcagcggctgca +ccgtggccatcgacatggcactcgctcggtgatancctgcagagcgagggaatgaatgca +tcacaaccatggtactaatgtggcctttagagcgcaccccgggattcaacttgctgcact +caggttagccatnatggcatttagtaaaagttaggaatgtttagatgggatcgtttaagc +cgcgatcgatgttccgttaggctaagcgcgatttgcgttaggaggtgcgggctccttcag 
+gccagttcatccgggcatncactcattttgcactacttgactccactcgcgggnagccgg +ttcgga +>RAND-094 +gaggccgttctggccgccattgttaccggccgcacctcgctcgaaacatcaacagtcttg +gatcgttgactaatcgtccaccctcaacatatatgaaccgccatggcctagacggcttcg +catatattacggaattcccatgacggaggattagccgagttgccgacgtgtgttagcctc +tgagcatacgtctaacgatatcagttatgccacgattgaaggtaaatatcgacgagccac +cac +>RAND-095 +cgttgcgaaaagcaaggttggcgtaacactcagaggtgctaacctctagg +gatccagtcagtgtccatcgggtgacaggaggcc +>RAND-096 +tggtagacat +ggcctnaact +atgtaacaaa +ggttact +>RAND-097 +aatataccgtttcgtattgcgcggatataatgacgcaactcaagcgaataaaaataatca +ttttagtcttccatccatgtcaagaacaccccttaattctttctaaacacccggaatcgt +cattagcctttaagaaaattgcggggctgtgcngagaaatgtctacgcttttcaactaag +agacctagtcctagcagaaagttgaggggggataggtcagacaccagcagtagcatatgc +gaaccctgactcctgacgttagtgaattttacgtgccgattccgactcggattgtcaggg +actccggactactccagcgtgcctaacgatagggaagaatcctagttggtgcatccagcg +tttctcagaatcatcaagccccatatacttgcaaagcgagtgcaacatcattagacaact +aagatacctcttctacacatagaggccgacagatggcagtattatgccaataatatatcg +gtttcgtttagctagc +>RAND-098 +tcgagaatgtctgcccagaacgaaatcttgctggcatcccgcaaacctccatgtgcagga +caccgctttcaatgacccggttccatcggcatcaagtttcctgtgttgactgcctgcacg +ctcggatagtccaggatgcatgcgcctcgggctacagaactgccacctatatgtttagtt +cagcggtgaccgcggatcaaggtaggttcaggccnctaacggggtgtacgacggtatcaa +aatccaactcatctctntatactacggggggtggctgagactgcgctatgctatggtgcc +gttgacgctggctggacagtggggtataggtgcccgatccgggtcnttccctcag +>RAND-099 +tactcggatgaagactcttctgtaaactttagtttaccaatcagaccgga +ctgttttccttgtggctgtaactggcaagctatctatcctgcgcattctc +agacaaagcgagtgtcctttattacgtcgcgaatgggcttggaggatagt +ttcgtgtagactgtactgtcagcacccggtgatgtcgggactcgttgctg +ttatttaaggcggctagaaatactcaggcggtaggacgccactgtattac +cctctaatttaccaaacacagattatcatgatctcgtactcgtcacgana +atagcccgcgcgtccgagaagacttggttttgtacgggtggggctcgcgg +tgtggtagctgccgccagtttcgcgattctacttgcgccagc +>RAND-100 +cccnacgttc +agcacattga +acgacct diff --git a/tests/fasta/test.fa.fai b/tests/fasta/test.fa.fai new file mode 100644 index 0000000..4cfeff6 --- /dev/null +++ 
b/tests/fasta/test.fa.fai @@ -0,0 +1,100 @@ +RAND-001 176 11 60 62 +RAND-002 355 203 60 61 +RAND-003 51 574 10 11 +RAND-004 403 641 50 51 +RAND-005 137 1063 50 51 +RAND-006 90 1213 50 51 +RAND-007 243 1316 60 62 +RAND-008 366 1579 60 61 +RAND-009 394 1962 50 51 +RAND-010 325 2374 60 61 +RAND-011 414 2715 50 51 +RAND-012 66 3148 10 11 +RAND-013 598 3232 60 62 +RAND-014 366 3860 60 61 +RAND-015 87 4243 10 11 +RAND-016 120 4349 60 61 +RAND-017 573 4481 60 61 +RAND-018 53 5074 10 11 +RAND-019 39 5143 10 11 +RAND-020 514 5196 60 61 +RAND-021 19 5729 10 11 +RAND-022 29 5761 10 12 +RAND-023 411 5807 50 52 +RAND-024 42 6246 10 11 +RAND-025 582 6303 60 61 +RAND-026 95 6906 10 12 +RAND-027 586 7031 60 61 +RAND-028 590 7637 60 61 +RAND-029 84 8248 60 62 +RAND-030 418 8347 60 62 +RAND-031 247 8790 50 52 +RAND-032 509 9058 60 62 +RAND-033 505 9595 60 61 +RAND-034 583 10120 60 62 +RAND-035 148 10734 60 62 +RAND-036 59 10898 50 51 +RAND-037 142 10970 60 62 +RAND-038 439 11128 60 61 +RAND-039 507 11586 60 62 +RAND-040 63 12122 10 12 +RAND-041 370 12209 50 51 +RAND-042 440 12597 60 61 +RAND-043 561 13055 60 61 +RAND-044 289 13636 50 51 +RAND-045 119 13942 60 62 +RAND-046 18 14075 10 11 +RAND-047 16 14106 10 12 +RAND-048 245 14136 60 61 +RAND-049 70 14396 10 11 +RAND-050 36 14483 10 11 +RAND-051 418 14533 50 51 +RAND-052 461 14971 50 52 +RAND-053 47 15462 10 11 +RAND-054 454 15524 60 61 +RAND-055 515 15997 60 62 +RAND-056 446 16540 60 61 +RAND-057 324 17004 60 61 +RAND-058 27 17344 10 11 +RAND-059 297 17385 60 62 +RAND-060 100 17703 10 12 +RAND-061 10 17833 10 11 +RAND-062 13 17854 10 11 +RAND-063 97 17880 10 12 +RAND-064 235 18007 60 61 +RAND-065 80 18257 10 12 +RAND-066 92 18364 10 12 +RAND-067 272 18487 60 62 +RAND-068 93 18779 10 11 +RAND-069 201 18893 50 52 +RAND-070 88 19114 10 11 +RAND-071 42 19221 10 11 +RAND-072 462 19278 50 51 +RAND-073 500 19760 50 51 +RAND-074 215 20280 60 61 +RAND-075 404 20509 60 61 +RAND-076 308 20930 50 51 +RAND-077 71 21255 10 11 +RAND-078 177 21344 
60 61 +RAND-079 457 21534 50 51 +RAND-080 298 22011 60 61 +RAND-081 417 22324 50 51 +RAND-082 328 22760 60 61 +RAND-083 389 23104 60 61 +RAND-084 133 23510 60 61 +RAND-085 333 23656 50 51 +RAND-086 259 24006 60 61 +RAND-087 358 24280 60 61 +RAND-088 100 24654 10 11 +RAND-089 188 24774 50 51 +RAND-090 230 24976 60 61 +RAND-091 164 25220 50 51 +RAND-092 406 25398 50 51 +RAND-093 426 25824 60 62 +RAND-094 243 26276 60 61 +RAND-095 84 26534 50 51 +RAND-096 37 26630 10 11 +RAND-097 496 26681 60 61 +RAND-098 355 27196 60 61 +RAND-099 392 27567 50 51 +RAND-100 27 27978 10 12
diff --git a/tests/fasta_test.py b/tests/fasta_test.py
new file mode 100644
index 0000000..004a4cb
--- /dev/null
+++ b/tests/fasta_test.py
@@ -0,0 +1,32 @@
+import pathlib
+
+import pytest
+
+from tola.fasta.index import index_fasta_bytes
+
+
+def list_fasta_files():
+    """Yield each '.fa' file in the 'fasta' directory beside this test module."""
+    fasta_dir = pathlib.Path(__file__).parent / "fasta"
+    # iterdir() order is arbitrary; sort so parametrized test order is stable.
+    for ff in sorted(fasta_dir.iterdir()):
+        if ff.suffix == '.fa':
+            yield ff
+
+
+@pytest.mark.parametrize("fasta_file", list_fasta_files())
+def test_fai(fasta_file):
+    """Check the index built from a FASTA file against its stored '.fai' file.
+
+    Raises ValueError if the expected '<name>.fa.fai' file is missing.
+    """
+    fai_file = pathlib.Path(str(fasta_file) + ".fai")
+    # Check for the expected file before reading it, so that a missing file
+    # gives the explanatory error below rather than a bare FileNotFoundError.
+    if not fai_file.exists():
+        msg = f"Missing expected '.fai' file: {fai_file}"
+        raise ValueError(msg)
+    fai_str = fai_file.read_text()
+    info = index_fasta_bytes(fasta_file)
+    test_str = "".join(x.fai_row() for x in info)
+    assert test_str == fai_str
diff --git a/tests/fragment_test.py b/tests/fragment_test.py
index 3fa7dcb..a4c9306 100644
--- a/tests/fragment_test.py
+++ b/tests/fragment_test.py
@@ -1,4 +1,5 @@
 import pytest
+
 from tola.assembly.fragment import Fragment