#360 Bwt (#411)

* basic bitvector * jacobsons start and refactor to uint for accurate machine words * confident that jacobson rank is working * reusing the incoming bitvector instead of copying everyithing for jacobson rank * access and bounds checking * just do uint64 for simplicity. bound checking and access * bit vector fixes, rsa good enough, wavelet start * Simple wavelet tree with access * wavelet fix access, add select, fix rsa bitvector select * got count working, but had to throw out jacobsons * rsa fixes and refactors * bwt locate * extract * doc BWT, refactor, and return a possible error during construction * add TODO about sorting and the nullChar * bwt examples * wavelet tree doc * wavelet tree explanation * doc and note for waveletTree * add bwt high level. move wavelet tree's some rsa bv docs * simplify bitvector, docs for bitvector and rsaBitvector * Cite Ben Langmead. --------- Co-authored-by: Willow Carretero Chavez <[email protected]> Co-authored-by: Timothy Stiles <[email protected]>
bebop · Jan 18, 2024 · 176dd5b · 176dd5b
1 parent 27c7d28
commit 176dd5b
Show file tree

Hide file tree

Showing 10 changed files with 2,455 additions and 0 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -7,6 +7,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+### Added
+- Basic BWT for sub-sequence count and offset for sequence alignment. Only supports exact matches for now.
+
 
 ## [0.30.0] - 2023-12-18
 Oops, we weren't keeping a changelog before this tag!

diff --git a/bwt/bitvector.go b/bwt/bitvector.go
@@ -0,0 +1,77 @@
+package bwt
+
+import (
+	"fmt"
+	"math"
+)
+
+const wordSize = 64
+
+// bitvector is a sequence of 1's and 0's. You can also think
+// of this as an array of bits. This allows us to encode
+// data in a memory efficient manner.
+//
+// Bits are packed into 64-bit words. Within a word, the lowest bit
+// position is stored in the most significant bit (see getBit/setBit).
+type bitvector struct {
+	bits         []uint64 // packed bit storage; each word holds wordSize bits
+	numberOfBits int      // logical length in bits, as reported by len()
+}
+
+// newBitVector allocates a bitvector able to hold initialNumberOfBits
+// bits. Every bit starts out as 0, and len() reports
+// initialNumberOfBits.
+func newBitVector(initialNumberOfBits int) bitvector {
+	wordCount := getNumOfBitSetsNeededForNumOfBits(initialNumberOfBits)
+	return bitvector{
+		bits:         make([]uint64, wordCount),
+		numberOfBits: initialNumberOfBits,
+	}
+}
+
+// getBitSet gets the whole word at some word offset from the
+// bitvector. Useful if you'd prefer to work with the
+// word rather than with individual bits.
+//
+// bitSetPos is an index into the underlying words, not a bit
+// position. It is not validated by checkBounds; it is used directly
+// as a slice index.
+func (b bitvector) getBitSet(bitSetPos int) uint64 {
+	return b.bits[bitSetPos]
+}
+
+// getBit reports whether the bit at position i is set.
+// true means the bit is 1, false means it is 0.
+// It panics (via checkBounds) when i is outside the bitvector.
+func (b bitvector) getBit(i int) bool {
+	b.checkBounds(i)
+
+	word := b.bits[i/wordSize]
+	// Position 0 of a word lives in its most significant bit.
+	mask := uint64(1) << (wordSize - 1 - i%wordSize)
+
+	return word&mask != 0
+}
+
+// setBit writes val to the bit at position i.
+// true stores a 1, false stores a 0.
+// It panics (via checkBounds) when i is outside the bitvector.
+//
+// The receiver is a value, but the write lands in the slice's shared
+// backing array, so the change is visible to the caller.
+func (b bitvector) setBit(i int, val bool) {
+	b.checkBounds(i)
+
+	word := &b.bits[i/wordSize]
+	// Position 0 of a word lives in its most significant bit.
+	mask := uint64(1) << (wordSize - 1 - i%wordSize)
+
+	if !val {
+		*word &^= mask
+		return
+	}
+	*word |= mask
+}
+
+// checkBounds panics with a descriptive message when i is not a
+// valid bit position, i.e. when i is negative or not less than len().
+func (b bitvector) checkBounds(i int) {
+	if 0 <= i && i < b.len() {
+		return
+	}
+	panic(fmt.Sprintf("access of %d is out of bounds for bitvector with length %d", i, b.len()))
+}
+
+// len returns the number of bits in the bitvector. This is the
+// logical bit count stored in numberOfBits, not the number of
+// underlying words.
+func (b bitvector) len() int {
+	return b.numberOfBits
+}
+
+// getNumOfBitSetsNeededForNumOfBits returns how many wordSize-bit
+// words are needed to hold n bits, i.e. n / wordSize rounded up.
+//
+// NOTE(review): the division goes through float64, so it is only
+// exact while n is exactly representable as a float64.
+func getNumOfBitSetsNeededForNumOfBits(n int) int {
+	words := math.Ceil(float64(n) / wordSize)
+	return int(words)
+}
diff --git a/bwt/bitvector_test.go b/bwt/bitvector_test.go
@@ -0,0 +1,119 @@
+package bwt
+
+import (
+	"testing"
+)
+
+// GetBitTestCase pairs a bit position with the value getBit is
+// expected to return for it.
+type GetBitTestCase struct {
+	position int  // bit position passed to getBit
+	expected bool // value getBit should return at position
+}
+
+// TestBitVector sets every bit of a bitvector spanning several words,
+// clears a handful of positions, and then checks that getBit reports
+// the expected value for a sample of positions.
+func TestBitVector(t *testing.T) {
+	initialNumberOfBits := wordSize*10 + 1
+
+	bv := newBitVector(initialNumberOfBits)
+	if got := bv.len(); got != initialNumberOfBits {
+		t.Fatalf("expected len to be %d but got %d", initialNumberOfBits, got)
+	}
+
+	// Turn everything on first...
+	for pos := 0; pos < initialNumberOfBits; pos++ {
+		bv.setBit(pos, true)
+	}
+
+	// ...then clear selected positions, including ones on word boundaries.
+	for _, pos := range []int{3, 11, 13, 23, 24, 25, 42, 63, 64, 255, 256} {
+		bv.setBit(pos, false)
+	}
+
+	getBitTestCases := []GetBitTestCase{
+		{0, true},
+		{1, true},
+		{2, true},
+		{3, false},
+		{4, true},
+		{7, true},
+		{8, true},
+		{9, true},
+		{10, true},
+		{11, false},
+		{12, true},
+		{13, false},
+		{23, false},
+		{24, false},
+		{25, false},
+		{42, false},
+		{15, true},
+		{16, true},
+		{62, true},
+		{63, false},
+		{64, false},
+		// Test past the first word
+		{65, true},
+		{72, true},
+		{79, true},
+		{80, true},
+		{255, false},
+		{256, false},
+		{511, true},
+		{512, true},
+	}
+
+	for _, tc := range getBitTestCases {
+		if actual := bv.getBit(tc.position); actual != tc.expected {
+			t.Fatalf("expected %dth bit to be %t but got %t", tc.position, tc.expected, actual)
+		}
+	}
+}
+
+// TestBitVectorBoundPanic_GetBit_Lower passes only if getBit panics
+// when asked for a negative position.
+func TestBitVectorBoundPanic_GetBit_Lower(t *testing.T) {
+	defer func() {
+		// No recovered value means nothing panicked.
+		if recover() == nil {
+			t.Fatalf("expected get bit lower bound panic")
+		}
+	}()
+
+	bv := newBitVector(wordSize*10 + 1)
+	bv.getBit(-1)
+}
+
+// TestBitVectorBoundPanic_GetBit_Upper passes only if getBit panics
+// when asked for the position one past the last valid bit.
+func TestBitVectorBoundPanic_GetBit_Upper(t *testing.T) {
+	defer func() {
+		// No recovered value means nothing panicked.
+		if recover() == nil {
+			t.Fatalf("expected get bit upper bound panic")
+		}
+	}()
+
+	size := wordSize*10 + 1
+	bv := newBitVector(size)
+	bv.getBit(size)
+}
+
+// TestBitVectorBoundPanic_SetBit_Lower passes only if setBit panics
+// when asked to write a negative position.
+func TestBitVectorBoundPanic_SetBit_Lower(t *testing.T) {
+	defer func() {
+		// No recovered value means nothing panicked.
+		if recover() == nil {
+			t.Fatalf("expected set bit lower bound panic")
+		}
+	}()
+
+	bv := newBitVector(wordSize*10 + 1)
+	bv.setBit(-1, true)
+}
+
+// TestBitVectorBoundPanic_SetBit_Upper passes only if setBit panics
+// when asked to write the position one past the last valid bit.
+func TestBitVectorBoundPanic_SetBit_Upper(t *testing.T) {
+	defer func() {
+		// No recovered value means nothing panicked.
+		if recover() == nil {
+			t.Fatalf("expected set bit upper bound panic")
+		}
+	}()
+
+	size := wordSize*10 + 1
+	bv := newBitVector(size)
+	bv.setBit(size, true)
+}