Skip to content

Commit

Permalink
#360 Bwt (#411)
Browse files Browse the repository at this point in the history
* basic bitvector

* jacobsons start and refactor to uint for accurate machine words

* confident that jacobson rank is working

* reusing the incoming bitvector instead of copying everything for
jacobson rank

* access and bounds checking

* just do uint64 for simplicity. bound checking and access

* bit vector fixes, rsa good enough, wavelet start

* Simple wavelet tree with access

* wavelet fix access, add select, fix rsa bitvector select

* got count working, but had to throw out jacobsons

* rsa fixes and refactors

* bwt locate

* extract

* doc BWT, refactor, and return a possible error during construction

* add TODO about sorting and the nullChar

* bwt examples

* wavelet tree doc

* wavelet tree explanation

* doc and note for waveletTree

* add bwt high level. move wavelet tree's some rsa bv docs

* simplify bitvector, docs for bitvector and rsaBitvector

* Cite Ben Langmead.

---------

Co-authored-by: Willow Carretero Chavez <[email protected]>
Co-authored-by: Timothy Stiles <[email protected]>
  • Loading branch information
3 people authored Jan 18, 2024
1 parent 27c7d28 commit 176dd5b
Show file tree
Hide file tree
Showing 10 changed files with 2,455 additions and 0 deletions.
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## [Unreleased]

### Added
- Basic BWT for sub-sequence count and offset for sequence alignment. Only supports exact matches for now.


## [0.30.0] - 2023-12-18
Oops, we weren't keeping a changelog before this tag!
Expand Down
77 changes: 77 additions & 0 deletions bwt/bitvector.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
package bwt

import (
"fmt"
"math"
)

// wordSize is the number of bits held by each uint64 element of a
// bitvector's backing slice. Bit positions are split into a word index
// (i / wordSize) and an offset within that word (i % wordSize).
const wordSize = 64

// bitvector is a sequence of 1's and 0's. You can also think
// of this as an array of bits. This allows us to encode
// data in a memory efficient manner.
type bitvector struct {
// bits is the packed storage: each uint64 holds wordSize consecutive
// bits, with the lowest bit position of a word stored in its most
// significant bit.
bits []uint64
// numberOfBits is the logical length of the vector in bits. It is the
// value reported by len() and the upper bound enforced by checkBounds.
numberOfBits int
}

// newBitVector builds a bitvector that can hold initialNumberOfBits bits,
// all of which start out as 0.
func newBitVector(initialNumberOfBits int) bitvector {
	// make zero-fills the backing words, so every bit begins cleared.
	words := make([]uint64, getNumOfBitSetsNeededForNumOfBits(initialNumberOfBits))

	var bv bitvector
	bv.bits = words
	bv.numberOfBits = initialNumberOfBits
	return bv
}

// getBitSet gets the whole word at some offset from the
// bitvector. Useful if you'd prefer to work with the
// word rather than with individual bits.
//
// bitSetPos is an index into the backing slice of words, not a bit
// position. No checkBounds call is made here, so an out-of-range index
// results in the ordinary slice index panic.
func (b bitvector) getBitSet(bitSetPos int) uint64 {
return b.bits[bitSetPos]
}

// getBit reports whether the bit at position i is set:
// true means the bit is 1, false means it is 0.
// It panics (through checkBounds) when i is outside the vector.
func (b bitvector) getBit(i int) bool {
	b.checkBounds(i)

	// Bits are packed most-significant-first, so the first position of
	// each word lives in bit 63 of the uint64.
	word := b.bits[i/wordSize]
	shift := 63 - i%wordSize

	return (word>>shift)&1 == 1
}

// setBit writes val to the bit at position i:
// true stores a 1, false stores a 0.
// It panics (through checkBounds) when i is outside the vector.
//
// The receiver is a value, but the write is still visible to the caller
// because the copied struct shares the same backing slice.
func (b bitvector) setBit(i int, val bool) {
	b.checkBounds(i)

	// Bits are packed most-significant-first within each word.
	word := &b.bits[i/wordSize]
	mask := uint64(1) << (63 - i%wordSize)

	if !val {
		*word &^= mask
		return
	}
	*word |= mask
}

// checkBounds panics with a descriptive message unless i is a valid bit
// position, i.e. 0 <= i < b.len(). It returns normally otherwise.
func (b bitvector) checkBounds(i int) {
	if 0 <= i && i < b.len() {
		return
	}
	panic(fmt.Sprintf("access of %d is out of bounds for bitvector with length %d", i, b.len()))
}

// len returns the logical length of the bitvector in bits (the
// numberOfBits field), not the number of uint64 words in the backing slice.
func (b bitvector) len() int {
return b.numberOfBits
}

// getNumOfBitSetsNeededForNumOfBits returns how many wordSize-bit words
// are required to store n bits. The division is rounded up so that a
// partially filled final word is still counted.
func getNumOfBitSetsNeededForNumOfBits(n int) int {
	words := math.Ceil(float64(n) / wordSize)
	return int(words)
}
119 changes: 119 additions & 0 deletions bwt/bitvector_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
package bwt

import (
"testing"
)

// GetBitTestCase pairs a bit position with the value getBit is expected
// to return for that position.
type GetBitTestCase struct {
position int
expected bool
}

// TestBitVector sets every bit of a multi-word bitvector, clears a handful
// of positions, and then checks that getBit reports the expected value,
// including at positions on either side of word boundaries.
func TestBitVector(t *testing.T) {
	size := wordSize*10 + 1

	bv := newBitVector(size)
	if got := bv.len(); got != size {
		t.Fatalf("expected len to be %d but got %d", size, got)
	}

	// Start from an all-ones vector...
	for pos := 0; pos < size; pos++ {
		bv.setBit(pos, true)
	}

	// ...then clear selected positions, including both sides of word edges.
	for _, pos := range []int{3, 11, 13, 23, 24, 25, 42, 63, 64, 255, 256} {
		bv.setBit(pos, false)
	}

	cases := []GetBitTestCase{
		{position: 0, expected: true},
		{position: 1, expected: true},
		{position: 2, expected: true},
		{position: 3, expected: false},
		{position: 4, expected: true},
		{position: 7, expected: true},
		{position: 8, expected: true},
		{position: 9, expected: true},
		{position: 10, expected: true},
		{position: 11, expected: false},
		{position: 12, expected: true},
		{position: 13, expected: false},
		{position: 23, expected: false},
		{position: 24, expected: false},
		{position: 25, expected: false},
		{position: 42, expected: false},
		{position: 15, expected: true},
		{position: 16, expected: true},
		{position: 62, expected: true},
		{position: 63, expected: false},
		{position: 64, expected: false},
		// Test past the first word
		{position: 65, expected: true},
		{position: 72, expected: true},
		{position: 79, expected: true},
		{position: 80, expected: true},
		{position: 255, expected: false},
		{position: 256, expected: false},
		{position: 511, expected: true},
		{position: 512, expected: true},
	}

	for _, tc := range cases {
		if got := bv.getBit(tc.position); got != tc.expected {
			t.Fatalf("expected %dth bit to be %t but got %t", tc.position, tc.expected, got)
		}
	}
}

// TestBitVectorBoundPanic_GetBit_Lower verifies that getBit panics when
// asked for a negative position.
func TestBitVectorBoundPanic_GetBit_Lower(t *testing.T) {
	// Inspect the recovered value explicitly instead of discarding it, so
	// the pass/fail decision is made in one place: a non-nil value means
	// the expected panic happened, anything else is a failure.
	defer func() {
		if r := recover(); r != nil {
			return
		}
		t.Fatalf("expected get bit lower bound panic")
	}()

	initialNumberOfBits := wordSize*10 + 1
	bv := newBitVector(initialNumberOfBits)
	bv.getBit(-1)
}

// TestBitVectorBoundPanic_GetBit_Upper verifies that getBit panics when
// asked for the position one past the last valid bit.
func TestBitVectorBoundPanic_GetBit_Upper(t *testing.T) {
	// Inspect the recovered value explicitly instead of discarding it, so
	// the pass/fail decision is made in one place: a non-nil value means
	// the expected panic happened, anything else is a failure.
	defer func() {
		if r := recover(); r != nil {
			return
		}
		t.Fatalf("expected get bit upper bound panic")
	}()

	initialNumberOfBits := wordSize*10 + 1
	bv := newBitVector(initialNumberOfBits)
	bv.getBit(initialNumberOfBits)
}

func TestBitVectorBoundPanic_SetBit_Lower(t *testing.T) {
defer func() {
if r := recover(); r != nil {
return
}
t.Fatalf("expected set bit lower bound panic")
}()
initialNumberOfBits := wordSize*10 + 1
bv := newBitVector(initialNumberOfBits)
bv.setBit(-1, true)
}

// TestBitVectorBoundPanic_SetBit_Upper verifies that setBit panics when
// given the position one past the last valid bit.
func TestBitVectorBoundPanic_SetBit_Upper(t *testing.T) {
	defer func() {
		// A nil recover result means setBit returned without panicking.
		if recover() == nil {
			t.Fatalf("expected set bit upper bound panic")
		}
	}()

	size := wordSize*10 + 1
	bv := newBitVector(size)
	bv.setBit(size, true)
}
Loading

0 comments on commit 176dd5b

Please sign in to comment.