From fcaa7d3a80c36d308a290a54df50a1176d1f2f13 Mon Sep 17 00:00:00 2001 From: Trenton Date: Fri, 10 Nov 2023 12:22:49 -0500 Subject: [PATCH 01/60] progress --- bwt/bwt.go | 142 ++++++++++++++++++++++++++++++++++++++++++++++++ bwt/bwt_test.go | 69 +++++++++++++++++++++++ 2 files changed, 211 insertions(+) create mode 100644 bwt/bwt.go create mode 100644 bwt/bwt_test.go diff --git a/bwt/bwt.go b/bwt/bwt.go new file mode 100644 index 000000000..ff5477954 --- /dev/null +++ b/bwt/bwt.go @@ -0,0 +1,142 @@ +package bwt + +import ( + "golang.org/x/exp/slices" +) + +const nullChar = "$" + +type window struct { + char byte + start int + end int +} + +func (w window) includes(innerWindow window) bool { + return w.start <= innerWindow.start && innerWindow.end <= w.end +} + +// BWT Burrow Wheeler Transform +// Data structure that compactly represents any sequence of characters and +// allows for sub sequence querying. +type BWT struct { + // First col + f []window + // Last col + l map[byte][]window + // index of the original sequence in the suffix array from BTW construction + indexOfOriginalSequenceFromSuffixArray int +} + +func New(sequence string) BWT { + f, l, idx := build(sequence) + return BWT{ + f: f, + l: l, + indexOfOriginalSequenceFromSuffixArray: idx, + } +} + +func (b BWT) QueryExistence(substr string) bool { + for i := len(substr) - 1; i >= 0; i-- { + win, ok := b.getFWindow(substr[i]) + if !ok { + return false + } + + if i == 0 && ok { + return true + } + + valid := b.charInLExistsInFWindow(win, substr[i-1]) + + if !valid { + return false + } + } + + // shouldn't be getting here + return false +} + +func (b BWT) charInLExistsInFWindow(w window, char byte) bool { + if windows, ok := b.l[char]; ok { + for i := range windows { + if w.includes(windows[i]) { + return true + } + } + } + return false +} + +// Alphabets should be small +func (b BWT) getFWindow(char byte) (w window, ok bool) { + for i := range b.f { + if b.f[i].char == char { + return b.f[i], true + } + } 
+ return window{}, false +} + +func build(s string) (f []window, l map[byte][]window, indexOfOriginalSequenceInSuffixArray int) { + s += nullChar + prefixArray := make([]string, len(s)) + for i := 0; i < len(s); i++ { + prefixArray[i] = s[len(s)-i-1:] + } + + slices.Sort(prefixArray) + + l = make(map[byte][]window) + prevFChar := prefixArray[0][0] + prevFWin := window{char: prevFChar, start: 0} + prevLChar := s[getBWTIndex(len(s), len(prefixArray[0]))] + prevLWin := window{char: prevLChar, start: 0} + for i := 1; i < len(prefixArray); i++ { + currFChar := prefixArray[i][0] + if prevFChar != currFChar { + prevFWin.end = i - 1 + f = append(f, prevFWin) + prevFChar = currFChar + prevFWin = window{char: currFChar, start: i} + } + + currLChar := s[getBWTIndex(len(s), len(prefixArray[i]))] + if prevLChar != currLChar { + prevLWin.end = i - 1 + if _, ok := l[prevLChar]; ok { + l[prevLChar] = append(l[prevLChar], prevLWin) + } else { + l[prevLChar] = []window{prevLWin} + } + prevLChar = currLChar + prevLWin = window{char: currLChar, start: i} + } + if len(s) == len(prefixArray[i]) { + indexOfOriginalSequenceInSuffixArray = i + } + } + prevFWin.end = len(s) - 1 + f = append(f, prevFWin) + prevLWin.end = len(s) - 1 + if _, ok := l[prevLChar]; ok { + l[prevLChar] = append(l[prevLChar], prevLWin) + } else { + l[prevLChar] = []window{prevLWin} + } + if indexOfOriginalSequenceInSuffixArray == 0 { + indexOfOriginalSequenceInSuffixArray = len(s) - 1 + } + + return f, l, indexOfOriginalSequenceInSuffixArray +} + +func getBWTIndex(lenOfSequenceBeingBuilt, lenOfSuffixArrayVisited int) int { + bwtCharIndex := lenOfSequenceBeingBuilt - lenOfSuffixArrayVisited - 1 + if bwtCharIndex == -1 { + bwtCharIndex = lenOfSequenceBeingBuilt - 1 + } + return bwtCharIndex +} diff --git a/bwt/bwt_test.go b/bwt/bwt_test.go new file mode 100644 index 000000000..922829a98 --- /dev/null +++ b/bwt/bwt_test.go @@ -0,0 +1,69 @@ +package bwt + +import ( + "testing" +) + +type QueryTest struct { + seq string 
+ expected bool +} + +func TestQueryBWT(t *testing.T) { + bwt := New("BANANA") + + testTable := []QueryTest{ + {"NANA", true}, + {"ANA", true}, + {"NA", true}, + {"B", true}, + {"N", true}, + {"BA", true}, + {"ANANA", true}, + {"QWERTY", false}, + {"ANANANA", false}, + {"ABCD", false}, + {"ABA", false}, + } + + for _, v := range testTable { + res := bwt.QueryExistence(v.seq) + if res != v.expected { + t.Fatalf("Test=%s ExpectedQueryExistence=%v Received=%v", v.seq, v.expected, res) + } + } +} + +func BenchmarkBWTBuildPower12(b *testing.B) { + base := "!BANANA!" + BaseBenchmarkBWTBuild(base, 12, b) +} + +//go:noinline +func BaseBenchmarkBWTBuild(base string, power int, b *testing.B) { + for n := 0; n < b.N; n++ { + buildBWTForBench(base, power) + } +} + +func buildBWTForBench(base string, power int) BWT { + test := base + for i := 0; i < power; i++ { + test += test + } + + return New(test) +} + +func BenchmarkBWTQueryPower12(b *testing.B) { + base := "!BANANA!" + bwt := buildBWTForBench(base, 12) + BaseBenchmarkBWTQuery(bwt, "ANANABANANA", b) +} + +//go:noinline +func BaseBenchmarkBWTQuery(bwt BWT, seq string, b *testing.B) { + for n := 0; n < b.N; n++ { + bwt.QueryExistence(seq) + } +} From 8813062f3907c318ace000b94871d67e4516323a Mon Sep 17 00:00:00 2001 From: Trenton Date: Wed, 22 Nov 2023 23:07:39 -0500 Subject: [PATCH 02/60] basic bitvector --- bwt/bitvector.go | 96 ++++++++++++++++++++++++++ bwt/bitvector_test.go | 154 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 250 insertions(+) create mode 100644 bwt/bitvector.go create mode 100644 bwt/bitvector_test.go diff --git a/bwt/bitvector.go b/bwt/bitvector.go new file mode 100644 index 000000000..c35afca33 --- /dev/null +++ b/bwt/bitvector.go @@ -0,0 +1,96 @@ +package bwt + +import "math" + +const chunkSize = 8 + +// TODO: document static size and why we differentiate between capacity and number of bits +type bitvector struct { + bits []uint8 + capacityInChunks int + numberOfBits int +} + +func 
newBitVector(initialNumberOfBits int) bitvector { + capacity := getCapacityNeededForNumberOfBits(initialNumberOfBits) + bits := make([]uint8, capacity) + return bitvector{ + bits: bits, + capacityInChunks: capacity, + numberOfBits: initialNumberOfBits, + } +} + +func (b bitvector) getBit(i int) bool { + if i >= b.len() || i < 0 { + panic("better out of bounds message") + } + + chunkStart := i / chunkSize + offset := i % chunkSize + + return (b.bits[chunkStart] & (uint8(1) << offset)) != 0 +} + +func (b bitvector) setBit(i int, val bool) { + if i >= b.len() || i < 0 { + panic("better out of bounds message") + } + + chunkStart := i / chunkSize + offset := i % chunkSize + + if val { + b.bits[chunkStart] |= uint8(1) << offset + } else { + b.bits[chunkStart] &= ^(uint8(1) << offset) + } +} + +const factor1point2Threshold = 1e9 +const factor1point5Threshold = 1e6 + +func (b *bitvector) push(val bool) { + previousNumberOfBits := b.numberOfBits + nextNumberOfBits := previousNumberOfBits + 1 + if getCapacityNeededForNumberOfBits(nextNumberOfBits) <= b.capacityInChunks { + b.numberOfBits = nextNumberOfBits + b.setBit(previousNumberOfBits, val) + return + } + + var numOfBitsForNextCapacity int + switch true { + case nextNumberOfBits >= factor1point2Threshold: + numOfBitsForNextCapacity = int(math.Ceil(float64(b.numberOfBits) * 1.2)) + break + case nextNumberOfBits >= factor1point5Threshold: + numOfBitsForNextCapacity = int(math.Ceil(float64(b.numberOfBits) * 1.5)) + break + default: + numOfBitsForNextCapacity = b.numberOfBits * 2 + } + + nextCapacity := getCapacityNeededForNumberOfBits(numOfBitsForNextCapacity) + + nextBits := make([]uint8, nextCapacity) + copy(b.bits, nextBits) + b.bits = nextBits + + b.numberOfBits = nextNumberOfBits + b.capacityInChunks = nextCapacity + + b.setBit(previousNumberOfBits, val) +} + +func (b bitvector) len() int { + return b.numberOfBits +} + +func (b bitvector) capacity() int { + return b.capacityInChunks +} + +func 
getCapacityNeededForNumberOfBits(n int) int { + return int(math.Ceil(float64(n) / 8.0)) +} diff --git a/bwt/bitvector_test.go b/bwt/bitvector_test.go new file mode 100644 index 000000000..8eafd152c --- /dev/null +++ b/bwt/bitvector_test.go @@ -0,0 +1,154 @@ +package bwt + +import "testing" + +type GetBitTestCase struct { + position int + expected bool +} + +func TestBitVector(t *testing.T) { + initialNumberOfBits := 81 + expectedCapacity := 11 + + bv := newBitVector(initialNumberOfBits) + + if bv.capacity() != expectedCapacity { + t.Fatalf("expected capacity to be %d but got %d", expectedCapacity, bv.capacity()) + } + + if bv.len() != initialNumberOfBits { + t.Fatalf("expected len to be %d but got %d", initialNumberOfBits, bv.len()) + } + + for i := 0; i < initialNumberOfBits; i++ { + bv.setBit(i, true) + } + + bv.setBit(3, false) + bv.setBit(11, false) + bv.setBit(13, false) + bv.setBit(23, false) + bv.setBit(24, false) + bv.setBit(25, false) + bv.setBit(42, false) + + getBitTestCases := []GetBitTestCase{ + {0, true}, + {1, true}, + {2, true}, + {3, false}, + {4, true}, + {7, true}, + {8, true}, + {9, true}, + {10, true}, + {11, false}, + {12, true}, + {13, false}, + {23, false}, + {24, false}, + {25, false}, + {42, false}, + {15, true}, + {16, true}, + {72, true}, + {79, true}, + {80, true}, + } + + for _, v := range getBitTestCases { + actual := bv.getBit(v.position) + if actual != v.expected { + t.Fatalf("expected %dth bit to be %t but got %t", v.position, v.expected, actual) + } + } +} + +func TestBitVectorBoundPanic_GetBit_Lower(t *testing.T) { + defer func() { + if r := recover(); r != nil { + return + } + t.Fatalf("expected get bit lower bound panic") + }() + initialNumberOfBits := 81 + bv := newBitVector(initialNumberOfBits) + bv.getBit(-1) +} + +func TestBitVectorBoundPanic_GetBit_Upper(t *testing.T) { + defer func() { + if r := recover(); r != nil { + return + } + t.Fatalf("expected get bit upper bound panic") + }() + initialNumberOfBits := 81 + bv := 
newBitVector(initialNumberOfBits) + bv.getBit(81) +} + +func TestBitVectorBoundPanic_SetBit_Lower(t *testing.T) { + defer func() { + if r := recover(); r != nil { + return + } + t.Fatalf("expected set bit lower bound panic") + }() + initialNumberOfBits := 81 + bv := newBitVector(initialNumberOfBits) + bv.setBit(-1, true) +} + +func TestBitVectorBoundPanic_SetBit_Upper(t *testing.T) { + defer func() { + if r := recover(); r != nil { + return + } + t.Fatalf("expected set bit upper bound panic") + }() + initialNumberOfBits := 81 + bv := newBitVector(initialNumberOfBits) + bv.setBit(81, true) +} + +func TestBitVectorPush_NextPushLessThanCapacity_Single(t *testing.T) { + initialNumberOfBits := 81 + bv := newBitVector(initialNumberOfBits) + bv.push(true) + + expectedCapacity := 11 + if bv.capacity() != expectedCapacity { + t.Fatalf("expected capacity to be %d but got %d", expectedCapacity, bv.capacity()) + } + + expectedLength := initialNumberOfBits + 1 + if bv.len() != expectedLength { + t.Fatalf("expected len to be %d but got %d", expectedLength, bv.len()) + } + + if bv.getBit(81) != true { + t.Fatalf("expected 81th bit to be %t but got %t", true, bv.getBit(81)) + } +} + +func TestBitVectorPush_NextPushGreaterThanCapacity_Single(t *testing.T) { + initialNumberOfBits := 88 + bv := newBitVector(initialNumberOfBits) + initialCapacity := bv.capacity() + bv.push(true) + + if bv.capacity() <= initialCapacity { + t.Fatalf("expected capacity to have grown. 
currently the capacity is %d and was previously %d", bv.capacity(), initialCapacity) + } + + expectedLength := initialNumberOfBits + 1 + if bv.len() != expectedLength { + t.Fatalf("expected len to be %d but got %d", expectedLength, bv.len()) + } + + if bv.getBit(88) != true { + t.Fatalf("expected 88th bit to be %t but got %t", true, bv.getBit(88)) + } +} From e0c2eec253b6c65c9861abc953c92ab518eab950 Mon Sep 17 00:00:00 2001 From: Trenton Date: Thu, 23 Nov 2023 00:57:32 -0500 Subject: [PATCH 03/60] jacobsons start and refactor to uint for accurate machine words --- bwt/bitvector.go | 99 ++++++++++++++++++++++++++++++++++++------- bwt/bitvector_test.go | 45 +++++++++----------- 2 files changed, 103 insertions(+), 41 deletions(-) diff --git a/bwt/bitvector.go b/bwt/bitvector.go index c35afca33..ab083b60c 100644 --- a/bwt/bitvector.go +++ b/bwt/bitvector.go @@ -1,19 +1,23 @@ package bwt -import "math" +import ( + "math" + "math/bits" +) -const chunkSize = 8 +// TODO: talk about why this is +const wordSize = bits.UintSize // TODO: document static size and why we differentiate between capacity and number of bits type bitvector struct { - bits []uint8 + bits []uint capacityInChunks int numberOfBits int } func newBitVector(initialNumberOfBits int) bitvector { capacity := getCapacityNeededForNumberOfBits(initialNumberOfBits) - bits := make([]uint8, capacity) + bits := make([]uint, capacity) return bitvector{ bits: bits, capacityInChunks: capacity, @@ -26,10 +30,10 @@ func (b bitvector) getBit(i int) bool { panic("better out of bounds message") } - chunkStart := i / chunkSize - offset := i % chunkSize + chunkStart := i / wordSize + offset := i % wordSize - return (b.bits[chunkStart] & (uint8(1) << offset)) != 0 + return (b.bits[chunkStart] & (uint(1) << offset)) != 0 } func (b bitvector) setBit(i int, val bool) { @@ -37,13 +41,13 @@ func (b bitvector) setBit(i int, val bool) { panic("better out of bounds message") } - chunkStart := i / chunkSize - offset := i % chunkSize + 
chunkStart := i / wordSize + offset := i % wordSize if val { - b.bits[chunkStart] |= uint8(1) << offset + b.bits[chunkStart] |= uint(1) << offset } else { - b.bits[chunkStart] &= ^(uint8(1) << offset) + b.bits[chunkStart] &= ^(uint(1) << offset) } } @@ -62,18 +66,18 @@ func (b *bitvector) push(val bool) { var numOfBitsForNextCapacity int switch true { case nextNumberOfBits >= factor1point2Threshold: - numOfBitsForNextCapacity = int(math.Ceil(float64(b.numberOfBits) * 1.2)) + numOfBitsForNextCapacity = int(math.Ceil(float64(previousNumberOfBits) * 1.2)) break case nextNumberOfBits >= factor1point5Threshold: - numOfBitsForNextCapacity = int(math.Ceil(float64(b.numberOfBits) * 1.5)) + numOfBitsForNextCapacity = int(math.Ceil(float64(previousNumberOfBits) * 1.5)) break default: - numOfBitsForNextCapacity = b.numberOfBits * 2 + numOfBitsForNextCapacity = previousNumberOfBits * 2 } nextCapacity := getCapacityNeededForNumberOfBits(numOfBitsForNextCapacity) - nextBits := make([]uint8, nextCapacity) + nextBits := make([]uint, nextCapacity) copy(b.bits, nextBits) b.bits = nextBits @@ -92,5 +96,68 @@ func (b bitvector) capacity() int { } func getCapacityNeededForNumberOfBits(n int) int { - return int(math.Ceil(float64(n) / 8.0)) + return int(math.Ceil(float64(n) / wordSize)) } + +// TODO: doc what rsa is, why these DSAs, and why we take in a bit vector +type RSABitVector struct { + jacobsonRank []chunk + clarkSelect []bitvector +} + +type chunk struct { + bits bitvector + onesCumulativeRank int +} + +func newRSABitVector(b bitvector) RSABitVector { + return RSABitVector{} +} + +// TODO: doc how this is ugly and building is probably bad. 
talk about chunk size +func buildJacobsonRank(inBv bitvector) (int, []chunk) { + // TODO: doc magic numbers and doc that this will always be a natural number + uLen := uint(inBv.len()) + leading1Offset := bits.UintSize - bits.LeadingZeros(uLen) + perfectSquare := int(uint(1) << uint(leading1Offset)) + chunkSize := int(math.Pow(math.Log2(float64(perfectSquare)), 2)) + numChunks := inBv.len() / chunkSize + + // TODO: doc why we have the plus 1 + jacobsonRank := make([]chunk, numChunks+1) + onesCount := 0 + for i := 0; i < numChunks; i++ { + chunkBv := newBitVector(chunkSize) + for j := 0; j < chunkSize; j++ { + val := inBv.getBit(i*wordSize + j) + if val { + onesCount++ + } + chunkBv.setBit(j, val) + } + jacobsonRank[i] = chunk{ + bits: chunkBv, + onesCumulativeRank: onesCount, + } + } + + // TODO: doc the last chunk + lastChunkSize := inBv.len() % int(perfectSquare) + lastChunkBv := newBitVector(lastChunkSize) + for i := 0; i < lastChunkSize; i++ { + val := inBv.getBit(numChunks*wordSize + i) + if val { + onesCount++ + } + lastChunkBv.setBit(i, val) + } + jacobsonRank[len(jacobsonRank)-1] = chunk{ + bits: lastChunkBv, + onesCumulativeRank: onesCount, + } + + return chunkSize, jacobsonRank +} + +// TODO: 15:16 sub chunk impl +// TODO: 17:25 sub chunk calc of rank lower than machine word impl. 
diff --git a/bwt/bitvector_test.go b/bwt/bitvector_test.go index 8eafd152c..ff34e4374 100644 --- a/bwt/bitvector_test.go +++ b/bwt/bitvector_test.go @@ -8,7 +8,7 @@ type GetBitTestCase struct { } func TestBitVector(t *testing.T) { - initialNumberOfBits := 81 + initialNumberOfBits := wordSize*10 + 1 expectedCapacity := 11 bv := newBitVector(initialNumberOfBits) @@ -66,27 +66,22 @@ func TestBitVector(t *testing.T) { } func TestBitVectorBoundPanic_GetBit_Lower(t *testing.T) { - defer func() { - if r := recover(); r != nil { - return - } - t.Fatalf("expected get bit lower bound panic") - }() - initialNumberOfBits := 81 + defer func() { _ = recover() }() + + initialNumberOfBits := wordSize*10 + 1 bv := newBitVector(initialNumberOfBits) bv.getBit(-1) + + t.Fatalf("expected get bit lower bound panic") } func TestBitVectorBoundPanic_GetBit_Upper(t *testing.T) { - defer func() { - if r := recover(); r != nil { - return - } - t.Fatalf("expected get bit upper bound panic") - }() - initialNumberOfBits := 81 + defer func() { _ = recover() }() + initialNumberOfBits := wordSize*10 + 1 bv := newBitVector(initialNumberOfBits) - bv.getBit(81) + bv.getBit(initialNumberOfBits) + + t.Fatalf("expected get bit upper bound panic") } func TestBitVectorBoundPanic_SetBit_Lower(t *testing.T) { @@ -96,7 +91,7 @@ func TestBitVectorBoundPanic_SetBit_Lower(t *testing.T) { } t.Fatalf("expected set bit lower bound panic") }() - initialNumberOfBits := 81 + initialNumberOfBits := wordSize*10 + 1 bv := newBitVector(initialNumberOfBits) bv.setBit(-1, true) } @@ -108,13 +103,13 @@ func TestBitVectorBoundPanic_SetBit_Upper(t *testing.T) { } t.Fatalf("expected set bit upper bound panic") }() - initialNumberOfBits := 81 + initialNumberOfBits := wordSize*10 + 1 bv := newBitVector(initialNumberOfBits) - bv.setBit(81, true) + bv.setBit(initialNumberOfBits, true) } func TestBitVectorPush_NextPushLessThanCapacity_Single(t *testing.T) { - initialNumberOfBits := 81 + initialNumberOfBits := wordSize*10 + 1 bv := 
newBitVector(initialNumberOfBits) bv.push(true) @@ -128,13 +123,13 @@ func TestBitVectorPush_NextPushLessThanCapacity_Single(t *testing.T) { t.Fatalf("expected len to be %d but got %d", expectedLength, bv.len()) } - if bv.getBit(81) != true { - t.Fatalf("expected 81th bit to be %t but got %t", true, bv.getBit(81)) + if bv.getBit(initialNumberOfBits) != true { + t.Fatalf("expected %dth bit to be %t but got %t", initialNumberOfBits, true, bv.getBit(initialNumberOfBits)) } } func TestBitVectorPush_NextPushGreaterThanCapacity_Single(t *testing.T) { - initialNumberOfBits := 88 + initialNumberOfBits := wordSize * 10 bv := newBitVector(initialNumberOfBits) initialCapacity := bv.capacity() bv.push(true) @@ -148,7 +143,7 @@ func TestBitVectorPush_NextPushGreaterThanCapacity_Single(t *testing.T) { t.Fatalf("expected len to be %d but got %d", expectedLength, bv.len()) } - if bv.getBit(88) != true { - t.Fatalf("expected 88th bit to be %t but got %t", true, bv.getBit(88)) + if bv.getBit(initialNumberOfBits) != true { + t.Fatalf("expected %dth bit to be %t but got %t", initialNumberOfBits, true, bv.getBit(initialNumberOfBits)) } } From 2fe342f94c314093bb8084e3b77ac66cd7724983 Mon Sep 17 00:00:00 2001 From: Trenton Date: Sat, 25 Nov 2023 01:02:18 -0500 Subject: [PATCH 04/60] basic rank test --- bwt/bitvector.go | 138 ++++++++++++++++++++++++++---------------- bwt/bitvector_test.go | 103 ++++++++++++++++++++++++++++++- 2 files changed, 188 insertions(+), 53 deletions(-) diff --git a/bwt/bitvector.go b/bwt/bitvector.go index ab083b60c..578aaff11 100644 --- a/bwt/bitvector.go +++ b/bwt/bitvector.go @@ -16,7 +16,7 @@ type bitvector struct { } func newBitVector(initialNumberOfBits int) bitvector { - capacity := getCapacityNeededForNumberOfBits(initialNumberOfBits) + capacity := getNumOfBitSetsNeededForNumOfBits(initialNumberOfBits) bits := make([]uint, capacity) return bitvector{ bits: bits, @@ -25,6 +25,14 @@ func newBitVector(initialNumberOfBits int) bitvector { } } +func (b 
bitvector) getNumOfBitSets() int { + return getNumOfBitSetsNeededForNumOfBits(b.len()) +} + +func (b bitvector) getBitSet(i int) uint { + return b.bits[i] +} + func (b bitvector) getBit(i int) bool { if i >= b.len() || i < 0 { panic("better out of bounds message") @@ -57,7 +65,7 @@ const factor1point5Threshold = 1e6 func (b *bitvector) push(val bool) { previousNumberOfBits := b.numberOfBits nextNumberOfBits := previousNumberOfBits + 1 - if getCapacityNeededForNumberOfBits(nextNumberOfBits) <= b.capacityInChunks { + if getNumOfBitSetsNeededForNumOfBits(nextNumberOfBits) <= b.capacityInChunks { b.numberOfBits = nextNumberOfBits b.setBit(previousNumberOfBits, val) return @@ -75,7 +83,7 @@ func (b *bitvector) push(val bool) { numOfBitsForNextCapacity = previousNumberOfBits * 2 } - nextCapacity := getCapacityNeededForNumberOfBits(numOfBitsForNextCapacity) + nextCapacity := getNumOfBitSetsNeededForNumOfBits(numOfBitsForNextCapacity) nextBits := make([]uint, nextCapacity) copy(b.bits, nextBits) @@ -95,69 +103,95 @@ func (b bitvector) capacity() int { return b.capacityInChunks } -func getCapacityNeededForNumberOfBits(n int) int { +func getNumOfBitSetsNeededForNumOfBits(n int) int { return int(math.Ceil(float64(n) / wordSize)) } // TODO: doc what rsa is, why these DSAs, and why we take in a bit vector type RSABitVector struct { - jacobsonRank []chunk - clarkSelect []bitvector + numOfBits int + jrc []chunk + jrSubChunksPerChunk int + jrBitsPerSubChunk int + clarkSelect []bitvector +} + +func newRSABitVector(b bitvector) RSABitVector { + jacobsonRankChunks, jacobsonRankNumOfSubChunksPerChunk, jacobsonRankNumOfBitsPerSubChunk := buildJacobsonRank(b) + return RSABitVector{ + numOfBits: b.len(), + jrc: jacobsonRankChunks, + jrSubChunksPerChunk: jacobsonRankNumOfSubChunksPerChunk, + jrBitsPerSubChunk: jacobsonRankNumOfBitsPerSubChunk, + clarkSelect: []bitvector{}, + } +} + +// TODO: doc and mention some bit math +func (rsa RSABitVector) rank(val bool, i int) int { + chunkPos := i 
/ (len(rsa.jrc) * rsa.jrSubChunksPerChunk * rsa.jrBitsPerSubChunk) + chunk := rsa.jrc[chunkPos] + + subChunkPos := (i % (len(rsa.jrc) * rsa.jrSubChunksPerChunk * rsa.jrBitsPerSubChunk)) / rsa.jrBitsPerSubChunk + subChunk := chunk.subChunks[subChunkPos] + + bitOffset := i % rsa.jrBitsPerSubChunk + + shiftRightAmount := uint(rsa.jrBitsPerSubChunk - bitOffset) + if val { + remaining := subChunk.bitSet >> shiftRightAmount + return chunk.onesCumulativeRank + subChunk.onesCumulativeRank + bits.OnesCount(remaining) + } + remaining := ^subChunk.bitSet >> shiftRightAmount + return (chunkPos*rsa.jrSubChunksPerChunk*rsa.jrBitsPerSubChunk - chunk.onesCumulativeRank) + (subChunkPos * rsa.jrBitsPerSubChunk) - subChunk.onesCumulativeRank + bits.OnesCount(remaining) } type chunk struct { - bits bitvector + subChunks []subChunk onesCumulativeRank int } -func newRSABitVector(b bitvector) RSABitVector { - return RSABitVector{} -} - -// TODO: doc how this is ugly and building is probably bad. talk about chunk size -func buildJacobsonRank(inBv bitvector) (int, []chunk) { - // TODO: doc magic numbers and doc that this will always be a natural number - uLen := uint(inBv.len()) - leading1Offset := bits.UintSize - bits.LeadingZeros(uLen) - perfectSquare := int(uint(1) << uint(leading1Offset)) - chunkSize := int(math.Pow(math.Log2(float64(perfectSquare)), 2)) - numChunks := inBv.len() / chunkSize - - // TODO: doc why we have the plus 1 - jacobsonRank := make([]chunk, numChunks+1) - onesCount := 0 - for i := 0; i < numChunks; i++ { - chunkBv := newBitVector(chunkSize) - for j := 0; j < chunkSize; j++ { - val := inBv.getBit(i*wordSize + j) - if val { - onesCount++ - } - chunkBv.setBit(j, val) - } - jacobsonRank[i] = chunk{ - bits: chunkBv, - onesCumulativeRank: onesCount, - } - } +type subChunk struct { + bitSet uint + onesCumulativeRank int +} - // TODO: doc the last chunk - lastChunkSize := inBv.len() % int(perfectSquare) - lastChunkBv := newBitVector(lastChunkSize) - for i := 0; i < 
lastChunkSize; i++ { - val := inBv.getBit(numChunks*wordSize + i) - if val { - onesCount++ +// TODO: talk about easy to read instead vs perf +func buildJacobsonRank(inBv bitvector) (jacobsonRankChunks []chunk, numOfSubChunksPerChunk, numOfBitsPerSubChunk int) { + // TODO: talk about why this is probably good enough, improves as n grows, gets worse as n gets smaller, and how this fits into a machine instruction, and how this is "simple" + numOfSubChunksPerChunk = 4 + + chunkCumulativeRank := 0 + subChunkCumulativeRank := 0 + + var currSubChunks []subChunk + for _, bitSet := range inBv.bits { + if len(currSubChunks) == numOfSubChunksPerChunk { + jacobsonRankChunks = append(jacobsonRankChunks, chunk{ + subChunks: currSubChunks, + onesCumulativeRank: chunkCumulativeRank, + }) + + chunkCumulativeRank += subChunkCumulativeRank + + currSubChunks = nil + subChunkCumulativeRank = 0 } - lastChunkBv.setBit(i, val) + currSubChunks = append(currSubChunks, subChunk{ + bitSet: bitSet, + onesCumulativeRank: subChunkCumulativeRank, + }) + + onesCount := bits.OnesCount(bitSet) + subChunkCumulativeRank += onesCount } - jacobsonRank[len(jacobsonRank)-1] = chunk{ - bits: lastChunkBv, - onesCumulativeRank: onesCount, + + if currSubChunks != nil { + jacobsonRankChunks = append(jacobsonRankChunks, chunk{ + subChunks: currSubChunks, + onesCumulativeRank: chunkCumulativeRank, + }) } - return chunkSize, jacobsonRank + return jacobsonRankChunks, numOfSubChunksPerChunk, wordSize } - -// TODO: 15:16 sub chunk impl -// TODO: 17:25 sub chunk calc of rank lower than machine word impl. 
diff --git a/bwt/bitvector_test.go b/bwt/bitvector_test.go index ff34e4374..35e730696 100644 --- a/bwt/bitvector_test.go +++ b/bwt/bitvector_test.go @@ -1,6 +1,11 @@ package bwt -import "testing" +import ( + "fmt" + "math/bits" + "strconv" + "testing" +) type GetBitTestCase struct { position int @@ -147,3 +152,99 @@ func TestBitVectorPush_NextPushGreaterThanCapacity_Single(t *testing.T) { t.Fatalf("expected %dth bit to be %t but got %t", initialNumberOfBits, true, bv.getBit(initialNumberOfBits)) } } + +type rsaRankTestCase struct { + val bool + bitPosition int + expectedRank int +} + +func TestRSARank_wordSize64_singleCompleteChunk(t *testing.T) { + if wordSize != 64 { + t.Skip() + } + + initialNumberOfBits := wordSize * 4 + bv := newBitVector(initialNumberOfBits) + + w0 := uint(0x8000000000000001) + fmt.Println(bits.OnesCount(^w0)) + fmt.Println(strconv.FormatUint(uint64(w0), 2)) + w1 := uint(0xff0f30fffacea80d) + fmt.Println(bits.OnesCount(^w1)) + fmt.Println(strconv.FormatUint(uint64(w1), 2)) + w2 := uint(0x90e0a0e0b0e0cf0c) + fmt.Println(bits.OnesCount(^w2)) + fmt.Println(strconv.FormatUint(uint64(w2), 2)) + w3 := uint(0x3d0f064f7206f717) + fmt.Println(bits.OnesCount(^w3)) + fmt.Println(strconv.FormatUint(uint64(w3), 2)) + + bv.bits = []uint{w0, w1, w2, w3} + + rsa := newRSABitVector(bv) + + testCases := []rsaRankTestCase{ + {true, 0, 0}, {false, 0, 0}, + {true, 1, 1}, {false, 1, 0}, + {true, 2, 1}, {false, 2, 1}, + {true, 3, 1}, {false, 3, 2}, + {true, 62, 1}, {false, 62, 61}, + {true, 63, 1}, {false, 63, 62}, + + {true, 64, 2}, {false, 64, 62}, + {true, 65, 3}, {false, 65, 62}, + {true, 72, 10}, {false, 72, 62}, + {true, 127, 40}, {false, 127, 87}, + + {true, 128, 41}, {false, 128, 87}, + {true, 129, 42}, {false, 129, 87}, + {true, 130, 42}, {false, 130, 88}, + {true, 131, 42}, {false, 131, 89}, + {true, 132, 43}, {false, 132, 89}, + {true, 133, 43}, {false, 133, 90}, + {true, 159, 51}, {false, 159, 108}, + {true, 160, 51}, {false, 160, 109}, + {true, 161, 
52}, {false, 161, 109}, + {true, 162, 52}, {false, 162, 110}, + {true, 163, 53}, {false, 163, 110}, + {true, 164, 54}, {false, 164, 110}, + {true, 165, 54}, {false, 165, 111}, + {true, 176, 57}, {false, 176, 119}, + {true, 177, 58}, {false, 177, 119}, + {true, 178, 59}, {false, 178, 119}, + {true, 179, 59}, {false, 179, 120}, + {true, 180, 59}, {false, 180, 121}, + {true, 183, 62}, {false, 183, 121}, + {true, 184, 63}, {false, 184, 121}, + {true, 185, 63}, {false, 185, 122}, + {true, 186, 63}, {false, 186, 123}, + {true, 187, 63}, {false, 187, 124}, + {true, 188, 63}, {false, 188, 125}, + {true, 189, 64}, {false, 189, 125}, + {true, 190, 65}, {false, 190, 125}, + {true, 191, 65}, {false, 191, 126}, + + {true, 192, 65}, {false, 192, 127}, + {true, 193, 65}, {false, 193, 128}, + {true, 194, 65}, {false, 194, 129}, + {true, 195, 66}, {false, 195, 129}, + {true, 196, 67}, {false, 196, 129}, + {true, 248, 94}, {false, 248, 154}, + {true, 249, 94}, {false, 249, 155}, + {true, 250, 94}, {false, 250, 156}, + {true, 251, 94}, {false, 251, 157}, + {true, 252, 95}, {false, 252, 157}, + {true, 253, 95}, {false, 253, 158}, + {true, 254, 96}, {false, 254, 158}, + {true, 255, 97}, {false, 255, 158}, + } + + for _, tc := range testCases { + rank := rsa.rank(tc.val, tc.bitPosition) + if rank != tc.expectedRank { + t.Fatalf("expected rank(%t, %d) to be %d but got %d", tc.val, tc.bitPosition, tc.expectedRank, rank) + } + } + +} From f73bc77742d2d67efab78d8ffe4b6e35f6541fcc Mon Sep 17 00:00:00 2001 From: Trenton Date: Sat, 25 Nov 2023 21:18:59 -0500 Subject: [PATCH 05/60] confident that jacobson rank is working --- bwt/bitvector.go | 30 +++++------ bwt/bitvector_test.go | 122 ++++++++++++++++++++++++++++++++++++------ 2 files changed, 120 insertions(+), 32 deletions(-) diff --git a/bwt/bitvector.go b/bwt/bitvector.go index 578aaff11..e421e9518 100644 --- a/bwt/bitvector.go +++ b/bwt/bitvector.go @@ -109,30 +109,30 @@ func getNumOfBitSetsNeededForNumOfBits(n int) int { // TODO: doc 
what rsa is, why these DSAs, and why we take in a bit vector type RSABitVector struct { - numOfBits int - jrc []chunk - jrSubChunksPerChunk int - jrBitsPerSubChunk int - clarkSelect []bitvector + numOfBits int + jrc []chunk + jrBitsPerChunk int + jrBitsPerSubChunk int + clarkSelect []bitvector } func newRSABitVector(b bitvector) RSABitVector { - jacobsonRankChunks, jacobsonRankNumOfSubChunksPerChunk, jacobsonRankNumOfBitsPerSubChunk := buildJacobsonRank(b) + jacobsonRankChunks, jrBitsPerChunk, jrBitsPerSubChunk := buildJacobsonRank(b) return RSABitVector{ - numOfBits: b.len(), - jrc: jacobsonRankChunks, - jrSubChunksPerChunk: jacobsonRankNumOfSubChunksPerChunk, - jrBitsPerSubChunk: jacobsonRankNumOfBitsPerSubChunk, - clarkSelect: []bitvector{}, + numOfBits: b.len(), + jrc: jacobsonRankChunks, + jrBitsPerChunk: jrBitsPerChunk, + jrBitsPerSubChunk: jrBitsPerSubChunk, + clarkSelect: []bitvector{}, } } // TODO: doc and mention some bit math func (rsa RSABitVector) rank(val bool, i int) int { - chunkPos := i / (len(rsa.jrc) * rsa.jrSubChunksPerChunk * rsa.jrBitsPerSubChunk) + chunkPos := (i / rsa.jrBitsPerChunk) chunk := rsa.jrc[chunkPos] - subChunkPos := (i % (len(rsa.jrc) * rsa.jrSubChunksPerChunk * rsa.jrBitsPerSubChunk)) / rsa.jrBitsPerSubChunk + subChunkPos := (i % rsa.jrBitsPerChunk) / rsa.jrBitsPerSubChunk subChunk := chunk.subChunks[subChunkPos] bitOffset := i % rsa.jrBitsPerSubChunk @@ -143,7 +143,7 @@ func (rsa RSABitVector) rank(val bool, i int) int { return chunk.onesCumulativeRank + subChunk.onesCumulativeRank + bits.OnesCount(remaining) } remaining := ^subChunk.bitSet >> shiftRightAmount - return (chunkPos*rsa.jrSubChunksPerChunk*rsa.jrBitsPerSubChunk - chunk.onesCumulativeRank) + (subChunkPos * rsa.jrBitsPerSubChunk) - subChunk.onesCumulativeRank + bits.OnesCount(remaining) + return (chunkPos*rsa.jrBitsPerChunk - chunk.onesCumulativeRank) + (subChunkPos*rsa.jrBitsPerSubChunk - subChunk.onesCumulativeRank) + bits.OnesCount(remaining) } type chunk struct { 
@@ -193,5 +193,5 @@ func buildJacobsonRank(inBv bitvector) (jacobsonRankChunks []chunk, numOfSubChun }) } - return jacobsonRankChunks, numOfSubChunksPerChunk, wordSize + return jacobsonRankChunks, numOfSubChunksPerChunk * wordSize, wordSize } diff --git a/bwt/bitvector_test.go b/bwt/bitvector_test.go index 35e730696..45a184a7c 100644 --- a/bwt/bitvector_test.go +++ b/bwt/bitvector_test.go @@ -1,9 +1,6 @@ package bwt import ( - "fmt" - "math/bits" - "strconv" "testing" ) @@ -159,6 +156,41 @@ type rsaRankTestCase struct { expectedRank int } +func TestRSARank_wordSize64_singlePartialChunk(t *testing.T) { + if wordSize != 64 { + t.Skip() + } + + bitsToTruncate := 22 + initialNumberOfBits := wordSize*2 - bitsToTruncate + bv := newBitVector(initialNumberOfBits) + + bv.bits = []uint{ + uint(0xffffffff00000000), + uint(0x00000000ffc00000), + } + + rsa := newRSABitVector(bv) + + testCases := []rsaRankTestCase{ + {true, 0, 0}, {false, 0, 0}, + + {true, 64, 32}, {false, 64, 32}, + + {true, 96, 32}, {false, 96, 64}, + + {true, 106, 42}, {false, 106, 64}, + } + + for _, tc := range testCases { + rank := rsa.rank(tc.val, tc.bitPosition) + if rank != tc.expectedRank { + t.Fatalf("expected rank(%t, %d) to be %d but got %d", tc.val, tc.bitPosition, tc.expectedRank, rank) + } + } + +} + func TestRSARank_wordSize64_singleCompleteChunk(t *testing.T) { if wordSize != 64 { t.Skip() @@ -167,20 +199,12 @@ func TestRSARank_wordSize64_singleCompleteChunk(t *testing.T) { initialNumberOfBits := wordSize * 4 bv := newBitVector(initialNumberOfBits) - w0 := uint(0x8000000000000001) - fmt.Println(bits.OnesCount(^w0)) - fmt.Println(strconv.FormatUint(uint64(w0), 2)) - w1 := uint(0xff0f30fffacea80d) - fmt.Println(bits.OnesCount(^w1)) - fmt.Println(strconv.FormatUint(uint64(w1), 2)) - w2 := uint(0x90e0a0e0b0e0cf0c) - fmt.Println(bits.OnesCount(^w2)) - fmt.Println(strconv.FormatUint(uint64(w2), 2)) - w3 := uint(0x3d0f064f7206f717) - fmt.Println(bits.OnesCount(^w3)) - 
fmt.Println(strconv.FormatUint(uint64(w3), 2)) - - bv.bits = []uint{w0, w1, w2, w3} + bv.bits = []uint{ + uint(0x8000000000000001), + uint(0xff0f30fffacea80d), + uint(0x90e0a0e0b0e0cf0c), + uint(0x3d0f064f7206f717), + } rsa := newRSABitVector(bv) @@ -248,3 +272,67 @@ func TestRSARank_wordSize64_singleCompleteChunk(t *testing.T) { } } + +func TestRSARank_wordSize64_multipleChunks(t *testing.T) { + if wordSize != 64 { + t.Skip() + } + + numBitsToTruncate := 17 + initialNumberOfBits := wordSize*15 - numBitsToTruncate + bv := newBitVector(initialNumberOfBits) + + bv.bits = []uint{ + uint(0x0000000000000000), + uint(0xffffffffffffffff), + uint(0x0000000000000000), + uint(0xffffffffffffffff), + + uint(0xffffffffffffffff), + uint(0x0000000000000000), + uint(0xffffffffffffffff), + uint(0x0000000000000000), + + uint(0x0000000000000000), + uint(0xffffffffffffffff), + uint(0x0000000000000000), + uint(0xffffffffffffffff), + + uint(0xffffffffffffffff), + uint(0x0000000000000000), + uint(0xffffffffffffffff), // this should end up getting truncated + } + + rsa := newRSABitVector(bv) + + testCases := []rsaRankTestCase{ + {true, 0, 0}, {false, 0, 0}, + + {true, 64, 0}, {false, 64, 64}, + {true, 128, 64}, {false, 128, 64}, + {true, 192, 64}, {false, 192, 128}, + {true, 256, 128}, {false, 256, 128}, + + {true, 320, 192}, {false, 256, 128}, + {true, 384, 192}, {false, 384, 192}, + {true, 448, 256}, {false, 448, 192}, + {true, 512, 256}, {false, 512, 256}, + + {true, 576, 256}, {false, 576, 320}, + {true, 640, 320}, {false, 640, 320}, + {true, 704, 320}, {false, 704, 384}, + {true, 768, 384}, {false, 768, 384}, + + {true, 832, 448}, {false, 832, 384}, + {true, 896, 448}, {false, 896, 448}, + {true, 896 + wordSize - numBitsToTruncate, 448 + wordSize - numBitsToTruncate}, {false, 896 + wordSize - numBitsToTruncate, 448}, + } + + for _, tc := range testCases { + rank := rsa.rank(tc.val, tc.bitPosition) + if rank != tc.expectedRank { + t.Fatalf("expected rank(%t, %d) to be %d but got %d", 
tc.val, tc.bitPosition, tc.expectedRank, rank) + } + } + +} From 4e757866a5c1542b731d7c392e40e0b9ec7d52b1 Mon Sep 17 00:00:00 2001 From: Trenton Date: Sat, 25 Nov 2023 21:35:03 -0500 Subject: [PATCH 06/60] reusing the incoming bitvector instead of copying everyithing for jacobson rank --- bwt/bitvector.go | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/bwt/bitvector.go b/bwt/bitvector.go index e421e9518..aea24357a 100644 --- a/bwt/bitvector.go +++ b/bwt/bitvector.go @@ -108,22 +108,22 @@ func getNumOfBitSetsNeededForNumOfBits(n int) int { } // TODO: doc what rsa is, why these DSAs, and why we take in a bit vector +// TODO: clarks select +// TODO: access type RSABitVector struct { - numOfBits int + bv bitvector jrc []chunk jrBitsPerChunk int jrBitsPerSubChunk int - clarkSelect []bitvector } -func newRSABitVector(b bitvector) RSABitVector { - jacobsonRankChunks, jrBitsPerChunk, jrBitsPerSubChunk := buildJacobsonRank(b) +func newRSABitVector(bv bitvector) RSABitVector { + jacobsonRankChunks, jrBitsPerChunk, jrBitsPerSubChunk := buildJacobsonRank(bv) return RSABitVector{ - numOfBits: b.len(), + bv: bv, jrc: jacobsonRankChunks, jrBitsPerChunk: jrBitsPerChunk, jrBitsPerSubChunk: jrBitsPerSubChunk, - clarkSelect: []bitvector{}, } } @@ -137,12 +137,14 @@ func (rsa RSABitVector) rank(val bool, i int) int { bitOffset := i % rsa.jrBitsPerSubChunk + bitSet := rsa.bv.getBitSet(chunkPos*len(rsa.jrc) + subChunkPos) + shiftRightAmount := uint(rsa.jrBitsPerSubChunk - bitOffset) if val { - remaining := subChunk.bitSet >> shiftRightAmount + remaining := bitSet >> shiftRightAmount return chunk.onesCumulativeRank + subChunk.onesCumulativeRank + bits.OnesCount(remaining) } - remaining := ^subChunk.bitSet >> shiftRightAmount + remaining := ^bitSet >> shiftRightAmount return (chunkPos*rsa.jrBitsPerChunk - chunk.onesCumulativeRank) + (subChunkPos*rsa.jrBitsPerSubChunk - subChunk.onesCumulativeRank) + bits.OnesCount(remaining) } @@ -152,7 +154,6 
@@ type chunk struct { } type subChunk struct { - bitSet uint onesCumulativeRank int } @@ -165,7 +166,7 @@ func buildJacobsonRank(inBv bitvector) (jacobsonRankChunks []chunk, numOfSubChun subChunkCumulativeRank := 0 var currSubChunks []subChunk - for _, bitSet := range inBv.bits { + for i := range inBv.bits { if len(currSubChunks) == numOfSubChunksPerChunk { jacobsonRankChunks = append(jacobsonRankChunks, chunk{ subChunks: currSubChunks, @@ -178,11 +179,10 @@ func buildJacobsonRank(inBv bitvector) (jacobsonRankChunks []chunk, numOfSubChun subChunkCumulativeRank = 0 } currSubChunks = append(currSubChunks, subChunk{ - bitSet: bitSet, onesCumulativeRank: subChunkCumulativeRank, }) - onesCount := bits.OnesCount(bitSet) + onesCount := bits.OnesCount(inBv.getBitSet(i)) subChunkCumulativeRank += onesCount } From b7ddaeff97e07625f9c2f195a1250ffbe7bd0ecf Mon Sep 17 00:00:00 2001 From: Trenton Date: Sat, 25 Nov 2023 23:15:03 -0500 Subject: [PATCH 07/60] access and bounds checking --- bwt/bitvector.go | 35 +++++++++++++++++++++++------------ bwt/bitvector_test.go | 8 ++++---- 2 files changed, 27 insertions(+), 16 deletions(-) diff --git a/bwt/bitvector.go b/bwt/bitvector.go index aea24357a..fbbe3aea7 100644 --- a/bwt/bitvector.go +++ b/bwt/bitvector.go @@ -34,9 +34,7 @@ func (b bitvector) getBitSet(i int) uint { } func (b bitvector) getBit(i int) bool { - if i >= b.len() || i < 0 { - panic("better out of bounds message") - } + b.checkBounds(i) chunkStart := i / wordSize offset := i % wordSize @@ -45,9 +43,7 @@ func (b bitvector) getBit(i int) bool { } func (b bitvector) setBit(i int, val bool) { - if i >= b.len() || i < 0 { - panic("better out of bounds message") - } + b.checkBounds(i) chunkStart := i / wordSize offset := i % wordSize @@ -58,6 +54,11 @@ func (b bitvector) setBit(i int, val bool) { b.bits[chunkStart] &= ^(uint(1) << offset) } } +func (b bitvector) checkBounds(i int) { + if i >= b.len() || i < 0 { + panic("better out of bounds message") + } +} const 
factor1point2Threshold = 1e9 const factor1point5Threshold = 1e6 @@ -109,17 +110,17 @@ func getNumOfBitSetsNeededForNumOfBits(n int) int { // TODO: doc what rsa is, why these DSAs, and why we take in a bit vector // TODO: clarks select -// TODO: access -type RSABitVector struct { +type rsaBitVector struct { bv bitvector jrc []chunk jrBitsPerChunk int jrBitsPerSubChunk int } -func newRSABitVector(bv bitvector) RSABitVector { +// TODO: talk about why bv should never be modidifed after building the RSA bit vector +func newRSABitVector(bv bitvector) rsaBitVector { jacobsonRankChunks, jrBitsPerChunk, jrBitsPerSubChunk := buildJacobsonRank(bv) - return RSABitVector{ + return rsaBitVector{ bv: bv, jrc: jacobsonRankChunks, jrBitsPerChunk: jrBitsPerChunk, @@ -127,8 +128,9 @@ func newRSABitVector(bv bitvector) RSABitVector { } } -// TODO: doc and mention some bit math -func (rsa RSABitVector) rank(val bool, i int) int { +func (rsa rsaBitVector) Rank(val bool, i int) int { + rsa.bv.checkBounds(i) + chunkPos := (i / rsa.jrBitsPerChunk) chunk := rsa.jrc[chunkPos] @@ -145,9 +147,18 @@ func (rsa RSABitVector) rank(val bool, i int) int { return chunk.onesCumulativeRank + subChunk.onesCumulativeRank + bits.OnesCount(remaining) } remaining := ^bitSet >> shiftRightAmount + // cumulative ranks for 0 should just be the sum of the compliment of cumulative ranks for 1 return (chunkPos*rsa.jrBitsPerChunk - chunk.onesCumulativeRank) + (subChunkPos*rsa.jrBitsPerSubChunk - subChunk.onesCumulativeRank) + bits.OnesCount(remaining) } +func (rsa rsaBitVector) Select(i int) bool { + return true +} + +func (rsa rsaBitVector) Access(i int) bool { + return rsa.bv.getBit(i) +} + type chunk struct { subChunks []subChunk onesCumulativeRank int diff --git a/bwt/bitvector_test.go b/bwt/bitvector_test.go index 45a184a7c..ddb2fb73f 100644 --- a/bwt/bitvector_test.go +++ b/bwt/bitvector_test.go @@ -179,11 +179,11 @@ func TestRSARank_wordSize64_singlePartialChunk(t *testing.T) { {true, 96, 32}, {false, 96, 
64}, - {true, 106, 42}, {false, 106, 64}, + {true, 105, 41}, {false, 105, 64}, } for _, tc := range testCases { - rank := rsa.rank(tc.val, tc.bitPosition) + rank := rsa.Rank(tc.val, tc.bitPosition) if rank != tc.expectedRank { t.Fatalf("expected rank(%t, %d) to be %d but got %d", tc.val, tc.bitPosition, tc.expectedRank, rank) } @@ -265,7 +265,7 @@ func TestRSARank_wordSize64_singleCompleteChunk(t *testing.T) { } for _, tc := range testCases { - rank := rsa.rank(tc.val, tc.bitPosition) + rank := rsa.Rank(tc.val, tc.bitPosition) if rank != tc.expectedRank { t.Fatalf("expected rank(%t, %d) to be %d but got %d", tc.val, tc.bitPosition, tc.expectedRank, rank) } @@ -329,7 +329,7 @@ func TestRSARank_wordSize64_multipleChunks(t *testing.T) { } for _, tc := range testCases { - rank := rsa.rank(tc.val, tc.bitPosition) + rank := rsa.Rank(tc.val, tc.bitPosition) if rank != tc.expectedRank { t.Fatalf("expected rank(%t, %d) to be %d but got %d", tc.val, tc.bitPosition, tc.expectedRank, rank) } From 04065209f59ba979e47324afe7628e59578bf7b9 Mon Sep 17 00:00:00 2001 From: Trenton Date: Sat, 25 Nov 2023 23:19:50 -0500 Subject: [PATCH 08/60] just do uint64 for simplicity. 
bound checking and access --- bwt/bitvector.go | 26 ++++++++--------- bwt/bitvector_test.go | 66 ++++++++++++++++++++----------------------- 2 files changed, 44 insertions(+), 48 deletions(-) diff --git a/bwt/bitvector.go b/bwt/bitvector.go index fbbe3aea7..d50a51120 100644 --- a/bwt/bitvector.go +++ b/bwt/bitvector.go @@ -5,19 +5,19 @@ import ( "math/bits" ) -// TODO: talk about why this is -const wordSize = bits.UintSize +// TODO: talk about why this is and why we approximate things to make them "simple enough" +const wordSize = 64 // TODO: document static size and why we differentiate between capacity and number of bits type bitvector struct { - bits []uint + bits []uint64 capacityInChunks int numberOfBits int } func newBitVector(initialNumberOfBits int) bitvector { capacity := getNumOfBitSetsNeededForNumOfBits(initialNumberOfBits) - bits := make([]uint, capacity) + bits := make([]uint64, capacity) return bitvector{ bits: bits, capacityInChunks: capacity, @@ -29,7 +29,7 @@ func (b bitvector) getNumOfBitSets() int { return getNumOfBitSetsNeededForNumOfBits(b.len()) } -func (b bitvector) getBitSet(i int) uint { +func (b bitvector) getBitSet(i int) uint64 { return b.bits[i] } @@ -39,7 +39,7 @@ func (b bitvector) getBit(i int) bool { chunkStart := i / wordSize offset := i % wordSize - return (b.bits[chunkStart] & (uint(1) << offset)) != 0 + return (b.bits[chunkStart] & (uint64(1) << offset)) != 0 } func (b bitvector) setBit(i int, val bool) { @@ -49,9 +49,9 @@ func (b bitvector) setBit(i int, val bool) { offset := i % wordSize if val { - b.bits[chunkStart] |= uint(1) << offset + b.bits[chunkStart] |= uint64(1) << offset } else { - b.bits[chunkStart] &= ^(uint(1) << offset) + b.bits[chunkStart] &= ^(uint64(1) << offset) } } func (b bitvector) checkBounds(i int) { @@ -86,7 +86,7 @@ func (b *bitvector) push(val bool) { nextCapacity := getNumOfBitSetsNeededForNumOfBits(numOfBitsForNextCapacity) - nextBits := make([]uint, nextCapacity) + nextBits := make([]uint64, 
nextCapacity) copy(b.bits, nextBits) b.bits = nextBits @@ -141,14 +141,14 @@ func (rsa rsaBitVector) Rank(val bool, i int) int { bitSet := rsa.bv.getBitSet(chunkPos*len(rsa.jrc) + subChunkPos) - shiftRightAmount := uint(rsa.jrBitsPerSubChunk - bitOffset) + shiftRightAmount := uint64(rsa.jrBitsPerSubChunk - bitOffset) if val { remaining := bitSet >> shiftRightAmount - return chunk.onesCumulativeRank + subChunk.onesCumulativeRank + bits.OnesCount(remaining) + return chunk.onesCumulativeRank + subChunk.onesCumulativeRank + bits.OnesCount64(remaining) } remaining := ^bitSet >> shiftRightAmount // cumulative ranks for 0 should just be the sum of the compliment of cumulative ranks for 1 - return (chunkPos*rsa.jrBitsPerChunk - chunk.onesCumulativeRank) + (subChunkPos*rsa.jrBitsPerSubChunk - subChunk.onesCumulativeRank) + bits.OnesCount(remaining) + return (chunkPos*rsa.jrBitsPerChunk - chunk.onesCumulativeRank) + (subChunkPos*rsa.jrBitsPerSubChunk - subChunk.onesCumulativeRank) + bits.OnesCount64(remaining) } func (rsa rsaBitVector) Select(i int) bool { @@ -193,7 +193,7 @@ func buildJacobsonRank(inBv bitvector) (jacobsonRankChunks []chunk, numOfSubChun onesCumulativeRank: subChunkCumulativeRank, }) - onesCount := bits.OnesCount(inBv.getBitSet(i)) + onesCount := bits.OnesCount64(inBv.getBitSet(i)) subChunkCumulativeRank += onesCount } diff --git a/bwt/bitvector_test.go b/bwt/bitvector_test.go index ddb2fb73f..ebc5e417b 100644 --- a/bwt/bitvector_test.go +++ b/bwt/bitvector_test.go @@ -156,7 +156,7 @@ type rsaRankTestCase struct { expectedRank int } -func TestRSARank_wordSize64_singlePartialChunk(t *testing.T) { +func TestRSARank_singlePartialChunk(t *testing.T) { if wordSize != 64 { t.Skip() } @@ -165,9 +165,9 @@ func TestRSARank_wordSize64_singlePartialChunk(t *testing.T) { initialNumberOfBits := wordSize*2 - bitsToTruncate bv := newBitVector(initialNumberOfBits) - bv.bits = []uint{ - uint(0xffffffff00000000), - uint(0x00000000ffc00000), + bv.bits = []uint64{ + 
0xffffffff00000000, + 0x00000000ffc00000, } rsa := newRSABitVector(bv) @@ -191,7 +191,7 @@ func TestRSARank_wordSize64_singlePartialChunk(t *testing.T) { } -func TestRSARank_wordSize64_singleCompleteChunk(t *testing.T) { +func TestRSARank_singleCompleteChunk(t *testing.T) { if wordSize != 64 { t.Skip() } @@ -199,11 +199,11 @@ func TestRSARank_wordSize64_singleCompleteChunk(t *testing.T) { initialNumberOfBits := wordSize * 4 bv := newBitVector(initialNumberOfBits) - bv.bits = []uint{ - uint(0x8000000000000001), - uint(0xff0f30fffacea80d), - uint(0x90e0a0e0b0e0cf0c), - uint(0x3d0f064f7206f717), + bv.bits = []uint64{ + 0x8000000000000001, + 0xff0f30fffacea80d, + 0x90e0a0e0b0e0cf0c, + 0x3d0f064f7206f717, } rsa := newRSABitVector(bv) @@ -273,34 +273,30 @@ func TestRSARank_wordSize64_singleCompleteChunk(t *testing.T) { } -func TestRSARank_wordSize64_multipleChunks(t *testing.T) { - if wordSize != 64 { - t.Skip() - } - +func TestRSARank_multipleChunks(t *testing.T) { numBitsToTruncate := 17 initialNumberOfBits := wordSize*15 - numBitsToTruncate bv := newBitVector(initialNumberOfBits) - bv.bits = []uint{ - uint(0x0000000000000000), - uint(0xffffffffffffffff), - uint(0x0000000000000000), - uint(0xffffffffffffffff), - - uint(0xffffffffffffffff), - uint(0x0000000000000000), - uint(0xffffffffffffffff), - uint(0x0000000000000000), - - uint(0x0000000000000000), - uint(0xffffffffffffffff), - uint(0x0000000000000000), - uint(0xffffffffffffffff), - - uint(0xffffffffffffffff), - uint(0x0000000000000000), - uint(0xffffffffffffffff), // this should end up getting truncated + bv.bits = []uint64{ + 0x0000000000000000, + 0xffffffffffffffff, + 0x0000000000000000, + 0xffffffffffffffff, + + 0xffffffffffffffff, + 0x0000000000000000, + 0xffffffffffffffff, + 0x0000000000000000, + + 0x0000000000000000, + 0xffffffffffffffff, + 0x0000000000000000, + 0xffffffffffffffff, + + 0xffffffffffffffff, + 0x0000000000000000, + 0xffffffffffffffff, // this should end up getting truncated } rsa := 
newRSABitVector(bv) @@ -325,7 +321,7 @@ func TestRSARank_wordSize64_multipleChunks(t *testing.T) { {true, 832, 448}, {false, 832, 384}, {true, 896, 448}, {false, 896, 448}, - {true, 896 + wordSize - numBitsToTruncate, 448 + wordSize - numBitsToTruncate}, {false, 896 + wordSize - numBitsToTruncate, 448}, + {true, 896 + wordSize - numBitsToTruncate - 1, 448 + wordSize - numBitsToTruncate - 1}, {false, 896 + wordSize - numBitsToTruncate - 1, 448}, } for _, tc := range testCases { From 2ad117e617ec941418f3664cdb5c9f0d8b0a0e7c Mon Sep 17 00:00:00 2001 From: Trenton Date: Sat, 2 Dec 2023 01:49:32 -0500 Subject: [PATCH 09/60] bit vector fixes, rsa good enough, wavelet start --- bwt/bitvector.go | 107 +------------- bwt/bitvector_test.go | 189 +------------------------ bwt/rsa_bitvector.go | 136 ++++++++++++++++++ bwt/rsa_bitvector_test.go | 283 ++++++++++++++++++++++++++++++++++++++ bwt/wavelet.go | 164 ++++++++++++++++++++++ bwt/wavelet_test.go | 1 + 6 files changed, 593 insertions(+), 287 deletions(-) create mode 100644 bwt/rsa_bitvector.go create mode 100644 bwt/rsa_bitvector_test.go create mode 100644 bwt/wavelet.go create mode 100644 bwt/wavelet_test.go diff --git a/bwt/bitvector.go b/bwt/bitvector.go index d50a51120..de6f7ae7f 100644 --- a/bwt/bitvector.go +++ b/bwt/bitvector.go @@ -2,13 +2,11 @@ package bwt import ( "math" - "math/bits" ) // TODO: talk about why this is and why we approximate things to make them "simple enough" const wordSize = 64 -// TODO: document static size and why we differentiate between capacity and number of bits type bitvector struct { bits []uint64 capacityInChunks int @@ -39,7 +37,7 @@ func (b bitvector) getBit(i int) bool { chunkStart := i / wordSize offset := i % wordSize - return (b.bits[chunkStart] & (uint64(1) << offset)) != 0 + return (b.bits[chunkStart] & (uint64(1) << (63 - offset))) != 0 } func (b bitvector) setBit(i int, val bool) { @@ -49,9 +47,9 @@ func (b bitvector) setBit(i int, val bool) { offset := i % wordSize if val { - 
b.bits[chunkStart] |= uint64(1) << offset + b.bits[chunkStart] |= uint64(1) << (63 - offset) } else { - b.bits[chunkStart] &= ^(uint64(1) << offset) + b.bits[chunkStart] &= ^(uint64(1) << (63 - offset)) } } func (b bitvector) checkBounds(i int) { @@ -107,102 +105,3 @@ func (b bitvector) capacity() int { func getNumOfBitSetsNeededForNumOfBits(n int) int { return int(math.Ceil(float64(n) / wordSize)) } - -// TODO: doc what rsa is, why these DSAs, and why we take in a bit vector -// TODO: clarks select -type rsaBitVector struct { - bv bitvector - jrc []chunk - jrBitsPerChunk int - jrBitsPerSubChunk int -} - -// TODO: talk about why bv should never be modidifed after building the RSA bit vector -func newRSABitVector(bv bitvector) rsaBitVector { - jacobsonRankChunks, jrBitsPerChunk, jrBitsPerSubChunk := buildJacobsonRank(bv) - return rsaBitVector{ - bv: bv, - jrc: jacobsonRankChunks, - jrBitsPerChunk: jrBitsPerChunk, - jrBitsPerSubChunk: jrBitsPerSubChunk, - } -} - -func (rsa rsaBitVector) Rank(val bool, i int) int { - rsa.bv.checkBounds(i) - - chunkPos := (i / rsa.jrBitsPerChunk) - chunk := rsa.jrc[chunkPos] - - subChunkPos := (i % rsa.jrBitsPerChunk) / rsa.jrBitsPerSubChunk - subChunk := chunk.subChunks[subChunkPos] - - bitOffset := i % rsa.jrBitsPerSubChunk - - bitSet := rsa.bv.getBitSet(chunkPos*len(rsa.jrc) + subChunkPos) - - shiftRightAmount := uint64(rsa.jrBitsPerSubChunk - bitOffset) - if val { - remaining := bitSet >> shiftRightAmount - return chunk.onesCumulativeRank + subChunk.onesCumulativeRank + bits.OnesCount64(remaining) - } - remaining := ^bitSet >> shiftRightAmount - // cumulative ranks for 0 should just be the sum of the compliment of cumulative ranks for 1 - return (chunkPos*rsa.jrBitsPerChunk - chunk.onesCumulativeRank) + (subChunkPos*rsa.jrBitsPerSubChunk - subChunk.onesCumulativeRank) + bits.OnesCount64(remaining) -} - -func (rsa rsaBitVector) Select(i int) bool { - return true -} - -func (rsa rsaBitVector) Access(i int) bool { - return 
rsa.bv.getBit(i) -} - -type chunk struct { - subChunks []subChunk - onesCumulativeRank int -} - -type subChunk struct { - onesCumulativeRank int -} - -// TODO: talk about easy to read instead vs perf -func buildJacobsonRank(inBv bitvector) (jacobsonRankChunks []chunk, numOfSubChunksPerChunk, numOfBitsPerSubChunk int) { - // TODO: talk about why this is probably good enough, improves as n grows, gets worse as n gets smaller, and how this fits into a machine instruction, and how this is "simple" - numOfSubChunksPerChunk = 4 - - chunkCumulativeRank := 0 - subChunkCumulativeRank := 0 - - var currSubChunks []subChunk - for i := range inBv.bits { - if len(currSubChunks) == numOfSubChunksPerChunk { - jacobsonRankChunks = append(jacobsonRankChunks, chunk{ - subChunks: currSubChunks, - onesCumulativeRank: chunkCumulativeRank, - }) - - chunkCumulativeRank += subChunkCumulativeRank - - currSubChunks = nil - subChunkCumulativeRank = 0 - } - currSubChunks = append(currSubChunks, subChunk{ - onesCumulativeRank: subChunkCumulativeRank, - }) - - onesCount := bits.OnesCount64(inBv.getBitSet(i)) - subChunkCumulativeRank += onesCount - } - - if currSubChunks != nil { - jacobsonRankChunks = append(jacobsonRankChunks, chunk{ - subChunks: currSubChunks, - onesCumulativeRank: chunkCumulativeRank, - }) - } - - return jacobsonRankChunks, numOfSubChunksPerChunk * wordSize, wordSize -} diff --git a/bwt/bitvector_test.go b/bwt/bitvector_test.go index ebc5e417b..68c97ac92 100644 --- a/bwt/bitvector_test.go +++ b/bwt/bitvector_test.go @@ -34,6 +34,8 @@ func TestBitVector(t *testing.T) { bv.setBit(24, false) bv.setBit(25, false) bv.setBit(42, false) + bv.setBit(63, false) + bv.setBit(64, false) getBitTestCases := []GetBitTestCase{ {0, true}, @@ -54,6 +56,10 @@ func TestBitVector(t *testing.T) { {42, false}, {15, true}, {16, true}, + {62, true}, + {63, false}, + {64, false}, + {65, true}, {72, true}, {79, true}, {80, true}, @@ -149,186 +155,3 @@ func 
TestBitVectorPush_NextPushGreaterThanCapacity_Single(t *testing.T) { t.Fatalf("expected %dth bit to be %t but got %t", initialNumberOfBits, true, bv.getBit(initialNumberOfBits)) } } - -type rsaRankTestCase struct { - val bool - bitPosition int - expectedRank int -} - -func TestRSARank_singlePartialChunk(t *testing.T) { - if wordSize != 64 { - t.Skip() - } - - bitsToTruncate := 22 - initialNumberOfBits := wordSize*2 - bitsToTruncate - bv := newBitVector(initialNumberOfBits) - - bv.bits = []uint64{ - 0xffffffff00000000, - 0x00000000ffc00000, - } - - rsa := newRSABitVector(bv) - - testCases := []rsaRankTestCase{ - {true, 0, 0}, {false, 0, 0}, - - {true, 64, 32}, {false, 64, 32}, - - {true, 96, 32}, {false, 96, 64}, - - {true, 105, 41}, {false, 105, 64}, - } - - for _, tc := range testCases { - rank := rsa.Rank(tc.val, tc.bitPosition) - if rank != tc.expectedRank { - t.Fatalf("expected rank(%t, %d) to be %d but got %d", tc.val, tc.bitPosition, tc.expectedRank, rank) - } - } - -} - -func TestRSARank_singleCompleteChunk(t *testing.T) { - if wordSize != 64 { - t.Skip() - } - - initialNumberOfBits := wordSize * 4 - bv := newBitVector(initialNumberOfBits) - - bv.bits = []uint64{ - 0x8000000000000001, - 0xff0f30fffacea80d, - 0x90e0a0e0b0e0cf0c, - 0x3d0f064f7206f717, - } - - rsa := newRSABitVector(bv) - - testCases := []rsaRankTestCase{ - {true, 0, 0}, {false, 0, 0}, - {true, 1, 1}, {false, 1, 0}, - {true, 2, 1}, {false, 2, 1}, - {true, 3, 1}, {false, 3, 2}, - {true, 62, 1}, {false, 62, 61}, - {true, 63, 1}, {false, 63, 62}, - - {true, 64, 2}, {false, 64, 62}, - {true, 65, 3}, {false, 65, 62}, - {true, 72, 10}, {false, 72, 62}, - {true, 127, 40}, {false, 127, 87}, - - {true, 128, 41}, {false, 128, 87}, - {true, 129, 42}, {false, 129, 87}, - {true, 130, 42}, {false, 130, 88}, - {true, 131, 42}, {false, 131, 89}, - {true, 132, 43}, {false, 132, 89}, - {true, 133, 43}, {false, 133, 90}, - {true, 159, 51}, {false, 159, 108}, - {true, 160, 51}, {false, 160, 109}, - {true, 161, 
52}, {false, 161, 109}, - {true, 162, 52}, {false, 162, 110}, - {true, 163, 53}, {false, 163, 110}, - {true, 164, 54}, {false, 164, 110}, - {true, 165, 54}, {false, 165, 111}, - {true, 176, 57}, {false, 176, 119}, - {true, 177, 58}, {false, 177, 119}, - {true, 178, 59}, {false, 178, 119}, - {true, 179, 59}, {false, 179, 120}, - {true, 180, 59}, {false, 180, 121}, - {true, 183, 62}, {false, 183, 121}, - {true, 184, 63}, {false, 184, 121}, - {true, 185, 63}, {false, 185, 122}, - {true, 186, 63}, {false, 186, 123}, - {true, 187, 63}, {false, 187, 124}, - {true, 188, 63}, {false, 188, 125}, - {true, 189, 64}, {false, 189, 125}, - {true, 190, 65}, {false, 190, 125}, - {true, 191, 65}, {false, 191, 126}, - - {true, 192, 65}, {false, 192, 127}, - {true, 193, 65}, {false, 193, 128}, - {true, 194, 65}, {false, 194, 129}, - {true, 195, 66}, {false, 195, 129}, - {true, 196, 67}, {false, 196, 129}, - {true, 248, 94}, {false, 248, 154}, - {true, 249, 94}, {false, 249, 155}, - {true, 250, 94}, {false, 250, 156}, - {true, 251, 94}, {false, 251, 157}, - {true, 252, 95}, {false, 252, 157}, - {true, 253, 95}, {false, 253, 158}, - {true, 254, 96}, {false, 254, 158}, - {true, 255, 97}, {false, 255, 158}, - } - - for _, tc := range testCases { - rank := rsa.Rank(tc.val, tc.bitPosition) - if rank != tc.expectedRank { - t.Fatalf("expected rank(%t, %d) to be %d but got %d", tc.val, tc.bitPosition, tc.expectedRank, rank) - } - } - -} - -func TestRSARank_multipleChunks(t *testing.T) { - numBitsToTruncate := 17 - initialNumberOfBits := wordSize*15 - numBitsToTruncate - bv := newBitVector(initialNumberOfBits) - - bv.bits = []uint64{ - 0x0000000000000000, - 0xffffffffffffffff, - 0x0000000000000000, - 0xffffffffffffffff, - - 0xffffffffffffffff, - 0x0000000000000000, - 0xffffffffffffffff, - 0x0000000000000000, - - 0x0000000000000000, - 0xffffffffffffffff, - 0x0000000000000000, - 0xffffffffffffffff, - - 0xffffffffffffffff, - 0x0000000000000000, - 0xffffffffffffffff, // this should end up getting 
truncated - } - - rsa := newRSABitVector(bv) - - testCases := []rsaRankTestCase{ - {true, 0, 0}, {false, 0, 0}, - - {true, 64, 0}, {false, 64, 64}, - {true, 128, 64}, {false, 128, 64}, - {true, 192, 64}, {false, 192, 128}, - {true, 256, 128}, {false, 256, 128}, - - {true, 320, 192}, {false, 256, 128}, - {true, 384, 192}, {false, 384, 192}, - {true, 448, 256}, {false, 448, 192}, - {true, 512, 256}, {false, 512, 256}, - - {true, 576, 256}, {false, 576, 320}, - {true, 640, 320}, {false, 640, 320}, - {true, 704, 320}, {false, 704, 384}, - {true, 768, 384}, {false, 768, 384}, - - {true, 832, 448}, {false, 832, 384}, - {true, 896, 448}, {false, 896, 448}, - {true, 896 + wordSize - numBitsToTruncate - 1, 448 + wordSize - numBitsToTruncate - 1}, {false, 896 + wordSize - numBitsToTruncate - 1, 448}, - } - - for _, tc := range testCases { - rank := rsa.Rank(tc.val, tc.bitPosition) - if rank != tc.expectedRank { - t.Fatalf("expected rank(%t, %d) to be %d but got %d", tc.val, tc.bitPosition, tc.expectedRank, rank) - } - } - -} diff --git a/bwt/rsa_bitvector.go b/bwt/rsa_bitvector.go new file mode 100644 index 000000000..382935c52 --- /dev/null +++ b/bwt/rsa_bitvector.go @@ -0,0 +1,136 @@ +package bwt + +import "math/bits" + +// TODO: doc what rsa is, why these DSAs, and why we take in a bit vector +// TODO: clarks select +type rsaBitVector struct { + bv bitvector + jrc []chunk + jrBitsPerChunk int + jrBitsPerSubChunk int + oneSelectMap map[int]int + zeroSelectMap map[int]int +} + +// TODO: talk about why bv should never be modidifed after building the RSA bit vector +func newRSABitVectorFromBitVector(bv bitvector) rsaBitVector { + jacobsonRankChunks, jrBitsPerChunk, jrBitsPerSubChunk := buildJacobsonRank(bv) + ones, zeros := buildSelectMaps(bv) + + return rsaBitVector{ + bv: bv, + jrc: jacobsonRankChunks, + jrBitsPerChunk: jrBitsPerChunk, + jrBitsPerSubChunk: jrBitsPerSubChunk, + oneSelectMap: ones, + zeroSelectMap: zeros, + } +} + +func (rsa rsaBitVector) Rank(val bool, i 
int) int { + rsa.bv.checkBounds(i) + + chunkPos := (i / rsa.jrBitsPerChunk) + chunk := rsa.jrc[chunkPos] + + subChunkPos := (i % rsa.jrBitsPerChunk) / rsa.jrBitsPerSubChunk + subChunk := chunk.subChunks[subChunkPos] + + bitOffset := i % rsa.jrBitsPerSubChunk + + bitSet := rsa.bv.getBitSet(chunkPos*len(rsa.jrc) + subChunkPos) + + shiftRightAmount := uint64(rsa.jrBitsPerSubChunk - bitOffset) + if val { + remaining := bitSet >> shiftRightAmount + return chunk.onesCumulativeRank + subChunk.onesCumulativeRank + bits.OnesCount64(remaining) + } + remaining := ^bitSet >> shiftRightAmount + + // cumulative ranks for 0 should just be the sum of the compliment of cumulative ranks for 1 + return (chunkPos*rsa.jrBitsPerChunk - chunk.onesCumulativeRank) + (subChunkPos*rsa.jrBitsPerSubChunk - subChunk.onesCumulativeRank) + bits.OnesCount64(remaining) +} + +func (rsa rsaBitVector) Select(val bool, rank int) (i int, ok bool) { + if val { + i, ok := rsa.oneSelectMap[rank] + return i, ok + } else { + i, ok := rsa.zeroSelectMap[rank] + return i, ok + } +} + +func (rsa rsaBitVector) Access(i int) bool { + return rsa.bv.getBit(i) +} + +type chunk struct { + subChunks []subChunk + onesCumulativeRank int +} + +type subChunk struct { + onesCumulativeRank int +} + +// TODO: talk about easy to read instead vs perf +func buildJacobsonRank(inBv bitvector) (jacobsonRankChunks []chunk, numOfSubChunksPerChunk, numOfBitsPerSubChunk int) { + // TODO: talk about why this is probably good enough, improves as n grows, gets worse as n gets smaller, and how this fits into a machine instruction, and how this is "simple" + numOfSubChunksPerChunk = 4 + + chunkCumulativeRank := 0 + subChunkCumulativeRank := 0 + + var currSubChunks []subChunk + for i := range inBv.bits { + if len(currSubChunks) == numOfSubChunksPerChunk { + jacobsonRankChunks = append(jacobsonRankChunks, chunk{ + subChunks: currSubChunks, + onesCumulativeRank: chunkCumulativeRank, + }) + + chunkCumulativeRank += subChunkCumulativeRank + + 
currSubChunks = nil + subChunkCumulativeRank = 0 + } + currSubChunks = append(currSubChunks, subChunk{ + onesCumulativeRank: subChunkCumulativeRank, + }) + + onesCount := bits.OnesCount64(inBv.getBitSet(i)) + subChunkCumulativeRank += onesCount + } + + if currSubChunks != nil { + jacobsonRankChunks = append(jacobsonRankChunks, chunk{ + subChunks: currSubChunks, + onesCumulativeRank: chunkCumulativeRank, + }) + } + + return jacobsonRankChunks, numOfSubChunksPerChunk * wordSize, wordSize +} + +// TODO: talk about how this could be improved memory wise. Talk about how clarks select exists, but keeping it "simple for now" but maybe worth +// making succinct later +func buildSelectMaps(inBv bitvector) (oneSelectMap, zeroSelectMap map[int]int) { + oneSelectMap = make(map[int]int) + zeroSelectMap = make(map[int]int) + oneCount := 0 + zeroCount := 0 + for i := 0; i < inBv.len(); i++ { + bit := inBv.getBit(i) + if bit { + oneSelectMap[oneCount] = i + oneCount++ + } else { + zeroSelectMap[zeroCount] = i + zeroCount++ + } + } + + return oneSelectMap, zeroSelectMap +} diff --git a/bwt/rsa_bitvector_test.go b/bwt/rsa_bitvector_test.go new file mode 100644 index 000000000..a61a853ea --- /dev/null +++ b/bwt/rsa_bitvector_test.go @@ -0,0 +1,283 @@ +package bwt + +import "testing" + +type rsaRankTestCase struct { + val bool + bitPosition int + expectedRank int +} + +func TestRSARank_singlePartialChunk(t *testing.T) { + if wordSize != 64 { + t.Skip() + } + + bitsToTruncate := 22 + initialNumberOfBits := wordSize*2 - bitsToTruncate + bv := newBitVector(initialNumberOfBits) + + bv.bits = []uint64{ + 0xffffffff00000000, + 0x00000000ffc00000, + } + + rsa := newRSABitVectorFromBitVector(bv) + + testCases := []rsaRankTestCase{ + {true, 0, 0}, {false, 0, 0}, + + {true, 64, 32}, {false, 64, 32}, + + {true, 96, 32}, {false, 96, 64}, + + {true, 105, 41}, {false, 105, 64}, + } + + for _, tc := range testCases { + rank := rsa.Rank(tc.val, tc.bitPosition) + if rank != tc.expectedRank { + 
t.Fatalf("expected rank(%t, %d) to be %d but got %d", tc.val, tc.bitPosition, tc.expectedRank, rank) + } + } + +} + +func TestRSARank_singleCompleteChunk(t *testing.T) { + if wordSize != 64 { + t.Skip() + } + + initialNumberOfBits := wordSize * 4 + bv := newBitVector(initialNumberOfBits) + + bv.bits = []uint64{ + 0x8000000000000001, + 0xff0f30fffacea80d, + 0x90e0a0e0b0e0cf0c, + 0x3d0f064f7206f717, + } + + rsa := newRSABitVectorFromBitVector(bv) + + testCases := []rsaRankTestCase{ + {true, 0, 0}, {false, 0, 0}, + {true, 1, 1}, {false, 1, 0}, + {true, 2, 1}, {false, 2, 1}, + {true, 3, 1}, {false, 3, 2}, + {true, 62, 1}, {false, 62, 61}, + {true, 63, 1}, {false, 63, 62}, + + {true, 64, 2}, {false, 64, 62}, + {true, 65, 3}, {false, 65, 62}, + {true, 72, 10}, {false, 72, 62}, + {true, 127, 40}, {false, 127, 87}, + + {true, 128, 41}, {false, 128, 87}, + {true, 129, 42}, {false, 129, 87}, + {true, 130, 42}, {false, 130, 88}, + {true, 131, 42}, {false, 131, 89}, + {true, 132, 43}, {false, 132, 89}, + {true, 133, 43}, {false, 133, 90}, + {true, 159, 51}, {false, 159, 108}, + {true, 160, 51}, {false, 160, 109}, + {true, 161, 52}, {false, 161, 109}, + {true, 162, 52}, {false, 162, 110}, + {true, 163, 53}, {false, 163, 110}, + {true, 164, 54}, {false, 164, 110}, + {true, 165, 54}, {false, 165, 111}, + {true, 176, 57}, {false, 176, 119}, + {true, 177, 58}, {false, 177, 119}, + {true, 178, 59}, {false, 178, 119}, + {true, 179, 59}, {false, 179, 120}, + {true, 180, 59}, {false, 180, 121}, + {true, 183, 62}, {false, 183, 121}, + {true, 184, 63}, {false, 184, 121}, + {true, 185, 63}, {false, 185, 122}, + {true, 186, 63}, {false, 186, 123}, + {true, 187, 63}, {false, 187, 124}, + {true, 188, 63}, {false, 188, 125}, + {true, 189, 64}, {false, 189, 125}, + {true, 190, 65}, {false, 190, 125}, + {true, 191, 65}, {false, 191, 126}, + + {true, 192, 65}, {false, 192, 127}, + {true, 193, 65}, {false, 193, 128}, + {true, 194, 65}, {false, 194, 129}, + {true, 195, 66}, {false, 195, 129}, + 
{true, 196, 67}, {false, 196, 129}, + {true, 248, 94}, {false, 248, 154}, + {true, 249, 94}, {false, 249, 155}, + {true, 250, 94}, {false, 250, 156}, + {true, 251, 94}, {false, 251, 157}, + {true, 252, 95}, {false, 252, 157}, + {true, 253, 95}, {false, 253, 158}, + {true, 254, 96}, {false, 254, 158}, + {true, 255, 97}, {false, 255, 158}, + } + + for _, tc := range testCases { + rank := rsa.Rank(tc.val, tc.bitPosition) + if rank != tc.expectedRank { + t.Fatalf("expected rank(%t, %d) to be %d but got %d", tc.val, tc.bitPosition, tc.expectedRank, rank) + } + } + +} + +func TestRSARank_multipleChunks(t *testing.T) { + numBitsToTruncate := 17 + initialNumberOfBits := wordSize*15 - numBitsToTruncate + bv := newBitVector(initialNumberOfBits) + + bv.bits = []uint64{ + 0x0000000000000000, + 0xffffffffffffffff, + 0x0000000000000000, + 0xffffffffffffffff, + + 0xffffffffffffffff, + 0x0000000000000000, + 0xffffffffffffffff, + 0x0000000000000000, + + 0x0000000000000000, + 0xffffffffffffffff, + 0x0000000000000000, + 0xffffffffffffffff, + + 0xffffffffffffffff, + 0x0000000000000000, + 0xffffffffffffffff, // this should end up getting truncated + } + + rsa := newRSABitVectorFromBitVector(bv) + + testCases := []rsaRankTestCase{ + {true, 0, 0}, {false, 0, 0}, + + {true, 64, 0}, {false, 64, 64}, + {true, 128, 64}, {false, 128, 64}, + {true, 192, 64}, {false, 192, 128}, + {true, 256, 128}, {false, 256, 128}, + + {true, 320, 192}, {false, 256, 128}, + {true, 384, 192}, {false, 384, 192}, + {true, 448, 256}, {false, 448, 192}, + {true, 512, 256}, {false, 512, 256}, + + {true, 576, 256}, {false, 576, 320}, + {true, 640, 320}, {false, 640, 320}, + {true, 704, 320}, {false, 704, 384}, + {true, 768, 384}, {false, 768, 384}, + + {true, 832, 448}, {false, 832, 384}, + {true, 896, 448}, {false, 896, 448}, + {true, 896 + wordSize - numBitsToTruncate - 1, 448 + wordSize - numBitsToTruncate - 1}, {false, 896 + wordSize - numBitsToTruncate - 1, 448}, + } + + for _, tc := range testCases { + rank := 
rsa.Rank(tc.val, tc.bitPosition) + if rank != tc.expectedRank { + t.Fatalf("expected rank(%t, %d) to be %d but got %d", tc.val, tc.bitPosition, tc.expectedRank, rank) + } + } + +} + +type rsaSelectTestCase struct { + val bool + rank int + expectedPosition int +} + +func TestRSASelect(t *testing.T) { + bitsToTruncate := 17 + initialNumberOfBits := wordSize*4 - bitsToTruncate + bv := newBitVector(initialNumberOfBits) + + bv.bits = []uint64{ + 0x8010000000010000, + 0xfff1ffffffffffff, + 0x0000010000000000, + 0xffffffffffffffff, + } + + rsa := newRSABitVectorFromBitVector(bv) + + testCases := []rsaSelectTestCase{ + {true, 0, 0}, + {true, 1, 11}, + {true, 2, 47}, + {false, 0, 1}, + {false, 1, 2}, + {false, 3, 4}, + {false, 8, 9}, + {false, 9, 10}, + {false, 10, 12}, + {false, 11, 13}, + {false, 60, 63}, + + {true, 3, 64}, + {true, 9, 70}, + {true, 13, 74}, + {true, 14, 75}, + {true, 15, 79}, + {true, 16, 80}, + {true, 63, 127}, + {false, 61, 76}, + {false, 62, 77}, + {false, 63, 78}, + + {true, 64, 151}, + {false, 64, 128}, + {false, 126, 191}, + + {true, 65, 192}, + {true, 111, 238}, + } + + for _, tc := range testCases { + position, ok := rsa.Select(tc.val, tc.rank) + + if !ok { + t.Fatalf("expected select(%t, %d) to be %d but went out of range", tc.val, tc.rank, tc.expectedPosition) + } + + if position != tc.expectedPosition { + t.Fatalf("expected select(%t, %d) to be %d but got %d", tc.val, tc.rank, tc.expectedPosition, position) + } + } +} + +func TestRSASelect_notOk(t *testing.T) { + bitsToTruncate := 17 + initialNumberOfBits := wordSize*4 - bitsToTruncate + bv := newBitVector(initialNumberOfBits) + + bv.bits = []uint64{ + 0x8010000000010000, + 0xfff1ffffffffffff, + 0x0000010000000000, + 0xffffffffffffffff, + } + + rsa := newRSABitVectorFromBitVector(bv) + + if _, ok := rsa.Select(true, -1); ok { + t.Fatalf("expected select(true, -1) to be not ok but somehow returned a value") + } + + pos, ok := rsa.Select(true, 111) + if !ok { + t.Fatalf("expected select(true, 
111) to be ok but somehow got not ok") + } + + if pos != 238 { + t.Fatalf("expected select(true, 111) to be 238 ok but got") + } + + if _, ok := rsa.Select(true, 239); ok { + t.Fatalf("expected select(true, -1) to be not ok but somehow returned a value") + } +} diff --git a/bwt/wavelet.go b/bwt/wavelet.go new file mode 100644 index 000000000..ffb49871c --- /dev/null +++ b/bwt/wavelet.go @@ -0,0 +1,164 @@ +package bwt + +import ( + "math" + + "golang.org/x/exp/slices" +) + +type WaveletTree struct { + nodes []node + alpha []charInfo +} + +type node struct { + data rsaBitVector + char byte +} + +func (n node) isLeaf() bool { + return n.char != 0 +} + +type charInfo struct { + char byte + maxRank int + path bitvector +} + +func NewWaveletTreeFromString(str string) WaveletTree { + bytes := []byte(str) + + alpha := getCharInfoDescByRank(bytes) + nodes := buildWaveletTree(0, alpha, bytes) + + return WaveletTree{ + nodes: nodes, + } +} + +func buildWaveletTree(currentLevel int, alpha []charInfo, bytes []byte) []node { + if len(alpha) == 0 { + return nil + } + + if len(alpha) == 1 { + return []node{ + {char: alpha[0].char}, + } + } + + if len(alpha) == 2 { + return []node{ + {char: alpha[0].char}, + {char: alpha[1].char}, + } + } + + leftAlpha, rightAlpha := partitionAlpha(currentLevel, alpha) + + var leftBytes []byte + var rightBytes []byte + + bv := newBitVector(len(bytes)) + for i := range bytes { + if isInAlpha(rightAlpha, bytes[i]) { + bv.setBit(i, true) + rightBytes = append(rightBytes, bytes[i]) + } else { + leftBytes = append(leftBytes, bytes[i]) + } + } + + n := node{ + data: newRSABitVectorFromBitVector(bv), + } + + leftTree := buildWaveletTree(currentLevel+1, leftAlpha, leftBytes) + rightTree := buildWaveletTree(currentLevel+1, rightAlpha, rightBytes) + + tree := append([]node{n}, leftTree...) + tree = append(tree, rightTree...) 
+ + return tree +} + +func isInAlpha(alpha []charInfo, b byte) bool { + for _, a := range alpha { + if a.char == b { + return true + } + } + return false +} + +// TODO: talk about arranging OG alpha such that we minimize memory +func partitionAlpha(currentLevel int, alpha []charInfo) (left []charInfo, right []charInfo) { + for _, a := range alpha { + if a.path.getBit(a.path.len() - 1 - currentLevel) { + right = append(right, a) + } else { + left = append(left, a) + } + } + + return left, right +} + +func getLeft(nodePos int) int { + return nodePos*2 + 1 +} + +func getRight(nodePos int) int { + return nodePos*2 + 1 +} + +func getParent(nodePos int) int { + return (nodePos + 1) / 2 +} + +// alphabets are expected to be small for real usecases +func getCharInfoDescByRank(b []byte) []charInfo { + ranks := make(map[byte]int) + for i := 0; i < len(b); i++ { + if _, ok := ranks[b[i]]; ok { + ranks[b[i]] += 1 + } else { + ranks[b[i]] = 0 + } + } + + var sortedInfo []charInfo + for k := range ranks { + sortedInfo = append(sortedInfo, charInfo{char: k, maxRank: ranks[k]}) + } + + slices.SortFunc(sortedInfo, func(a, b charInfo) bool { + return a.maxRank > b.maxRank + }) + + numOfBits := getTreeHeight(sortedInfo) + for i := range sortedInfo { + bv := newBitVector(numOfBits) + encodeCharPathIntoBitVector(bv, uint64(i)) + sortedInfo[i].path = bv + } + + return sortedInfo +} + +func encodeCharPathIntoBitVector(bv bitvector, n uint64) { + shift := 0 + for n>>shift > 0 { + if n>>shift%2 == 1 { + bv.setBit(bv.len()-1-shift, true) + } else { + bv.setBit(bv.len()-1-shift, false) + } + shift++ + } +} + +func getTreeHeight(alpha []charInfo) int { + return int(math.Log2(float64(len(alpha)))) + 1 +} diff --git a/bwt/wavelet_test.go b/bwt/wavelet_test.go new file mode 100644 index 000000000..2e531b74f --- /dev/null +++ b/bwt/wavelet_test.go @@ -0,0 +1 @@ +package bwt From a9ba8aed17b1b64772f2a88f6b4c25fff7803e3d Mon Sep 17 00:00:00 2001 From: Trenton Date: Sun, 3 Dec 2023 00:07:51 -0500 
Subject: [PATCH 10/60] Simple wavelet tree with access --- bwt/rsa_bitvector_test.go | 1 + bwt/wavelet.go | 70 +++++++++++++++++++++++++-------------- bwt/wavelet_test.go | 67 +++++++++++++++++++++++++++++++++++++ 3 files changed, 113 insertions(+), 25 deletions(-) diff --git a/bwt/rsa_bitvector_test.go b/bwt/rsa_bitvector_test.go index a61a853ea..f273292cf 100644 --- a/bwt/rsa_bitvector_test.go +++ b/bwt/rsa_bitvector_test.go @@ -26,6 +26,7 @@ func TestRSARank_singlePartialChunk(t *testing.T) { testCases := []rsaRankTestCase{ {true, 0, 0}, {false, 0, 0}, + {true, 1, 1}, {false, 0, 0}, {true, 64, 32}, {false, 64, 32}, diff --git a/bwt/wavelet.go b/bwt/wavelet.go index ffb49871c..1bd8d06bf 100644 --- a/bwt/wavelet.go +++ b/bwt/wavelet.go @@ -6,14 +6,35 @@ import ( "golang.org/x/exp/slices" ) -type WaveletTree struct { - nodes []node +type waveletTree struct { + root *node alpha []charInfo } +// TODO: figure out empty nodes case +// TODO: figure out out of bounds case +func (wt waveletTree) Access(i int) byte { + currNode := wt.root + for !currNode.isLeaf() { + bit := currNode.data.Access(i) + i = currNode.data.Rank(bit, i) + if bit { + currNode = currNode.right + } else { + currNode = currNode.left + } + } + return currNode.char +} + +// TODO: talk about how we could probably greaty improve performance with one big bit vector that +// represents the whole tree by concatenation the level order traversal of each node's bits type node struct { - data rsaBitVector - char byte + data rsaBitVector + char byte + parent *node + left *node + right *node } func (n node) isLeaf() bool { @@ -26,33 +47,25 @@ type charInfo struct { path bitvector } -func NewWaveletTreeFromString(str string) WaveletTree { +func NewWaveletTreeFromString(str string) waveletTree { bytes := []byte(str) alpha := getCharInfoDescByRank(bytes) - nodes := buildWaveletTree(0, alpha, bytes) + root := buildWaveletTree(0, alpha, bytes) - return WaveletTree{ - nodes: nodes, + return waveletTree{ + root: root, + 
alpha: alpha, } } -func buildWaveletTree(currentLevel int, alpha []charInfo, bytes []byte) []node { +func buildWaveletTree(currentLevel int, alpha []charInfo, bytes []byte) *node { if len(alpha) == 0 { return nil } if len(alpha) == 1 { - return []node{ - {char: alpha[0].char}, - } - } - - if len(alpha) == 2 { - return []node{ - {char: alpha[0].char}, - {char: alpha[1].char}, - } + return &node{char: alpha[0].char} } leftAlpha, rightAlpha := partitionAlpha(currentLevel, alpha) @@ -70,17 +83,24 @@ func buildWaveletTree(currentLevel int, alpha []charInfo, bytes []byte) []node { } } - n := node{ + root := &node{ data: newRSABitVectorFromBitVector(bv), } leftTree := buildWaveletTree(currentLevel+1, leftAlpha, leftBytes) rightTree := buildWaveletTree(currentLevel+1, rightAlpha, rightBytes) - tree := append([]node{n}, leftTree...) - tree = append(tree, rightTree...) + root.left = leftTree + root.right = rightTree + + if leftTree != nil { + leftTree.parent = root + } + if rightTree != nil { + rightTree.parent = root + } - return tree + return root } func isInAlpha(alpha []charInfo, b byte) bool { @@ -110,7 +130,7 @@ func getLeft(nodePos int) int { } func getRight(nodePos int) int { - return nodePos*2 + 1 + return nodePos*2 + 2 } func getParent(nodePos int) int { @@ -133,7 +153,7 @@ func getCharInfoDescByRank(b []byte) []charInfo { sortedInfo = append(sortedInfo, charInfo{char: k, maxRank: ranks[k]}) } - slices.SortFunc(sortedInfo, func(a, b charInfo) bool { + slices.SortStableFunc(sortedInfo, func(a, b charInfo) bool { return a.maxRank > b.maxRank }) diff --git a/bwt/wavelet_test.go b/bwt/wavelet_test.go index 2e531b74f..17e675b92 100644 --- a/bwt/wavelet_test.go +++ b/bwt/wavelet_test.go @@ -1 +1,68 @@ package bwt + +import "testing" + +type WaveletTreeAccessTestCase struct { + pos int + expected string +} + +func TestWaveletTree_Access(t *testing.T) { + testStr := "AAAACCCCTTTTGGGG" + "ACTG" + "TGCA" + "TTAA" + "CCGG" + "GGGGTTTTCCCCAAAA" + wt := 
NewWaveletTreeFromString(testStr) + + testCases := []WaveletTreeAccessTestCase{ + {0, "A"}, + {3, "A"}, + {4, "C"}, + {7, "C"}, + {8, "T"}, + {9, "T"}, + {11, "T"}, + {12, "G"}, + {13, "G"}, + {15, "G"}, + + {16, "A"}, + {17, "C"}, + {18, "T"}, + {19, "G"}, + + {20, "T"}, + {21, "G"}, + {22, "C"}, + {23, "A"}, + + {24, "T"}, + {25, "T"}, + {26, "A"}, + {27, "A"}, + + {28, "C"}, + {29, "C"}, + {30, "G"}, + {31, "G"}, + + {32, "G"}, + {35, "G"}, + {36, "T"}, + {39, "T"}, + {40, "C"}, + {41, "C"}, + {43, "C"}, + {44, "A"}, + {46, "A"}, + {47, "A"}, + } + + for _, tc := range testCases { + actual := string(wt.Access(tc.pos)) + if actual != tc.expected { + t.Fatalf("expected access(%d) to be %s but got %s", tc.pos, tc.expected, actual) + } + } +} + +func TestWaveletTree_Rank(t *testing.T) { + +} From 79fefe2b1256364d6fadf8a4275c5998e0240ef1 Mon Sep 17 00:00:00 2001 From: Trenton Date: Sun, 3 Dec 2023 02:45:07 -0500 Subject: [PATCH 11/60] wavelet fix access, add select, fix rsa bitvector select --- bwt/rsa_bitvector.go | 4 +-- bwt/rsa_bitvector_test.go | 69 +++++++++++++++++++-------------------- bwt/wavelet.go | 45 +++++++++++++++++++++---- bwt/wavelet_test.go | 51 +++++++++++++++++++++++++++++ 4 files changed, 125 insertions(+), 44 deletions(-) diff --git a/bwt/rsa_bitvector.go b/bwt/rsa_bitvector.go index 382935c52..815bbd9b1 100644 --- a/bwt/rsa_bitvector.go +++ b/bwt/rsa_bitvector.go @@ -124,11 +124,11 @@ func buildSelectMaps(inBv bitvector) (oneSelectMap, zeroSelectMap map[int]int) { for i := 0; i < inBv.len(); i++ { bit := inBv.getBit(i) if bit { - oneSelectMap[oneCount] = i oneCount++ + oneSelectMap[oneCount] = i } else { - zeroSelectMap[zeroCount] = i zeroCount++ + zeroSelectMap[zeroCount] = i } } diff --git a/bwt/rsa_bitvector_test.go b/bwt/rsa_bitvector_test.go index f273292cf..56e203142 100644 --- a/bwt/rsa_bitvector_test.go +++ b/bwt/rsa_bitvector_test.go @@ -26,7 +26,6 @@ func TestRSARank_singlePartialChunk(t *testing.T) { testCases := []rsaRankTestCase{ 
{true, 0, 0}, {false, 0, 0}, - {true, 1, 1}, {false, 0, 0}, {true, 64, 32}, {false, 64, 32}, @@ -207,35 +206,35 @@ func TestRSASelect(t *testing.T) { rsa := newRSABitVectorFromBitVector(bv) testCases := []rsaSelectTestCase{ - {true, 0, 0}, - {true, 1, 11}, - {true, 2, 47}, - {false, 0, 1}, - {false, 1, 2}, - {false, 3, 4}, - {false, 8, 9}, - {false, 9, 10}, - {false, 10, 12}, - {false, 11, 13}, - {false, 60, 63}, - - {true, 3, 64}, - {true, 9, 70}, - {true, 13, 74}, - {true, 14, 75}, - {true, 15, 79}, - {true, 16, 80}, - {true, 63, 127}, - {false, 61, 76}, - {false, 62, 77}, - {false, 63, 78}, - - {true, 64, 151}, - {false, 64, 128}, - {false, 126, 191}, - - {true, 65, 192}, - {true, 111, 238}, + {true, 1, 0}, + {true, 2, 11}, + {true, 3, 47}, + {false, 1, 1}, + {false, 2, 2}, + {false, 4, 4}, + {false, 9, 9}, + {false, 10, 10}, + {false, 11, 12}, + {false, 12, 13}, + {false, 61, 63}, + + {true, 4, 64}, + {true, 10, 70}, + {true, 14, 74}, + {true, 15, 75}, + {true, 16, 79}, + {true, 17, 80}, + {true, 64, 127}, + {false, 62, 76}, + {false, 63, 77}, + {false, 64, 78}, + + {true, 65, 151}, + {false, 65, 128}, + {false, 127, 191}, + + {true, 66, 192}, + {true, 112, 238}, } for _, tc := range testCases { @@ -265,20 +264,20 @@ func TestRSASelect_notOk(t *testing.T) { rsa := newRSABitVectorFromBitVector(bv) - if _, ok := rsa.Select(true, -1); ok { - t.Fatalf("expected select(true, -1) to be not ok but somehow returned a value") + if _, ok := rsa.Select(true, 0); ok { + t.Fatalf("expected select(true, 0) to be not ok but somehow returned a value") } - pos, ok := rsa.Select(true, 111) + pos, ok := rsa.Select(true, 112) if !ok { t.Fatalf("expected select(true, 111) to be ok but somehow got not ok") } if pos != 238 { - t.Fatalf("expected select(true, 111) to be 238 ok but got") + t.Fatalf("expected select(true, 111) to be 238 but got %d", pos) } if _, ok := rsa.Select(true, 239); ok { - t.Fatalf("expected select(true, -1) to be not ok but somehow returned a value") + 
t.Fatalf("expected select(true, 239) to be not ok but somehow returned a value") } } diff --git a/bwt/wavelet.go b/bwt/wavelet.go index 1bd8d06bf..5efd0be6d 100644 --- a/bwt/wavelet.go +++ b/bwt/wavelet.go @@ -14,17 +14,48 @@ type waveletTree struct { // TODO: figure out empty nodes case // TODO: figure out out of bounds case func (wt waveletTree) Access(i int) byte { - currNode := wt.root - for !currNode.isLeaf() { - bit := currNode.data.Access(i) - i = currNode.data.Rank(bit, i) + curr := wt.root + for !curr.isLeaf() { + bit := curr.data.Access(i) + i = curr.data.Rank(bit, i) if bit { - currNode = currNode.right + curr = curr.right } else { - currNode = currNode.left + curr = curr.left } } - return currNode.char + return curr.char +} + +// TODO: deal with bad lookup char +// TODO: deal with somehow bad path +func (wt waveletTree) Rank(char byte, i int) int { + curr := wt.root + ci := wt.lookupCharInfo(char) + + level := 0 + var rank int + for !curr.isLeaf() { + pathBit := ci.path.getBit(ci.path.len() - 1 - level) + rank = curr.data.Rank(pathBit, i) + if pathBit { + curr = curr.right + } else { + curr = curr.left + } + level++ + i = rank + } + return rank +} + +func (wt waveletTree) lookupCharInfo(char byte) charInfo { + for i := range wt.alpha { + if wt.alpha[i].char == char { + return wt.alpha[i] + } + } + panic("better messaging or handling") } // TODO: talk about how we could probably greaty improve performance with one big bit vector that diff --git a/bwt/wavelet_test.go b/bwt/wavelet_test.go index 17e675b92..def950d47 100644 --- a/bwt/wavelet_test.go +++ b/bwt/wavelet_test.go @@ -63,6 +63,57 @@ func TestWaveletTree_Access(t *testing.T) { } } +type WaveletTreeRankTestCase struct { + char string + pos int + expected int +} + func TestWaveletTree_Rank(t *testing.T) { + testStr := "AAAACCCCTTTTGGGG" + "ACTG" + "TGCA" + "TTAA" + "CCGG" + "GGGGTTTTCCCCAAAA" + wt := NewWaveletTreeFromString(testStr) + + testCases := []WaveletTreeRankTestCase{ + {"A", 0, 0}, + {"A", 
2, 2}, + {"A", 3, 3}, + {"A", 8, 4}, + {"C", 4, 0}, + {"C", 6, 2}, + {"C", 12, 4}, + {"T", 2, 0}, + {"T", 8, 0}, + {"T", 12, 4}, + {"T", 15, 4}, + {"G", 15, 3}, + + {"A", 16, 4}, + {"A", 17, 5}, + {"G", 16, 4}, + + {"T", 20, 5}, + {"A", 23, 5}, + {"T", 24, 6}, + {"T", 27, 8}, + + {"C", 28, 6}, + {"G", 31, 7}, + + {"G", 32, 8}, + {"G", 33, 9}, + {"T", 36, 8}, + {"T", 38, 10}, + {"C", 40, 8}, + {"C", 43, 11}, + {"A", 44, 8}, + {"A", 47, 11}, + } + + for _, tc := range testCases { + actual := wt.Rank(tc.char[0], tc.pos) + if actual != tc.expected { + t.Fatalf("expected rank(%s, %d) to be %d but got %d", tc.char, tc.pos, tc.expected, actual) + } + } } From 8280e7a8be20f35f09c32ac3797340b37a7148bc Mon Sep 17 00:00:00 2001 From: Trenton Date: Sun, 3 Dec 2023 22:28:07 -0500 Subject: [PATCH 12/60] fix select again, select for wavelet tree --- bwt/rsa_bitvector.go | 4 +-- bwt/rsa_bitvector_test.go | 64 +++++++++++++++++++-------------------- bwt/wavelet.go | 32 ++++++++++++++++++-- bwt/wavelet_test.go | 48 +++++++++++++++++++++++++++++ 4 files changed, 112 insertions(+), 36 deletions(-) diff --git a/bwt/rsa_bitvector.go b/bwt/rsa_bitvector.go index 815bbd9b1..382935c52 100644 --- a/bwt/rsa_bitvector.go +++ b/bwt/rsa_bitvector.go @@ -124,11 +124,11 @@ func buildSelectMaps(inBv bitvector) (oneSelectMap, zeroSelectMap map[int]int) { for i := 0; i < inBv.len(); i++ { bit := inBv.getBit(i) if bit { - oneCount++ oneSelectMap[oneCount] = i + oneCount++ } else { - zeroCount++ zeroSelectMap[zeroCount] = i + zeroCount++ } } diff --git a/bwt/rsa_bitvector_test.go b/bwt/rsa_bitvector_test.go index 56e203142..75cdd672e 100644 --- a/bwt/rsa_bitvector_test.go +++ b/bwt/rsa_bitvector_test.go @@ -206,35 +206,35 @@ func TestRSASelect(t *testing.T) { rsa := newRSABitVectorFromBitVector(bv) testCases := []rsaSelectTestCase{ - {true, 1, 0}, - {true, 2, 11}, - {true, 3, 47}, - {false, 1, 1}, - {false, 2, 2}, - {false, 4, 4}, - {false, 9, 9}, - {false, 10, 10}, - {false, 11, 12}, - {false, 12, 
13}, - {false, 61, 63}, - - {true, 4, 64}, - {true, 10, 70}, - {true, 14, 74}, - {true, 15, 75}, - {true, 16, 79}, - {true, 17, 80}, - {true, 64, 127}, - {false, 62, 76}, - {false, 63, 77}, - {false, 64, 78}, - - {true, 65, 151}, - {false, 65, 128}, - {false, 127, 191}, - - {true, 66, 192}, - {true, 112, 238}, + {true, 0, 0}, + {true, 1, 11}, + {true, 2, 47}, + {false, 0, 1}, + {false, 1, 2}, + {false, 3, 4}, + {false, 8, 9}, + {false, 9, 10}, + {false, 10, 12}, + {false, 11, 13}, + {false, 60, 63}, + + {true, 3, 64}, + {true, 9, 70}, + {true, 13, 74}, + {true, 14, 75}, + {true, 15, 79}, + {true, 16, 80}, + {true, 63, 127}, + {false, 61, 76}, + {false, 62, 77}, + {false, 63, 78}, + + {true, 64, 151}, + {false, 64, 128}, + {false, 126, 191}, + + {true, 65, 192}, + {true, 111, 238}, } for _, tc := range testCases { @@ -264,11 +264,11 @@ func TestRSASelect_notOk(t *testing.T) { rsa := newRSABitVectorFromBitVector(bv) - if _, ok := rsa.Select(true, 0); ok { - t.Fatalf("expected select(true, 0) to be not ok but somehow returned a value") + if _, ok := rsa.Select(true, -1); ok { + t.Fatalf("expected select(true, -1) to be not ok but somehow returned a value") } - pos, ok := rsa.Select(true, 112) + pos, ok := rsa.Select(true, 111) if !ok { t.Fatalf("expected select(true, 111) to be ok but somehow got not ok") } diff --git a/bwt/wavelet.go b/bwt/wavelet.go index 5efd0be6d..822695ede 100644 --- a/bwt/wavelet.go +++ b/bwt/wavelet.go @@ -32,7 +32,6 @@ func (wt waveletTree) Access(i int) byte { func (wt waveletTree) Rank(char byte, i int) int { curr := wt.root ci := wt.lookupCharInfo(char) - level := 0 var rank int for !curr.isLeaf() { @@ -49,6 +48,32 @@ func (wt waveletTree) Rank(char byte, i int) int { return rank } +func (wt waveletTree) Select(char byte, rank int) int { + curr := wt.root + ci := wt.lookupCharInfo(char) + level := 0 + + for !curr.isLeaf() { + pathBit := ci.path.getBit(ci.path.len() - 1 - level) + if pathBit { + curr = curr.right + } else { + curr = 
curr.left + } + level++ + } + + for curr.parent != nil { + curr = curr.parent + level-- + pathBit := ci.path.getBit(ci.path.len() - 1 - level) + // TODO: do we really need the ok on the select? + rank, _ = curr.data.Select(pathBit, rank) + } + + return rank +} + func (wt waveletTree) lookupCharInfo(char byte) charInfo { for i := range wt.alpha { if wt.alpha[i].char == char { @@ -184,7 +209,10 @@ func getCharInfoDescByRank(b []byte) []charInfo { sortedInfo = append(sortedInfo, charInfo{char: k, maxRank: ranks[k]}) } - slices.SortStableFunc(sortedInfo, func(a, b charInfo) bool { + slices.SortFunc(sortedInfo, func(a, b charInfo) bool { + if a.maxRank == b.maxRank { + return a.char < b.char + } return a.maxRank > b.maxRank }) diff --git a/bwt/wavelet_test.go b/bwt/wavelet_test.go index def950d47..76fd4e6bc 100644 --- a/bwt/wavelet_test.go +++ b/bwt/wavelet_test.go @@ -117,3 +117,51 @@ func TestWaveletTree_Rank(t *testing.T) { } } } + +type WaveletTreeSelectTestCase struct { + char string + rank int + expected int +} + +func TestWaveletTree_Select(t *testing.T) { + testStr := "AAAACCCCTTTTGGGG" + "ACTG" + "TGCA" + "TTAA" + "CCGG" + "GGGGTTTTCCCCAAAA" + wt := NewWaveletTreeFromString(testStr) + + testCases := []WaveletTreeSelectTestCase{ + {"A", 0, 0}, + {"A", 1, 1}, + {"A", 2, 2}, + {"A", 3, 3}, + {"C", 0, 4}, + {"C", 3, 7}, + + {"A", 4, 16}, + {"C", 4, 17}, + {"T", 4, 18}, + {"G", 4, 19}, + + {"T", 5, 20}, + {"G", 5, 21}, + {"C", 5, 22}, + {"A", 5, 23}, + + {"T", 6, 24}, + {"T", 7, 25}, + {"A", 6, 26}, + + {"C", 6, 28}, + {"G", 6, 30}, + {"G", 7, 31}, + + {"G", 8, 32}, + {"A", 11, 47}, + } + + for _, tc := range testCases { + actual := wt.Select(tc.char[0], tc.rank) + if actual != tc.expected { + t.Fatalf("expected select(%s, %d) to be %d but got %d", tc.char, tc.rank, tc.expected, actual) + } + } +} From 6802a87d3409caab23811e443d93a46a07e03472 Mon Sep 17 00:00:00 2001 From: Trenton Date: Mon, 4 Dec 2023 23:35:25 -0500 Subject: [PATCH 13/60] simple FM count --- 
bwt/bwt.go | 163 ++++++++++++++++++------------------------- bwt/bwt_test.go | 38 +++++----- bwt/rsa_bitvector.go | 2 - 3 files changed, 85 insertions(+), 118 deletions(-) diff --git a/bwt/bwt.go b/bwt/bwt.go index ff5477954..dcc6cd923 100644 --- a/bwt/bwt.go +++ b/bwt/bwt.go @@ -1,136 +1,105 @@ package bwt import ( + "strings" + "golang.org/x/exp/slices" ) const nullChar = "$" -type window struct { - char byte - start int - end int -} - -func (w window) includes(innerWindow window) bool { - return w.start <= innerWindow.start && innerWindow.end <= w.end -} - // BWT Burrow Wheeler Transform // Data structure that compactly represents any sequence of characters and // allows for sub sequence querying. type BWT struct { - // First col - f []window - // Last col - l map[byte][]window - // index of the original sequence in the suffix array from BTW construction - indexOfOriginalSequenceFromSuffixArray int + skipList []skipEntry + l waveletTree } -func New(sequence string) BWT { - f, l, idx := build(sequence) - return BWT{ - f: f, - l: l, - indexOfOriginalSequenceFromSuffixArray: idx, +func (bwt BWT) Count(pattern string) int { + skip, ok := bwt.lookupSkip(pattern[len(pattern)-1]) + if !ok { + return 0 } -} - -func (b BWT) QueryExistence(substr string) bool { - for i := len(substr) - 1; i >= 0; i-- { - win, ok := b.getFWindow(substr[i]) - if !ok { - return false + nextRange := skip.openEndedInterval + for i := 1; i < len(pattern); i++ { + if nextRange.end-nextRange.start <= 0 { + return 0 } - if i == 0 && ok { - return true - } + currChar := pattern[len(pattern)-1-i] - valid := b.charInLExistsInFWindow(win, substr[i-1]) + currCharRangeStart := bwt.l.Rank(currChar, nextRange.start) + currCharRangeEnd := bwt.l.Rank(currChar, nextRange.end) - if !valid { - return false + nextCharSkip, ok := bwt.lookupSkip(currChar) + if !ok { + return 0 } - } - // shouldn't be getting here - return false + nextRange.start = nextCharSkip.openEndedInterval.start + currCharRangeStart + 
nextRange.end = nextCharSkip.openEndedInterval.start + currCharRangeEnd + } + return nextRange.end - nextRange.start } -func (b BWT) charInLExistsInFWindow(w window, char byte) bool { - if windows, ok := b.l[char]; ok { - for i := range windows { - if w.includes(windows[i]) { - return true - } +func (bwt BWT) lookupSkip(c byte) (entry skipEntry, ok bool) { + for i := range bwt.skipList { + if bwt.skipList[i].char == c { + return bwt.skipList[i], true } } - return false + return skipEntry{}, false } -// Alphabets should be small -func (b BWT) getFWindow(char byte) (w window, ok bool) { - for i := range b.f { - if b.f[i].char == char { - return b.f[i], true - } - } - return window{}, false +type interval struct { + start int + end int } -func build(s string) (f []window, l map[byte][]window, indexOfOriginalSequenceInSuffixArray int) { - s += nullChar - prefixArray := make([]string, len(s)) - for i := 0; i < len(s); i++ { - prefixArray[i] = s[len(s)-i-1:] +type skipEntry struct { + char byte + openEndedInterval interval +} + +func New(sequence string) BWT { + sequence += nullChar + + prefixArray := make([]string, len(sequence)) + for i := 0; i < len(sequence); i++ { + prefixArray[i] = sequence[len(sequence)-i-1:] } slices.Sort(prefixArray) - l = make(map[byte][]window) - prevFChar := prefixArray[0][0] - prevFWin := window{char: prevFChar, start: 0} - prevLChar := s[getBWTIndex(len(s), len(prefixArray[0]))] - prevLWin := window{char: prevLChar, start: 0} - for i := 1; i < len(prefixArray); i++ { - currFChar := prefixArray[i][0] - if prevFChar != currFChar { - prevFWin.end = i - 1 - f = append(f, prevFWin) - prevFChar = currFChar - prevFWin = window{char: currFChar, start: i} - } - - currLChar := s[getBWTIndex(len(s), len(prefixArray[i]))] - if prevLChar != currLChar { - prevLWin.end = i - 1 - if _, ok := l[prevLChar]; ok { - l[prevLChar] = append(l[prevLChar], prevLWin) - } else { - l[prevLChar] = []window{prevLWin} - } - prevLChar = currLChar - prevLWin = window{char: 
currLChar, start: i} - } - if len(s) == len(prefixArray[i]) { - indexOfOriginalSequenceInSuffixArray = i - } - } - prevFWin.end = len(s) - 1 - f = append(f, prevFWin) - prevLWin.end = len(s) - 1 - if _, ok := l[prevLChar]; ok { - l[prevLChar] = append(l[prevLChar], prevLWin) - } else { - l[prevLChar] = []window{prevLWin} + lastColBuilder := strings.Builder{} + for i := 0; i < len(prefixArray); i++ { + currChar := sequence[getBWTIndex(len(sequence), len(prefixArray[i]))] + lastColBuilder.WriteByte(currChar) } - if indexOfOriginalSequenceInSuffixArray == 0 { - indexOfOriginalSequenceInSuffixArray = len(s) - 1 + + return BWT{ + skipList: buildSkipList(prefixArray), + l: NewWaveletTreeFromString(lastColBuilder.String()), } +} - return f, l, indexOfOriginalSequenceInSuffixArray +func buildSkipList(prefixArray []string) []skipEntry { + prevChar := prefixArray[0][0] + skipList := []skipEntry{{char: prevChar, openEndedInterval: interval{start: 0}}} + for i := 1; i < len(prefixArray); i++ { + currChar := prefixArray[i][0] + if currChar != prevChar { + skipList[len(skipList)-1].openEndedInterval.end = i + skipList = append(skipList, skipEntry{ + char: currChar, + openEndedInterval: interval{start: i}, + }) + prevChar = currChar + } + } + skipList[len(skipList)-1].openEndedInterval.end = len(prefixArray) + return skipList } func getBWTIndex(lenOfSequenceBeingBuilt, lenOfSuffixArrayVisited int) int { diff --git a/bwt/bwt_test.go b/bwt/bwt_test.go index 922829a98..10f76e17f 100644 --- a/bwt/bwt_test.go +++ b/bwt/bwt_test.go @@ -4,32 +4,32 @@ import ( "testing" ) -type QueryTest struct { +type BWTCountTestCase struct { seq string - expected bool + expected int } -func TestQueryBWT(t *testing.T) { +func TestBWT_Count(t *testing.T) { bwt := New("BANANA") - testTable := []QueryTest{ - {"NANA", true}, - {"ANA", true}, - {"NA", true}, - {"B", true}, - {"N", true}, - {"BA", true}, - {"ANANA", true}, - {"QWERTY", false}, - {"ANANANA", false}, - {"ABCD", false}, - {"ABA", false}, + 
testTable := []BWTCountTestCase{ + {"NANA", 1}, + {"ANA", 2}, + {"NA", 2}, + {"B", 1}, + {"N", 2}, + {"BA", 1}, + {"ANANA", 1}, + {"QWERTY", 0}, + {"ANANANA", 0}, + {"ABCD", 0}, + {"ABA", 0}, } for _, v := range testTable { - res := bwt.QueryExistence(v.seq) - if res != v.expected { - t.Fatalf("Test=%s ExpectedQueryExistence=%v Received=%v", v.seq, v.expected, res) + count := bwt.Count(v.seq) + if count != v.expected { + t.Fatalf("seq=%s expectedCount=%v actualCount=%v", v.seq, v.expected, count) } } } @@ -64,6 +64,6 @@ func BenchmarkBWTQueryPower12(b *testing.B) { //go:noinline func BaseBenchmarkBWTQuery(bwt BWT, seq string, b *testing.B) { for n := 0; n < b.N; n++ { - bwt.QueryExistence(seq) + bwt.Count(seq) } } diff --git a/bwt/rsa_bitvector.go b/bwt/rsa_bitvector.go index 382935c52..9f7a4883b 100644 --- a/bwt/rsa_bitvector.go +++ b/bwt/rsa_bitvector.go @@ -29,8 +29,6 @@ func newRSABitVectorFromBitVector(bv bitvector) rsaBitVector { } func (rsa rsaBitVector) Rank(val bool, i int) int { - rsa.bv.checkBounds(i) - chunkPos := (i / rsa.jrBitsPerChunk) chunk := rsa.jrc[chunkPos] From 5ee364b03a062fbfd83d75b265ce889f4864c774 Mon Sep 17 00:00:00 2001 From: Trenton Date: Tue, 5 Dec 2023 23:24:11 -0500 Subject: [PATCH 14/60] got count working, but had to throw out jacobsons --- bwt/bwt.go | 33 ++++++------ bwt/bwt_test.go | 28 +++++----- bwt/rsa_bitvector.go | 61 +++++++++++++++------- bwt/rsa_bitvector_test.go | 106 ++++++++++++++++++++++++++------------ bwt/wavelet_test.go | 31 ++++++++++- 5 files changed, 176 insertions(+), 83 deletions(-) diff --git a/bwt/bwt.go b/bwt/bwt.go index dcc6cd923..519180e57 100644 --- a/bwt/bwt.go +++ b/bwt/bwt.go @@ -17,30 +17,21 @@ type BWT struct { } func (bwt BWT) Count(pattern string) int { - skip, ok := bwt.lookupSkip(pattern[len(pattern)-1]) - if !ok { - return 0 - } - nextRange := skip.openEndedInterval - for i := 1; i < len(pattern); i++ { - if nextRange.end-nextRange.start <= 0 { + searchRange := interval{start: 0, end: 
bwt.getLenOfOriginalString()} + for i := 0; i < len(pattern); i++ { + if searchRange.end-searchRange.start <= 0 { return 0 } - currChar := pattern[len(pattern)-1-i] - - currCharRangeStart := bwt.l.Rank(currChar, nextRange.start) - currCharRangeEnd := bwt.l.Rank(currChar, nextRange.end) - - nextCharSkip, ok := bwt.lookupSkip(currChar) + c := pattern[len(pattern)-1-i] + skip, ok := bwt.lookupSkip(c) if !ok { return 0 } - - nextRange.start = nextCharSkip.openEndedInterval.start + currCharRangeStart - nextRange.end = nextCharSkip.openEndedInterval.start + currCharRangeEnd + searchRange.start = skip.openEndedInterval.start + bwt.l.Rank(c, searchRange.start) + searchRange.end = skip.openEndedInterval.start + bwt.l.Rank(c, searchRange.end) } - return nextRange.end - nextRange.start + return searchRange.end - searchRange.start } func (bwt BWT) lookupSkip(c byte) (entry skipEntry, ok bool) { @@ -52,6 +43,10 @@ func (bwt BWT) lookupSkip(c byte) (entry skipEntry, ok bool) { return skipEntry{}, false } +func (bwt BWT) getLenOfOriginalString() int { + return bwt.skipList[len(bwt.skipList)-1].openEndedInterval.end +} + type interval struct { start int end int @@ -77,6 +72,10 @@ func New(sequence string) BWT { currChar := sequence[getBWTIndex(len(sequence), len(prefixArray[i]))] lastColBuilder.WriteByte(currChar) } + fb := strings.Builder{} + for i := 0; i < len(prefixArray); i++ { + fb.WriteByte(prefixArray[i][0]) + } return BWT{ skipList: buildSkipList(prefixArray), diff --git a/bwt/bwt_test.go b/bwt/bwt_test.go index 10f76e17f..d9039bd87 100644 --- a/bwt/bwt_test.go +++ b/bwt/bwt_test.go @@ -1,6 +1,7 @@ package bwt import ( + "strings" "testing" ) @@ -9,21 +10,24 @@ type BWTCountTestCase struct { expected int } +const augmentedQuickBrownFoxTest = "thequickbrownfoxjumpsoverthelazydogwithanovertfrownafterfumblingitsparallelogramshapedbananagramallarounddowntown" + +var threeAugmentedQuickBrownFoxTest = strings.Join([]string{augmentedQuickBrownFoxTest, augmentedQuickBrownFoxTest, 
augmentedQuickBrownFoxTest}, "") + func TestBWT_Count(t *testing.T) { - bwt := New("BANANA") + bwt := New(threeAugmentedQuickBrownFoxTest) testTable := []BWTCountTestCase{ - {"NANA", 1}, - {"ANA", 2}, - {"NA", 2}, - {"B", 1}, - {"N", 2}, - {"BA", 1}, - {"ANANA", 1}, - {"QWERTY", 0}, - {"ANANANA", 0}, - {"ABCD", 0}, - {"ABA", 0}, + {"uick", 3}, + {"the", 6}, + {"over", 6}, + {"own", 12}, + {"ana", 6}, + {"an", 9}, + {"na", 9}, + {"rown", 6}, + {"townthe", 2}, + {"zzz", 0}, } for _, v := range testTable { diff --git a/bwt/rsa_bitvector.go b/bwt/rsa_bitvector.go index 9f7a4883b..5212fbbe7 100644 --- a/bwt/rsa_bitvector.go +++ b/bwt/rsa_bitvector.go @@ -6,6 +6,7 @@ import "math/bits" // TODO: clarks select type rsaBitVector struct { bv bitvector + totalOnesRank int jrc []chunk jrBitsPerChunk int jrBitsPerSubChunk int @@ -15,11 +16,12 @@ type rsaBitVector struct { // TODO: talk about why bv should never be modidifed after building the RSA bit vector func newRSABitVectorFromBitVector(bv bitvector) rsaBitVector { - jacobsonRankChunks, jrBitsPerChunk, jrBitsPerSubChunk := buildJacobsonRank(bv) + jacobsonRankChunks, jrBitsPerChunk, jrBitsPerSubChunk, totalOnesRank := buildJacobsonRank(bv) ones, zeros := buildSelectMaps(bv) return rsaBitVector{ bv: bv, + totalOnesRank: totalOnesRank, jrc: jacobsonRankChunks, jrBitsPerChunk: jrBitsPerChunk, jrBitsPerSubChunk: jrBitsPerSubChunk, @@ -29,25 +31,42 @@ func newRSABitVectorFromBitVector(bv bitvector) rsaBitVector { } func (rsa rsaBitVector) Rank(val bool, i int) int { - chunkPos := (i / rsa.jrBitsPerChunk) - chunk := rsa.jrc[chunkPos] - - subChunkPos := (i % rsa.jrBitsPerChunk) / rsa.jrBitsPerSubChunk - subChunk := chunk.subChunks[subChunkPos] - - bitOffset := i % rsa.jrBitsPerSubChunk - - bitSet := rsa.bv.getBitSet(chunkPos*len(rsa.jrc) + subChunkPos) - - shiftRightAmount := uint64(rsa.jrBitsPerSubChunk - bitOffset) + c := 0 + for j := 0; j < i; j++ { + if rsa.bv.getBit(j) { + c++ + } + } if val { - remaining := bitSet >> 
shiftRightAmount - return chunk.onesCumulativeRank + subChunk.onesCumulativeRank + bits.OnesCount64(remaining) + return c } - remaining := ^bitSet >> shiftRightAmount - - // cumulative ranks for 0 should just be the sum of the compliment of cumulative ranks for 1 - return (chunkPos*rsa.jrBitsPerChunk - chunk.onesCumulativeRank) + (subChunkPos*rsa.jrBitsPerSubChunk - subChunk.onesCumulativeRank) + bits.OnesCount64(remaining) + return i - c + // if i > rsa.bv.len()-1 { + // if val { + // return rsa.totalOnesRank + // } + // return rsa.bv.len() - rsa.totalOnesRank + // } + // + // chunkPos := (i / rsa.jrBitsPerChunk) + // chunk := rsa.jrc[chunkPos] + // + // subChunkPos := (i % rsa.jrBitsPerChunk) / rsa.jrBitsPerSubChunk + // subChunk := chunk.subChunks[subChunkPos] + // + // bitOffset := i % rsa.jrBitsPerSubChunk + // + // bitSet := rsa.bv.getBitSet(chunkPos*len(rsa.jrc) + subChunkPos) + // + // shiftRightAmount := uint64(rsa.jrBitsPerSubChunk - bitOffset) + // if val { + // remaining := bitSet >> shiftRightAmount + // return chunk.onesCumulativeRank + subChunk.onesCumulativeRank + bits.OnesCount64(remaining) + // } + // remaining := ^bitSet >> shiftRightAmount + // + // // cumulative ranks for 0 should just be the sum of the compliment of cumulative ranks for 1 + // return (chunkPos*rsa.jrBitsPerChunk - chunk.onesCumulativeRank) + (subChunkPos*rsa.jrBitsPerSubChunk - subChunk.onesCumulativeRank) + bits.OnesCount64(remaining) } func (rsa rsaBitVector) Select(val bool, rank int) (i int, ok bool) { @@ -74,10 +93,11 @@ type subChunk struct { } // TODO: talk about easy to read instead vs perf -func buildJacobsonRank(inBv bitvector) (jacobsonRankChunks []chunk, numOfSubChunksPerChunk, numOfBitsPerSubChunk int) { +func buildJacobsonRank(inBv bitvector) (jacobsonRankChunks []chunk, numOfSubChunksPerChunk, numOfBitsPerSubChunk, totalRank int) { // TODO: talk about why this is probably good enough, improves as n grows, gets worse as n gets smaller, and how this fits into a 
machine instruction, and how this is "simple" numOfSubChunksPerChunk = 4 + totalRank = 0 chunkCumulativeRank := 0 subChunkCumulativeRank := 0 @@ -100,6 +120,7 @@ func buildJacobsonRank(inBv bitvector) (jacobsonRankChunks []chunk, numOfSubChun onesCount := bits.OnesCount64(inBv.getBitSet(i)) subChunkCumulativeRank += onesCount + totalRank += onesCount } if currSubChunks != nil { @@ -109,7 +130,7 @@ func buildJacobsonRank(inBv bitvector) (jacobsonRankChunks []chunk, numOfSubChun }) } - return jacobsonRankChunks, numOfSubChunksPerChunk * wordSize, wordSize + return jacobsonRankChunks, numOfSubChunksPerChunk * wordSize, wordSize, totalRank } // TODO: talk about how this could be improved memory wise. Talk about how clarks select exists, but keeping it "simple for now" but maybe worth diff --git a/bwt/rsa_bitvector_test.go b/bwt/rsa_bitvector_test.go index 75cdd672e..0a8f98809 100644 --- a/bwt/rsa_bitvector_test.go +++ b/bwt/rsa_bitvector_test.go @@ -1,6 +1,8 @@ package bwt -import "testing" +import ( + "testing" +) type rsaRankTestCase struct { val bool @@ -15,14 +17,11 @@ func TestRSARank_singlePartialChunk(t *testing.T) { bitsToTruncate := 22 initialNumberOfBits := wordSize*2 - bitsToTruncate - bv := newBitVector(initialNumberOfBits) - bv.bits = []uint64{ + rsa := newTestRSAFromWords(initialNumberOfBits, 0xffffffff00000000, 0x00000000ffc00000, - } - - rsa := newRSABitVectorFromBitVector(bv) + ) testCases := []rsaRankTestCase{ {true, 0, 0}, {false, 0, 0}, @@ -43,22 +42,61 @@ func TestRSARank_singlePartialChunk(t *testing.T) { } -func TestRSARank_singleCompleteChunk(t *testing.T) { - if wordSize != 64 { - t.Skip() +func TestRSARank_singleCompleteChunk_PastBounds_Ones(t *testing.T) { + rsa := newTestRSAFromWords(64*4, + 0x0000000000000000, + 0xffffffffffffffff, + 0x0000000000000000, + 0xffffffffffffffff, + ) + + testCases := []rsaRankTestCase{ + {true, 0, 0}, {false, 0, 0}, + {true, 255, 127}, {false, 255, 128}, + {true, 256, 128}, {false, 256, 128}, } + for _, tc := 
range testCases { + rank := rsa.Rank(tc.val, tc.bitPosition) + if rank != tc.expectedRank { + t.Fatalf("expected rank(%t, %d) to be %d but got %d", tc.val, tc.bitPosition, tc.expectedRank, rank) + } + } + +} + +func TestRSARank_singleCompleteChunk_PastBounds_Zeros(t *testing.T) { + rsa := newTestRSAFromWords(64*4, + 0xffffffffffffffff, + 0x0000000000000000, + 0xffffffffffffffff, + 0x0000000000000000, + ) + + testCases := []rsaRankTestCase{ + {true, 0, 0}, {false, 0, 0}, + {true, 255, 128}, {false, 255, 127}, + {true, 256, 128}, {false, 256, 128}, + } + + for _, tc := range testCases { + rank := rsa.Rank(tc.val, tc.bitPosition) + if rank != tc.expectedRank { + t.Fatalf("expected rank(%t, %d) to be %d but got %d", tc.val, tc.bitPosition, tc.expectedRank, rank) + } + } + +} + +func TestRSARank_singleCompleteChunk(t *testing.T) { initialNumberOfBits := wordSize * 4 - bv := newBitVector(initialNumberOfBits) - bv.bits = []uint64{ + rsa := newTestRSAFromWords(initialNumberOfBits, 0x8000000000000001, 0xff0f30fffacea80d, 0x90e0a0e0b0e0cf0c, 0x3d0f064f7206f717, - } - - rsa := newRSABitVectorFromBitVector(bv) + ) testCases := []rsaRankTestCase{ {true, 0, 0}, {false, 0, 0}, @@ -128,9 +166,7 @@ func TestRSARank_singleCompleteChunk(t *testing.T) { func TestRSARank_multipleChunks(t *testing.T) { numBitsToTruncate := 17 initialNumberOfBits := wordSize*15 - numBitsToTruncate - bv := newBitVector(initialNumberOfBits) - - bv.bits = []uint64{ + rsa := newTestRSAFromWords(initialNumberOfBits, 0x0000000000000000, 0xffffffffffffffff, 0x0000000000000000, @@ -149,9 +185,7 @@ func TestRSARank_multipleChunks(t *testing.T) { 0xffffffffffffffff, 0x0000000000000000, 0xffffffffffffffff, // this should end up getting truncated - } - - rsa := newRSABitVectorFromBitVector(bv) + ) testCases := []rsaRankTestCase{ {true, 0, 0}, {false, 0, 0}, @@ -194,16 +228,12 @@ type rsaSelectTestCase struct { func TestRSASelect(t *testing.T) { bitsToTruncate := 17 initialNumberOfBits := wordSize*4 - bitsToTruncate 
- bv := newBitVector(initialNumberOfBits) - - bv.bits = []uint64{ + rsa := newTestRSAFromWords(initialNumberOfBits, 0x8010000000010000, 0xfff1ffffffffffff, 0x0000010000000000, 0xffffffffffffffff, - } - - rsa := newRSABitVectorFromBitVector(bv) + ) testCases := []rsaSelectTestCase{ {true, 0, 0}, @@ -253,16 +283,12 @@ func TestRSASelect(t *testing.T) { func TestRSASelect_notOk(t *testing.T) { bitsToTruncate := 17 initialNumberOfBits := wordSize*4 - bitsToTruncate - bv := newBitVector(initialNumberOfBits) - - bv.bits = []uint64{ + rsa := newTestRSAFromWords(initialNumberOfBits, 0x8010000000010000, 0xfff1ffffffffffff, 0x0000010000000000, 0xffffffffffffffff, - } - - rsa := newRSABitVectorFromBitVector(bv) + ) if _, ok := rsa.Select(true, -1); ok { t.Fatalf("expected select(true, -1) to be not ok but somehow returned a value") @@ -281,3 +307,19 @@ func TestRSASelect_notOk(t *testing.T) { t.Fatalf("expected select(true, 239) to be not ok but somehow returned a value") } } + +func newTestRSAFromWords(sizeInBits int, wordsToCopy ...uint64) rsaBitVector { + bv := newBitVector(sizeInBits) + for i := 0; i < len(wordsToCopy); i++ { + w := wordsToCopy[i] + for j := 0; j < 64; j++ { + if i*64+j == sizeInBits { + break + } + mask := uint64(1) << uint64(63-j%64) + bit := w&mask != 0 + bv.setBit(i*64+j, bit) + } + } + return newRSABitVectorFromBitVector(bv) +} diff --git a/bwt/wavelet_test.go b/bwt/wavelet_test.go index 76fd4e6bc..3bae6a0bf 100644 --- a/bwt/wavelet_test.go +++ b/bwt/wavelet_test.go @@ -1,6 +1,10 @@ package bwt -import "testing" +import ( + "fmt" + "strings" + "testing" +) type WaveletTreeAccessTestCase struct { pos int @@ -69,7 +73,7 @@ type WaveletTreeRankTestCase struct { expected int } -func TestWaveletTree_Rank(t *testing.T) { +func TestWaveletTree_Rank_Genomic(t *testing.T) { testStr := "AAAACCCCTTTTGGGG" + "ACTG" + "TGCA" + "TTAA" + "CCGG" + "GGGGTTTTCCCCAAAA" wt := NewWaveletTreeFromString(testStr) @@ -165,3 +169,26 @@ func TestWaveletTree_Select(t 
*testing.T) { } } } + +func TestWaveletTree_Access_Reconstruction(t *testing.T) { + enhancedQuickBrownFox := "the quick brown fox jumps over the lazy dog with an overt frown after fumbling its parallelogram shaped bananagram all around downtown" + enhancedQuickBrownFoxRepeated := strings.Join([]string{enhancedQuickBrownFox, enhancedQuickBrownFox, enhancedQuickBrownFox, enhancedQuickBrownFox, enhancedQuickBrownFox}, " ") + + testCases := []string{ + "the quick brown fox jumped over the lazy dog", + enhancedQuickBrownFox, + enhancedQuickBrownFoxRepeated, + } + + for _, str := range testCases { + wt := NewWaveletTreeFromString(str) + fmt.Println(len(str)) + actual := "" + for i := 0; i < len(str); i++ { + actual += string(wt.Access(i)) + } + if actual != str { + t.Fatalf("expected to rebuild:\n%s\nbut instead got:\n%s", str, actual) + } + } +} From 7468d73fa46f90526e3b3f59720bf6a226c9b017 Mon Sep 17 00:00:00 2001 From: Trenton Date: Tue, 5 Dec 2023 23:59:13 -0500 Subject: [PATCH 15/60] rsa fixes and refactors --- bwt/rsa_bitvector.go | 88 ++++++++++++++++++--------------------- bwt/rsa_bitvector_test.go | 51 ++++++++++++++++------- bwt/wavelet_test.go | 2 - 3 files changed, 76 insertions(+), 65 deletions(-) diff --git a/bwt/rsa_bitvector.go b/bwt/rsa_bitvector.go index 5212fbbe7..c33374b39 100644 --- a/bwt/rsa_bitvector.go +++ b/bwt/rsa_bitvector.go @@ -5,68 +5,60 @@ import "math/bits" // TODO: doc what rsa is, why these DSAs, and why we take in a bit vector // TODO: clarks select type rsaBitVector struct { - bv bitvector - totalOnesRank int - jrc []chunk - jrBitsPerChunk int - jrBitsPerSubChunk int - oneSelectMap map[int]int - zeroSelectMap map[int]int + bv bitvector + totalOnesRank int + jrc []chunk + jrSubChunksPerChunk int + jrBitsPerChunk int + jrBitsPerSubChunk int + oneSelectMap map[int]int + zeroSelectMap map[int]int } // TODO: talk about why bv should never be modidifed after building the RSA bit vector func newRSABitVectorFromBitVector(bv bitvector) 
rsaBitVector { - jacobsonRankChunks, jrBitsPerChunk, jrBitsPerSubChunk, totalOnesRank := buildJacobsonRank(bv) + jacobsonRankChunks, jrSubChunksPerChunk, jrBitsPerSubChunk, totalOnesRank := buildJacobsonRank(bv) ones, zeros := buildSelectMaps(bv) return rsaBitVector{ - bv: bv, - totalOnesRank: totalOnesRank, - jrc: jacobsonRankChunks, - jrBitsPerChunk: jrBitsPerChunk, - jrBitsPerSubChunk: jrBitsPerSubChunk, - oneSelectMap: ones, - zeroSelectMap: zeros, + bv: bv, + totalOnesRank: totalOnesRank, + jrc: jacobsonRankChunks, + jrSubChunksPerChunk: jrSubChunksPerChunk, + jrBitsPerChunk: jrSubChunksPerChunk * jrBitsPerSubChunk, + jrBitsPerSubChunk: jrBitsPerSubChunk, + oneSelectMap: ones, + zeroSelectMap: zeros, } } func (rsa rsaBitVector) Rank(val bool, i int) int { - c := 0 - for j := 0; j < i; j++ { - if rsa.bv.getBit(j) { - c++ + if i > rsa.bv.len()-1 { + if val { + return rsa.totalOnesRank } + return rsa.bv.len() - rsa.totalOnesRank } + + chunkPos := (i / rsa.jrBitsPerChunk) + chunk := rsa.jrc[chunkPos] + + subChunkPos := (i % rsa.jrBitsPerChunk) / rsa.jrBitsPerSubChunk + subChunk := chunk.subChunks[subChunkPos] + + bitOffset := i % rsa.jrBitsPerSubChunk + + bitSet := rsa.bv.getBitSet(chunkPos*rsa.jrSubChunksPerChunk + subChunkPos) + + shiftRightAmount := uint64(rsa.jrBitsPerSubChunk - bitOffset) if val { - return c + remaining := bitSet >> shiftRightAmount + return chunk.onesCumulativeRank + subChunk.onesCumulativeRank + bits.OnesCount64(remaining) } - return i - c - // if i > rsa.bv.len()-1 { - // if val { - // return rsa.totalOnesRank - // } - // return rsa.bv.len() - rsa.totalOnesRank - // } - // - // chunkPos := (i / rsa.jrBitsPerChunk) - // chunk := rsa.jrc[chunkPos] - // - // subChunkPos := (i % rsa.jrBitsPerChunk) / rsa.jrBitsPerSubChunk - // subChunk := chunk.subChunks[subChunkPos] - // - // bitOffset := i % rsa.jrBitsPerSubChunk - // - // bitSet := rsa.bv.getBitSet(chunkPos*len(rsa.jrc) + subChunkPos) - // - // shiftRightAmount := 
uint64(rsa.jrBitsPerSubChunk - bitOffset) - // if val { - // remaining := bitSet >> shiftRightAmount - // return chunk.onesCumulativeRank + subChunk.onesCumulativeRank + bits.OnesCount64(remaining) - // } - // remaining := ^bitSet >> shiftRightAmount - // - // // cumulative ranks for 0 should just be the sum of the compliment of cumulative ranks for 1 - // return (chunkPos*rsa.jrBitsPerChunk - chunk.onesCumulativeRank) + (subChunkPos*rsa.jrBitsPerSubChunk - subChunk.onesCumulativeRank) + bits.OnesCount64(remaining) + remaining := ^bitSet >> shiftRightAmount + + // cumulative ranks for 0 should just be the sum of the compliment of cumulative ranks for 1 + return (chunkPos*rsa.jrBitsPerChunk - chunk.onesCumulativeRank) + (subChunkPos*rsa.jrBitsPerSubChunk - subChunk.onesCumulativeRank) + bits.OnesCount64(remaining) } func (rsa rsaBitVector) Select(val bool, rank int) (i int, ok bool) { @@ -130,7 +122,7 @@ func buildJacobsonRank(inBv bitvector) (jacobsonRankChunks []chunk, numOfSubChun }) } - return jacobsonRankChunks, numOfSubChunksPerChunk * wordSize, wordSize, totalRank + return jacobsonRankChunks, numOfSubChunksPerChunk, wordSize, totalRank } // TODO: talk about how this could be improved memory wise. 
Talk about how clarks select exists, but keeping it "simple for now" but maybe worth diff --git a/bwt/rsa_bitvector_test.go b/bwt/rsa_bitvector_test.go index 0a8f98809..a2c55b7b3 100644 --- a/bwt/rsa_bitvector_test.go +++ b/bwt/rsa_bitvector_test.go @@ -164,27 +164,50 @@ func TestRSARank_singleCompleteChunk(t *testing.T) { } func TestRSARank_multipleChunks(t *testing.T) { - numBitsToTruncate := 17 - initialNumberOfBits := wordSize*15 - numBitsToTruncate - rsa := newTestRSAFromWords(initialNumberOfBits, + rsa := newTestRSAFromWords((8*4+3)*64, + 0x0000000000000000, + 0xffffffffffffffff, + 0x0000000000000000, + 0xffffffffffffffff, + + 0xffffffffffffffff, + 0x0000000000000000, + 0xffffffffffffffff, + 0x0000000000000000, + + 0x0000000000000000, + 0xffffffffffffffff, + 0x0000000000000000, + 0xffffffffffffffff, + + 0xffffffffffffffff, + 0x0000000000000000, + 0xffffffffffffffff, 0x0000000000000000, + 0xffffffffffffffff, 0x0000000000000000, 0xffffffffffffffff, + 0x0000000000000000, 0xffffffffffffffff, 0x0000000000000000, 0xffffffffffffffff, 0x0000000000000000, + 0xffffffffffffffff, 0x0000000000000000, 0xffffffffffffffff, 0x0000000000000000, + 0xffffffffffffffff, + 0x0000000000000000, + 0xffffffffffffffff, + 0x0000000000000000, 0xffffffffffffffff, 0x0000000000000000, - 0xffffffffffffffff, // this should end up getting truncated + 0xffffffffffffffff, ) testCases := []rsaRankTestCase{ @@ -207,7 +230,10 @@ func TestRSARank_multipleChunks(t *testing.T) { {true, 832, 448}, {false, 832, 384}, {true, 896, 448}, {false, 896, 448}, - {true, 896 + wordSize - numBitsToTruncate - 1, 448 + wordSize - numBitsToTruncate - 1}, {false, 896 + wordSize - numBitsToTruncate - 1, 448}, + + {true, 1024, 512}, {false, 1024, 512}, + + {true, 2048, 1024}, {false, 2048, 1024}, } for _, tc := range testCases { @@ -310,16 +336,11 @@ func TestRSASelect_notOk(t *testing.T) { func newTestRSAFromWords(sizeInBits int, wordsToCopy ...uint64) rsaBitVector { bv := newBitVector(sizeInBits) - for i := 0; i < 
len(wordsToCopy); i++ { - w := wordsToCopy[i] - for j := 0; j < 64; j++ { - if i*64+j == sizeInBits { - break - } - mask := uint64(1) << uint64(63-j%64) - bit := w&mask != 0 - bv.setBit(i*64+j, bit) - } + for i := 0; i < sizeInBits; i++ { + w := wordsToCopy[i/64] + mask := uint64(1) << uint64(63-i%64) + bit := w&mask != 0 + bv.setBit(i, bit) } return newRSABitVectorFromBitVector(bv) } diff --git a/bwt/wavelet_test.go b/bwt/wavelet_test.go index 3bae6a0bf..f027809f7 100644 --- a/bwt/wavelet_test.go +++ b/bwt/wavelet_test.go @@ -1,7 +1,6 @@ package bwt import ( - "fmt" "strings" "testing" ) @@ -182,7 +181,6 @@ func TestWaveletTree_Access_Reconstruction(t *testing.T) { for _, str := range testCases { wt := NewWaveletTreeFromString(str) - fmt.Println(len(str)) actual := "" for i := 0; i < len(str); i++ { actual += string(wt.Access(i)) From 2d09fa9ef66e139f65c491ff1d95075913f593fa Mon Sep 17 00:00:00 2001 From: Trenton Date: Wed, 6 Dec 2023 01:11:01 -0500 Subject: [PATCH 16/60] bwt locate --- bwt/bwt.go | 39 +++++++++++++++++++++++++++++++++------ bwt/bwt_test.go | 49 ++++++++++++++++++++++++++++++++++++++++++++----- 2 files changed, 77 insertions(+), 11 deletions(-) diff --git a/bwt/bwt.go b/bwt/bwt.go index 519180e57..45875b055 100644 --- a/bwt/bwt.go +++ b/bwt/bwt.go @@ -13,25 +13,48 @@ const nullChar = "$" // allows for sub sequence querying. 
type BWT struct { skipList []skipEntry - l waveletTree + // TODO: talk about how we would want to remove this in favor of a RLFM and or r-index + l waveletTree + // TODO: Talk about how we can cut way down on memory usage by sampling this in a specific way with the RLFM and or r-index + suffixArray []int } func (bwt BWT) Count(pattern string) int { + searchRange := bwt.lfSearch(pattern) + return searchRange.end - searchRange.start +} + +func (bwt BWT) Locate(pattern string) []int { + searchRange := bwt.lfSearch(pattern) + if searchRange.start >= searchRange.end { + return nil + } + + numOfOffsets := searchRange.end - searchRange.start + offsets := make([]int, numOfOffsets) + for i := 0; i < numOfOffsets; i++ { + offsets[i] = bwt.suffixArray[searchRange.start+i] + } + + return offsets +} + +func (bwt BWT) lfSearch(pattern string) interval { searchRange := interval{start: 0, end: bwt.getLenOfOriginalString()} for i := 0; i < len(pattern); i++ { if searchRange.end-searchRange.start <= 0 { - return 0 + return interval{} } c := pattern[len(pattern)-1-i] skip, ok := bwt.lookupSkip(c) if !ok { - return 0 + return interval{} } searchRange.start = skip.openEndedInterval.start + bwt.l.Rank(c, searchRange.start) searchRange.end = skip.openEndedInterval.start + bwt.l.Rank(c, searchRange.end) } - return searchRange.end - searchRange.start + return searchRange } func (bwt BWT) lookupSkip(c byte) (entry skipEntry, ok bool) { @@ -67,10 +90,13 @@ func New(sequence string) BWT { slices.Sort(prefixArray) + suffixArray := make([]int, len(sequence)) lastColBuilder := strings.Builder{} for i := 0; i < len(prefixArray); i++ { currChar := sequence[getBWTIndex(len(sequence), len(prefixArray[i]))] lastColBuilder.WriteByte(currChar) + + suffixArray[i] = len(sequence) - len(prefixArray[i]) } fb := strings.Builder{} for i := 0; i < len(prefixArray); i++ { @@ -78,8 +104,9 @@ func New(sequence string) BWT { } return BWT{ - skipList: buildSkipList(prefixArray), - l: 
NewWaveletTreeFromString(lastColBuilder.String()), + skipList: buildSkipList(prefixArray), + l: NewWaveletTreeFromString(lastColBuilder.String()), + suffixArray: suffixArray, } } diff --git a/bwt/bwt_test.go b/bwt/bwt_test.go index d9039bd87..fd3d2cda7 100644 --- a/bwt/bwt_test.go +++ b/bwt/bwt_test.go @@ -3,6 +3,8 @@ package bwt import ( "strings" "testing" + + "golang.org/x/exp/slices" ) type BWTCountTestCase struct { @@ -10,12 +12,11 @@ type BWTCountTestCase struct { expected int } -const augmentedQuickBrownFoxTest = "thequickbrownfoxjumpsoverthelazydogwithanovertfrownafterfumblingitsparallelogramshapedbananagramallarounddowntown" - -var threeAugmentedQuickBrownFoxTest = strings.Join([]string{augmentedQuickBrownFoxTest, augmentedQuickBrownFoxTest, augmentedQuickBrownFoxTest}, "") - func TestBWT_Count(t *testing.T) { - bwt := New(threeAugmentedQuickBrownFoxTest) + baseTestStr := "thequickbrownfoxjumpsoverthelazydogwithanovertfrownafterfumblingitsparallelogramshapedbananagramallarounddowntown" + testStr := strings.Join([]string{baseTestStr, baseTestStr, baseTestStr}, "") + + bwt := New(testStr) testTable := []BWTCountTestCase{ {"uick", 3}, @@ -38,6 +39,44 @@ func TestBWT_Count(t *testing.T) { } } +type BWTLocateTestCase struct { + seq string + expected []int +} + +func TestBWT_Locate(t *testing.T) { + baseTestStr := "thequickbrownfoxjumpsoverthelazydogwithanovertfrownafterfumblingitsparallelogramshapedbananagramallarounddowntown" // len == 112 + testStr := strings.Join([]string{baseTestStr, baseTestStr, baseTestStr}, "") + + bwt := New(testStr) + + testTable := []BWTLocateTestCase{ + {"uick", []int{4, 117, 230}}, + {"the", []int{0, 25, 113, 138, 226, 251}}, + {"over", []int{21, 41, 134, 154, 247, 267}}, + {"own", []int{10, 48, 106, 110, 123, 161, 219, 223, 236, 274, 332, 336}}, + {"ana", []int{87, 89, 200, 202, 313, 315}}, + {"an", []int{39, 87, 89, 152, 200, 202, 265, 313, 315}}, + {"na", []int{50, 88, 90, 163, 201, 203, 276, 314, 316}}, + {"rown", []int{9, 47, 
122, 160, 235, 273}}, + {"townthe", []int{109, 222}}, + {"zzz", nil}, + } + + for _, v := range testTable { + offsets := bwt.Locate(v.seq) + slices.Sort(offsets) + if len(offsets) != len(v.expected) { + t.Fatalf("seq=%s expectedOffsets=%v actualOffsets=%v", v.seq, v.expected, offsets) + } + for i := range offsets { + if offsets[i] != v.expected[i] { + t.Fatalf("seq=%s expectedOffsets=%v actualOffsets=%v", v.seq, v.expected, offsets) + } + } + } +} + func BenchmarkBWTBuildPower12(b *testing.B) { base := "!BANANA!" BaseBenchmarkBWTBuild(base, 12, b) From bb3cf93479773cb3660695c74746fef739714b49 Mon Sep 17 00:00:00 2001 From: Trenton Date: Wed, 6 Dec 2023 01:46:41 -0500 Subject: [PATCH 17/60] extract --- bwt/bwt.go | 37 +++++++++++++++++-- bwt/bwt_test.go | 96 +++++++++++++++++++++++++++++++++++-------------- 2 files changed, 105 insertions(+), 28 deletions(-) diff --git a/bwt/bwt.go b/bwt/bwt.go index 45875b055..c17716d66 100644 --- a/bwt/bwt.go +++ b/bwt/bwt.go @@ -39,6 +39,30 @@ func (bwt BWT) Locate(pattern string) []int { return offsets } +// TODO: do we want to ignore the $? 
+func (bwt BWT) Extract(start, end int) string { + if end > bwt.getLenOfOriginalString() { + panic("figure out what we want to do here") + } + + strB := strings.Builder{} + for i := start; i < end; i++ { + fPos := bwt.reverseCharacterLookup(i) + skip := bwt.lookupSkipByOffset(fPos) + strB.WriteByte(skip.char) + } + return strB.String() +} + +func (bwt BWT) reverseCharacterLookup(originalPos int) int { + for i := range bwt.suffixArray { + if bwt.suffixArray[i] == originalPos { + return i + } + } + panic("figure out what to do here") +} + func (bwt BWT) lfSearch(pattern string) interval { searchRange := interval{start: 0, end: bwt.getLenOfOriginalString()} for i := 0; i < len(pattern); i++ { @@ -47,7 +71,7 @@ func (bwt BWT) lfSearch(pattern string) interval { } c := pattern[len(pattern)-1-i] - skip, ok := bwt.lookupSkip(c) + skip, ok := bwt.lookupSkipByChar(c) if !ok { return interval{} } @@ -57,7 +81,7 @@ func (bwt BWT) lfSearch(pattern string) interval { return searchRange } -func (bwt BWT) lookupSkip(c byte) (entry skipEntry, ok bool) { +func (bwt BWT) lookupSkipByChar(c byte) (entry skipEntry, ok bool) { for i := range bwt.skipList { if bwt.skipList[i].char == c { return bwt.skipList[i], true @@ -66,6 +90,15 @@ func (bwt BWT) lookupSkip(c byte) (entry skipEntry, ok bool) { return skipEntry{}, false } +func (bwt BWT) lookupSkipByOffset(offset int) skipEntry { + for i := range bwt.skipList { + if bwt.skipList[i].openEndedInterval.start <= offset && offset < bwt.skipList[i].openEndedInterval.end { + return bwt.skipList[i] + } + } + panic("figure out what to do here") +} + func (bwt BWT) getLenOfOriginalString() int { return bwt.skipList[len(bwt.skipList)-1].openEndedInterval.end } diff --git a/bwt/bwt_test.go b/bwt/bwt_test.go index fd3d2cda7..ca58becad 100644 --- a/bwt/bwt_test.go +++ b/bwt/bwt_test.go @@ -77,36 +77,80 @@ func TestBWT_Locate(t *testing.T) { } } -func BenchmarkBWTBuildPower12(b *testing.B) { - base := "!BANANA!" 
- BaseBenchmarkBWTBuild(base, 12, b) +type BWTExtractTestCase struct { + start int + end int + expected string } -//go:noinline -func BaseBenchmarkBWTBuild(base string, power int, b *testing.B) { - for n := 0; n < b.N; n++ { - buildBWTForBench(base, power) - } -} - -func buildBWTForBench(base string, power int) BWT { - test := base - for i := 0; i < power; i++ { - test += test - } +func TestBWT_Extract(t *testing.T) { + baseTestStr := "thequickbrownfoxjumpsoverthelazydogwithanovertfrownafterfumblingitsparallelogramshapedbananagramallarounddowntown" // len == 112 + testStr := strings.Join([]string{baseTestStr, baseTestStr, baseTestStr}, "") - return New(test) -} + bwt := New(testStr) -func BenchmarkBWTQueryPower12(b *testing.B) { - base := "!BANANA!" - bwt := buildBWTForBench(base, 12) - BaseBenchmarkBWTQuery(bwt, "ANANABANANA", b) -} + testTable := []BWTExtractTestCase{ + {4, 8, "uick"}, + {117, 121, "uick"}, + {230, 234, "uick"}, + {0, 3, "the"}, + {25, 28, "the"}, + {113, 116, "the"}, + {138, 141, "the"}, + {226, 229, "the"}, + {251, 254, "the"}, + {21, 25, "over"}, + {41, 45, "over"}, + {134, 138, "over"}, + {154, 158, "over"}, + {247, 251, "over"}, + {267, 271, "over"}, + {10, 13, "own"}, + {48, 51, "own"}, + {106, 109, "own"}, + {123, 126, "own"}, + {161, 164, "own"}, + {219, 222, "own"}, + {223, 226, "own"}, + {236, 239, "own"}, + {274, 277, "own"}, + {332, 335, "own"}, + {336, 339, "own"}, + {87, 90, "ana"}, + {89, 92, "ana"}, + {200, 203, "ana"}, + {202, 205, "ana"}, + {313, 316, "ana"}, + {315, 318, "ana"}, + {39, 41, "an"}, + {87, 89, "an"}, + {152, 154, "an"}, + {200, 202, "an"}, + {202, 204, "an"}, + {265, 267, "an"}, + {313, 315, "an"}, + {50, 52, "na"}, + {88, 90, "na"}, + {163, 165, "na"}, + {201, 203, "na"}, + {203, 205, "na"}, + {276, 278, "na"}, + {314, 316, "na"}, + {316, 318, "na"}, + {9, 13, "rown"}, + {47, 51, "rown"}, + {122, 126, "rown"}, + {160, 164, "rown"}, + {235, 239, "rown"}, + {273, 277, "rown"}, + {109, 116, "townthe"}, + {222, 229, 
"townthe"}, + } -//go:noinline -func BaseBenchmarkBWTQuery(bwt BWT, seq string, b *testing.B) { - for n := 0; n < b.N; n++ { - bwt.Count(seq) + for _, v := range testTable { + str := bwt.Extract(v.start, v.end) + if str != v.expected { + t.Fatalf("extractRange=(%d, %d) expected=%s actual=%s", v.start, v.end, v.expected, str) + } } } From 249867aedb7a67ca715930735fa5d2b466efc7f0 Mon Sep 17 00:00:00 2001 From: Trenton Date: Wed, 6 Dec 2023 02:14:26 -0500 Subject: [PATCH 18/60] add 1 more test for reconstruction --- bwt/wavelet_test.go | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/bwt/wavelet_test.go b/bwt/wavelet_test.go index f027809f7..e6d00745c 100644 --- a/bwt/wavelet_test.go +++ b/bwt/wavelet_test.go @@ -172,11 +172,16 @@ func TestWaveletTree_Select(t *testing.T) { func TestWaveletTree_Access_Reconstruction(t *testing.T) { enhancedQuickBrownFox := "the quick brown fox jumps over the lazy dog with an overt frown after fumbling its parallelogram shaped bananagram all around downtown" enhancedQuickBrownFoxRepeated := strings.Join([]string{enhancedQuickBrownFox, enhancedQuickBrownFox, enhancedQuickBrownFox, enhancedQuickBrownFox, enhancedQuickBrownFox}, " ") + enhancedQuickBrownFoxSuperLarge := "" + for i := 0; i < 100; i++ { + enhancedQuickBrownFoxSuperLarge += enhancedQuickBrownFoxRepeated + } testCases := []string{ "the quick brown fox jumped over the lazy dog", enhancedQuickBrownFox, enhancedQuickBrownFoxRepeated, + enhancedQuickBrownFoxSuperLarge, } for _, str := range testCases { From 9aea859c633a6af6e0438d133282c3758ff1af32 Mon Sep 17 00:00:00 2001 From: Trenton Date: Wed, 6 Dec 2023 23:09:21 -0500 Subject: [PATCH 19/60] doc BWT, refactor, and return a possible error during construction --- bwt/bwt.go | 114 +++++++++++++++++++++++++++++++++++++----------- bwt/bwt_test.go | 15 +++++-- 2 files changed, 100 insertions(+), 29 deletions(-) diff --git a/bwt/bwt.go b/bwt/bwt.go index c17716d66..30810b4b5 100644 --- a/bwt/bwt.go +++ b/bwt/bwt.go @@ -1,6 +1,7 
@@ package bwt import ( + "fmt" "strings" "golang.org/x/exp/slices" @@ -9,21 +10,43 @@ import ( const nullChar = "$" // BWT Burrow Wheeler Transform -// Data structure that compactly represents any sequence of characters and -// allows for sub sequence querying. +// Compresses and Indexes a given sequence so that it can be +// be used for search, alignment, and text extraction. This is +// useful for sequences so large that it would be beneficial +// to reduce its memory footprint while also maintaining a way +// to analyze and work with the sequence. type BWT struct { - skipList []skipEntry - // TODO: talk about how we would want to remove this in favor of a RLFM and or r-index - l waveletTree - // TODO: Talk about how we can cut way down on memory usage by sampling this in a specific way with the RLFM and or r-index + // firstColumnSkipList is the first column of the BWT. It is + // represented as a list of skipEntries because the first column of + // the BWT is always lexographically ordered. This saves time and memory. + firstColumnSkipList []skipEntry + // lastCoulmn last column of the BWT- the actual textual representation + // of the BWT. + lastCoulmn waveletTree + // suffixArray an array that allows us to map a posistion in the first + // column to a position in the original sequence. This is needed to be + // able to extract text from the BWT. suffixArray []int } +// Count represents the number of times the provided pattern +// shows up in the original sequence. func (bwt BWT) Count(pattern string) int { searchRange := bwt.lfSearch(pattern) return searchRange.end - searchRange.start } +// Locate returns a list of offsets at which the begging +// of the provided pattern occurrs in the original +// sequence. +// TODO: these are the offsets of the BWT where we added +// the nullChar. So these offsets don't acutally pertain +// to the original string, but the indexed BWT string. Do +// we just want to call that out in documentation? 
If the user +// wants to extract that pattern later on as if it were from +// the origianl unmodified sequence, then the offsets would be +// off by one. With that beign said, maybe they should be using +// Extract to do that? func (bwt BWT) Locate(pattern string) []int { searchRange := bwt.lfSearch(pattern) if searchRange.start >= searchRange.end { @@ -39,30 +62,44 @@ func (bwt BWT) Locate(pattern string) []int { return offsets } -// TODO: do we want to ignore the $? +// Extract this allows us to extract parts of the original +// sequence from the BWT. +// start is the begging of the range of text to extract inclusive. +// end is the end of the range of text to extract exclusive. +// If either start or end are out of bounds, Extract will panic. func (bwt BWT) Extract(start, end int) string { if end > bwt.getLenOfOriginalString() { - panic("figure out what we want to do here") + msg := fmt.Sprintf("end [%d] exceeds the max range of the BWT [%d]", end, bwt.getLenOfOriginalString()) + panic(msg) + } + if start < 0 { + msg := fmt.Sprintf("start [%d] exceeds the min range of the BWT [0]", start) + panic(msg) } strB := strings.Builder{} for i := start; i < end; i++ { - fPos := bwt.reverseCharacterLookup(i) + fPos := bwt.getFCharPosFromOriginalSequenceCharPos(i) skip := bwt.lookupSkipByOffset(fPos) strB.WriteByte(skip.char) } return strB.String() } -func (bwt BWT) reverseCharacterLookup(originalPos int) int { +// getFCharPosFromOriginalSequenceCharPos looks up mapping from the original position +// of the sequence to its corresponding posisiton in the First Column of the BWT +func (bwt BWT) getFCharPosFromOriginalSequenceCharPos(originalPos int) int { for i := range bwt.suffixArray { if bwt.suffixArray[i] == originalPos { return i } } - panic("figure out what to do here") + panic("Unable to find the corresponding orginal positiong for a character in the original sequence in the suffix array. 
This should not be possible and indicates a malformed BWT.") } +// lfSearch LF Search- Last First Search. +// Finds the valid range within the BWT index where the provided pattern is possible. +// If the final range is <= 0, then the pattern does not exist in the original sequence. func (bwt BWT) lfSearch(pattern string) interval { searchRange := interval{start: 0, end: bwt.getLenOfOriginalString()} for i := 0; i < len(pattern); i++ { @@ -75,32 +112,44 @@ func (bwt BWT) lfSearch(pattern string) interval { if !ok { return interval{} } - searchRange.start = skip.openEndedInterval.start + bwt.l.Rank(c, searchRange.start) - searchRange.end = skip.openEndedInterval.start + bwt.l.Rank(c, searchRange.end) + searchRange.start = skip.openEndedInterval.start + bwt.lastCoulmn.Rank(c, searchRange.start) + searchRange.end = skip.openEndedInterval.start + bwt.lastCoulmn.Rank(c, searchRange.end) } return searchRange } +// lookupSkipByChar looks up a skipEntry by its character in the First Coulmn func (bwt BWT) lookupSkipByChar(c byte) (entry skipEntry, ok bool) { - for i := range bwt.skipList { - if bwt.skipList[i].char == c { - return bwt.skipList[i], true + for i := range bwt.firstColumnSkipList { + if bwt.firstColumnSkipList[i].char == c { + return bwt.firstColumnSkipList[i], true } } return skipEntry{}, false } +// lookupSkipByOffset looks up a skipEntry based off of an +// offset of the Fist Coulmn of the BWT. 
func (bwt BWT) lookupSkipByOffset(offset int) skipEntry { - for i := range bwt.skipList { - if bwt.skipList[i].openEndedInterval.start <= offset && offset < bwt.skipList[i].openEndedInterval.end { - return bwt.skipList[i] + if offset > bwt.getLenOfOriginalString()-1 { + msg := fmt.Sprintf("offset [%d] exceeds the max bound of the BWT [%d]", offset, bwt.getLenOfOriginalString()-1) + panic(msg) + } + if offset < 0 { + msg := fmt.Sprintf("offset [%d] exceeds the min bound of the BWT [0]", offset) + panic(msg) + } + + for i := range bwt.firstColumnSkipList { + if bwt.firstColumnSkipList[i].openEndedInterval.start <= offset && offset < bwt.firstColumnSkipList[i].openEndedInterval.end { + return bwt.firstColumnSkipList[i] } } panic("figure out what to do here") } func (bwt BWT) getLenOfOriginalString() int { - return bwt.skipList[len(bwt.skipList)-1].openEndedInterval.end + return bwt.firstColumnSkipList[len(bwt.firstColumnSkipList)-1].openEndedInterval.end } type interval struct { @@ -109,11 +158,20 @@ type interval struct { } type skipEntry struct { - char byte + char byte + // openEndedInterval start is inclusive and end is exclusive openEndedInterval interval } -func New(sequence string) BWT { +// New returns a BWT of the provided sequence +// The provided sequence must not contain the nullChar +// defined in this package. If it does, New will return +// an error. +func New(sequence string) (BWT, error) { + if strings.Contains(sequence, nullChar) { + return BWT{}, fmt.Errorf("Provided sequence contains the nullChar %s. 
BWT cannot be constructed", nullChar) + } + sequence += nullChar prefixArray := make([]string, len(sequence)) @@ -137,12 +195,13 @@ func New(sequence string) BWT { } return BWT{ - skipList: buildSkipList(prefixArray), - l: NewWaveletTreeFromString(lastColBuilder.String()), - suffixArray: suffixArray, - } + firstColumnSkipList: buildSkipList(prefixArray), + lastCoulmn: NewWaveletTreeFromString(lastColBuilder.String()), + suffixArray: suffixArray, + }, nil } +// buildSkipList compressed the First Column of the BWT into a skip list func buildSkipList(prefixArray []string) []skipEntry { prevChar := prefixArray[0][0] skipList := []skipEntry{{char: prevChar, openEndedInterval: interval{start: 0}}} @@ -161,6 +220,9 @@ func buildSkipList(prefixArray []string) []skipEntry { return skipList } +// getBWTIndex returns the position of the character from the sequence used to build the BWT +// that corresponds the last character that would exist in the entry of the prefixArray that +// would be the last character if we were actually doing full rotations func getBWTIndex(lenOfSequenceBeingBuilt, lenOfSuffixArrayVisited int) int { bwtCharIndex := lenOfSequenceBeingBuilt - lenOfSuffixArrayVisited - 1 if bwtCharIndex == -1 { diff --git a/bwt/bwt_test.go b/bwt/bwt_test.go index ca58becad..d87242386 100644 --- a/bwt/bwt_test.go +++ b/bwt/bwt_test.go @@ -16,7 +16,10 @@ func TestBWT_Count(t *testing.T) { baseTestStr := "thequickbrownfoxjumpsoverthelazydogwithanovertfrownafterfumblingitsparallelogramshapedbananagramallarounddowntown" testStr := strings.Join([]string{baseTestStr, baseTestStr, baseTestStr}, "") - bwt := New(testStr) + bwt, err := New(testStr) + if err != nil { + t.Fatal(err) + } testTable := []BWTCountTestCase{ {"uick", 3}, @@ -48,7 +51,10 @@ func TestBWT_Locate(t *testing.T) { baseTestStr := "thequickbrownfoxjumpsoverthelazydogwithanovertfrownafterfumblingitsparallelogramshapedbananagramallarounddowntown" // len == 112 testStr := strings.Join([]string{baseTestStr, 
baseTestStr, baseTestStr}, "") - bwt := New(testStr) + bwt, err := New(testStr) + if err != nil { + t.Fatal(err) + } testTable := []BWTLocateTestCase{ {"uick", []int{4, 117, 230}}, @@ -87,7 +93,10 @@ func TestBWT_Extract(t *testing.T) { baseTestStr := "thequickbrownfoxjumpsoverthelazydogwithanovertfrownafterfumblingitsparallelogramshapedbananagramallarounddowntown" // len == 112 testStr := strings.Join([]string{baseTestStr, baseTestStr, baseTestStr}, "") - bwt := New(testStr) + bwt, err := New(testStr) + if err != nil { + t.Fatal(err) + } testTable := []BWTExtractTestCase{ {4, 8, "uick"}, From 38664345feb463a9af870c342221141bf6696cb3 Mon Sep 17 00:00:00 2001 From: Trenton Date: Wed, 6 Dec 2023 23:17:04 -0500 Subject: [PATCH 20/60] add TODO about sorting and the nullChar --- bwt/bwt.go | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/bwt/bwt.go b/bwt/bwt.go index 30810b4b5..d20595b09 100644 --- a/bwt/bwt.go +++ b/bwt/bwt.go @@ -7,7 +7,7 @@ import ( "golang.org/x/exp/slices" ) -const nullChar = "$" +const nullChar = "0" // BWT Burrow Wheeler Transform // Compresses and Indexes a given sequence so that it can be @@ -179,6 +179,9 @@ func New(sequence string) (BWT, error) { prefixArray[i] = sequence[len(sequence)-i-1:] } + // TODO: at the time of writing, the nullChar is 0, this is to ensure correctness in most cases. + // Do we want to roll our own sorting so we can make sure whatever is defined as the nullChar + // will absolutely be defined as the least? 
slices.Sort(prefixArray) suffixArray := make([]int, len(sequence)) From 154db9538817f0a97d680a00ea87175224bb282b Mon Sep 17 00:00:00 2001 From: Trenton Date: Fri, 8 Dec 2023 00:15:12 -0500 Subject: [PATCH 21/60] bwt examples, remove TODO that does not matter --- bwt/bwt.go | 8 -------- bwt/bwt_test.go | 50 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 50 insertions(+), 8 deletions(-) diff --git a/bwt/bwt.go b/bwt/bwt.go index d20595b09..0c5cd03b0 100644 --- a/bwt/bwt.go +++ b/bwt/bwt.go @@ -39,14 +39,6 @@ func (bwt BWT) Count(pattern string) int { // Locate returns a list of offsets at which the begging // of the provided pattern occurrs in the original // sequence. -// TODO: these are the offsets of the BWT where we added -// the nullChar. So these offsets don't acutally pertain -// to the original string, but the indexed BWT string. Do -// we just want to call that out in documentation? If the user -// wants to extract that pattern later on as if it were from -// the origianl unmodified sequence, then the offsets would be -// off by one. With that beign said, maybe they should be using -// Extract to do that? 
func (bwt BWT) Locate(pattern string) []int { searchRange := bwt.lfSearch(pattern) if searchRange.start >= searchRange.end { diff --git a/bwt/bwt_test.go b/bwt/bwt_test.go index d87242386..58eb86819 100644 --- a/bwt/bwt_test.go +++ b/bwt/bwt_test.go @@ -1,12 +1,26 @@ package bwt import ( + "fmt" + "log" "strings" "testing" "golang.org/x/exp/slices" ) +func ExampleBWT_Count() { + inputSequence := "AACCTGCCGTCGGGGCTGCCCGTCGCGGGACGTCGAAACGTGGGGCGAAACGTG" + + bwt, err := New(inputSequence) + if err != nil { + log.Fatal(err) + } + + fmt.Println(bwt.Count("CG")) + // Output: 10 +} + type BWTCountTestCase struct { seq string expected int @@ -42,12 +56,36 @@ func TestBWT_Count(t *testing.T) { } } +func ExampleBWT_Locate() { + inputSequence := "AACCTGCCGTCGGGGCTGCCCGTCGCGGGACGTCGAAACGTGGGGCGAAACGTG" + + bwt, err := New(inputSequence) + if err != nil { + log.Fatal(err) + } + + offsets := bwt.Locate("CG") + slices.Sort(offsets) + fmt.Println(offsets) + // Output: [7 10 20 23 25 30 33 38 45 50] +} + type BWTLocateTestCase struct { seq string expected []int } func TestBWT_Locate(t *testing.T) { + + inputSequence := "AACCTGCCGTCGGGGCTGCCCGTCGCGGGACGTCGAAACGTGGGGCGAAACGTG" + + bwt2, err := New(inputSequence) + if err != nil { + log.Fatal(err) + } + + offsets := bwt2.Locate("CG") + slices.Sort(offsets) baseTestStr := "thequickbrownfoxjumpsoverthelazydogwithanovertfrownafterfumblingitsparallelogramshapedbananagramallarounddowntown" // len == 112 testStr := strings.Join([]string{baseTestStr, baseTestStr, baseTestStr}, "") @@ -83,6 +121,18 @@ func TestBWT_Locate(t *testing.T) { } } +func ExampleBWT_Extract() { + inputSequence := "AACCTGCCGTCGGGGCTGCCCGTCGCGGGACGTCGAAACGTGGGGCGAAACGTG" + + bwt, err := New(inputSequence) + if err != nil { + log.Fatal(err) + } + + fmt.Println(bwt.Extract(48, 54)) + // Output: AACGTG +} + type BWTExtractTestCase struct { start int end int From ff0fec93aa6ff92fea6aaf314d127f0424ecca64 Mon Sep 17 00:00:00 2001 From: Trenton Date: Fri, 8 Dec 2023 00:43:06 
-0500 Subject: [PATCH 22/60] wavelet tree doc and address todos --- bwt/wavelet.go | 73 ++++++++++++++++++++++++++++++++++---------------- 1 file changed, 50 insertions(+), 23 deletions(-) diff --git a/bwt/wavelet.go b/bwt/wavelet.go index 822695ede..2eef6e49e 100644 --- a/bwt/wavelet.go +++ b/bwt/wavelet.go @@ -1,18 +1,21 @@ package bwt import ( + "fmt" "math" "golang.org/x/exp/slices" ) +// waveletTree datastructure that allows us to +// conduct RSA queries on strings. type waveletTree struct { root *node alpha []charInfo } -// TODO: figure out empty nodes case -// TODO: figure out out of bounds case +// Access will return the ith character of the original +// string used to build the waveletTree func (wt waveletTree) Access(i int) byte { curr := wt.root for !curr.isLeaf() { @@ -27,8 +30,8 @@ func (wt waveletTree) Access(i int) byte { return curr.char } -// TODO: deal with bad lookup char -// TODO: deal with somehow bad path +// Rank allows us to get the rank of a specified character in +// the original string func (wt waveletTree) Rank(char byte, i int) int { curr := wt.root ci := wt.lookupCharInfo(char) @@ -48,6 +51,8 @@ func (wt waveletTree) Rank(char byte, i int) int { return rank } +// Select allows us to get the corresponding posisiton of a character +// in the original string given its rank. func (wt waveletTree) Select(char byte, rank int) int { curr := wt.root ci := wt.lookupCharInfo(char) @@ -67,8 +72,11 @@ func (wt waveletTree) Select(char byte, rank int) int { curr = curr.parent level-- pathBit := ci.path.getBit(ci.path.len() - 1 - level) - // TODO: do we really need the ok on the select? 
- rank, _ = curr.data.Select(pathBit, rank) + rank, ok := curr.data.Select(pathBit, rank) + if !ok { + msg := fmt.Sprintf("could not find a correspodning bit for node.Select(%t, %d) for characterInfo %+v", pathBit, rank, ci) + panic(msg) + } } return rank @@ -80,11 +88,10 @@ func (wt waveletTree) lookupCharInfo(char byte) charInfo { return wt.alpha[i] } } - panic("better messaging or handling") + msg := fmt.Sprintf("could not find character %s in alphabet %+v. this should not be possible and indicates that the WaveletTree is malformed", string(char), wt.alpha) + panic(msg) } -// TODO: talk about how we could probably greaty improve performance with one big bit vector that -// represents the whole tree by concatenation the level order traversal of each node's bits type node struct { data rsaBitVector char byte @@ -168,7 +175,10 @@ func isInAlpha(alpha []charInfo, b byte) bool { return false } -// TODO: talk about arranging OG alpha such that we minimize memory +// partitionAlpha partitions the alaphabet in half based on whether its corresponding path bit +// is a 0 or 1. 0 with comprise the left tree while 1 will comprise the right. The alphabet +// should be sorted in such a way that we remove the most amount of characters nearest to the +// root of the tree to reduce the memory footprint as much as possible. 
func partitionAlpha(currentLevel int, alpha []charInfo) (left []charInfo, right []charInfo) { for _, a := range alpha { if a.path.getBit(a.path.len() - 1 - currentLevel) { @@ -181,19 +191,14 @@ func partitionAlpha(currentLevel int, alpha []charInfo) (left []charInfo, right return left, right } -func getLeft(nodePos int) int { - return nodePos*2 + 1 -} - -func getRight(nodePos int) int { - return nodePos*2 + 2 -} - -func getParent(nodePos int) int { - return (nodePos + 1) / 2 -} - -// alphabets are expected to be small for real usecases +// getCharInfoDescByRank takes in the bytes of the original +// string and return a sorted list of character metadata descending +// by rank. The character metadata is important for building the rest +// of the tree along with quering it later on. The sorting is important +// because this allows us to build the tree in the most memory efficient +// way since the characters with the greatest counts will be removed first +// before build the subsequent nodes in the lower levels. +// NOTE: alphabets are expected to be small for real usecases func getCharInfoDescByRank(b []byte) []charInfo { ranks := make(map[byte]int) for i := 0; i < len(b); i++ { @@ -226,6 +231,28 @@ func getCharInfoDescByRank(b []byte) []charInfo { return sortedInfo } +// encodeCharPathIntoBitVector important metadata to understand +// which character we are woring with in a given path in the tree. 
+// For example, given the alphabet A B C D E F G, a possible encoding is: +// A: 000 +// B: 001 +// C: 010 +// D: 011 +// E: 100 +// F: 101 +// G: 110 +// H: 111 +// +// If we wanted to get to the leaf that represent the character D, we'd +// take the path: +// +// root +// / +// left +// \ +// right +// \ +// right func encodeCharPathIntoBitVector(bv bitvector, n uint64) { shift := 0 for n>>shift > 0 { From 85891982387d0e07b67ebf631efe3499b5d7a738 Mon Sep 17 00:00:00 2001 From: Trenton Date: Fri, 8 Dec 2023 01:18:29 -0500 Subject: [PATCH 23/60] wavelet tree explination --- bwt/wavelet.go | 109 +++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 87 insertions(+), 22 deletions(-) diff --git a/bwt/wavelet.go b/bwt/wavelet.go index 2eef6e49e..ad25c1980 100644 --- a/bwt/wavelet.go +++ b/bwt/wavelet.go @@ -9,6 +9,93 @@ import ( // waveletTree datastructure that allows us to // conduct RSA queries on strings. +// +// For the waveletTree's usage, please read the its +// method documentation. To understand what it is and how +// it works, then read below. +// +// # The WaveletTree has several imporant components +// +// ## The Character's Path Encoding +// +// one important component is a character's path encoding. +// which character we are woring with in a given path in the tree. 
+// For example, given the alphabet A B C D E F G, a possible encoding is: +// A: 000 +// B: 001 +// C: 010 +// D: 011 +// E: 100 +// F: 101 +// G: 110 +// H: 111 +// +// If we wanted to get to the leaf that represent the character D, we'd +// take the path: +// +// root +// / +// left +// \ +// right +// \ +// right +// +// ## The Data Represented at each node +// +// Let us consider the sequence "bananas" +// It has the alphabet b, a, n, s +// Let's say it has the encoding: +// a: 00 +// n: 01 +// b: 10 +// s: 11 +// and that 0 if left and 1 is right +// We can represent this tree with bitvectors: +// +// 0010101 +// / \ +// 1000 001 +// / \ / \ +// a n b s +// +// If we transalte each bit vector to its corresponding string, then it becomes: +// +// bananas +// / \ +// baaa nns +// / \ / \ +// a b n s +// +// ## RSA Intuition +// +// From here you may be able to build some intuition as to how we can take RSA queries given +// a characters path encoding and which character we'd like to Rank, Select, and Access. +// +// ### Rank Example +// +// To get WaveletTree.Rank(a, 4) of bananas where a's encoding is 00 +// 1. root.Rank(0, 4) of 0010101 is 2 +// 2. Visit Left Child +// 3. child.Rank(0, 2) of 1000 is 1 +// 4. Visit Left Child +// 5. return 1 +// +// ### Select Example +// +// To get WaveletTree.Select(n, 1) of banans where n's encoding is 01 +// 1. Go down to n's leaf using the path encoding is 01 +// 2. Go back to n's leaf's parent +// 3. child.Select(0, 1) of 001 is 1 +// 4. Go to the next parent +// 5. child.Select(1, 1) of 0010101 is 4 +// 6. return 4 since we are at the root. +// +// ### Access Example +// +// If you've reached this point, then you must really be trying to understand how the +// waveletTree works. I recommend thinking through how access could work with the example +// above. HINT: it involved rank. 
type waveletTree struct { root *node alpha []charInfo @@ -231,28 +318,6 @@ func getCharInfoDescByRank(b []byte) []charInfo { return sortedInfo } -// encodeCharPathIntoBitVector important metadata to understand -// which character we are woring with in a given path in the tree. -// For example, given the alphabet A B C D E F G, a possible encoding is: -// A: 000 -// B: 001 -// C: 010 -// D: 011 -// E: 100 -// F: 101 -// G: 110 -// H: 111 -// -// If we wanted to get to the leaf that represent the character D, we'd -// take the path: -// -// root -// / -// left -// \ -// right -// \ -// right func encodeCharPathIntoBitVector(bv bitvector, n uint64) { shift := 0 for n>>shift > 0 { From 5d63789200305ecc0e51da8f172b6135bb52ef55 Mon Sep 17 00:00:00 2001 From: Trenton Date: Fri, 8 Dec 2023 01:21:22 -0500 Subject: [PATCH 24/60] doc and note for waveletTree --- bwt/wavelet.go | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/bwt/wavelet.go b/bwt/wavelet.go index ad25c1980..e943a3fbf 100644 --- a/bwt/wavelet.go +++ b/bwt/wavelet.go @@ -95,7 +95,12 @@ import ( // // If you've reached this point, then you must really be trying to understand how the // waveletTree works. I recommend thinking through how access could work with the example -// above. HINT: it involved rank. +// above. HINT: it involves rank. +// +// NOTE: The waveletTree literlally have to be a tree. There are other forms that it may +// exist in like the concatenation of order level representation of all its node's bitvectors... +// as one example. 
Please reference the implementation if you'd like to understand how this +// specific waveletTree works type waveletTree struct { root *node alpha []charInfo From 51d8bfd56ca42978289f0478a674858fe5cbb144 Mon Sep 17 00:00:00 2001 From: Trenton Date: Fri, 8 Dec 2023 01:22:39 -0500 Subject: [PATCH 25/60] typo --- bwt/wavelet.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bwt/wavelet.go b/bwt/wavelet.go index e943a3fbf..2a9064d60 100644 --- a/bwt/wavelet.go +++ b/bwt/wavelet.go @@ -59,7 +59,7 @@ import ( // / \ / \ // a n b s // -// If we transalte each bit vector to its corresponding string, then it becomes: +// If we translate each bit vector to its corresponding string, then it becomes: // // bananas // / \ From 3a62d7b652e714874fdbe524ac3789ba86c20ece Mon Sep 17 00:00:00 2001 From: Trenton Date: Sat, 9 Dec 2023 01:29:56 -0500 Subject: [PATCH 26/60] extract changes, move around and add to wavelet doc --- bwt/bwt.go | 19 ++++++++++++------- bwt/bwt_test.go | 32 ++++++++++++++++++++++++++++++++ bwt/wavelet.go | 30 +++++++++++++++++++----------- 3 files changed, 63 insertions(+), 18 deletions(-) diff --git a/bwt/bwt.go b/bwt/bwt.go index 0c5cd03b0..f5dc91f19 100644 --- a/bwt/bwt.go +++ b/bwt/bwt.go @@ -60,8 +60,8 @@ func (bwt BWT) Locate(pattern string) []int { // end is the end of the range of text to extract exclusive. // If either start or end are out of bounds, Extract will panic. 
func (bwt BWT) Extract(start, end int) string { - if end > bwt.getLenOfOriginalString() { - msg := fmt.Sprintf("end [%d] exceeds the max range of the BWT [%d]", end, bwt.getLenOfOriginalString()) + if end > bwt.getLenOfOriginalStringWithNullChar()-1 { + msg := fmt.Sprintf("end [%d] exceeds the max range of the BWT [%d]", end, bwt.getLenOfOriginalStringWithNullChar()-1) panic(msg) } if start < 0 { @@ -78,6 +78,11 @@ func (bwt BWT) Extract(start, end int) string { return strB.String() } +// Len return the length of the sequence used to build the BWT +func (bwt BWT) Len() int { + return bwt.getLenOfOriginalStringWithNullChar() - 1 +} + // getFCharPosFromOriginalSequenceCharPos looks up mapping from the original position // of the sequence to its corresponding posisiton in the First Column of the BWT func (bwt BWT) getFCharPosFromOriginalSequenceCharPos(originalPos int) int { @@ -86,14 +91,14 @@ func (bwt BWT) getFCharPosFromOriginalSequenceCharPos(originalPos int) int { return i } } - panic("Unable to find the corresponding orginal positiong for a character in the original sequence in the suffix array. This should not be possible and indicates a malformed BWT.") + panic("Unable to find the corresponding original position for a character in the original sequence in the suffix array. This should not be possible and indicates a malformed BWT.") } // lfSearch LF Search- Last First Search. // Finds the valid range within the BWT index where the provided pattern is possible. // If the final range is <= 0, then the pattern does not exist in the original sequence. 
func (bwt BWT) lfSearch(pattern string) interval { - searchRange := interval{start: 0, end: bwt.getLenOfOriginalString()} + searchRange := interval{start: 0, end: bwt.getLenOfOriginalStringWithNullChar()} for i := 0; i < len(pattern); i++ { if searchRange.end-searchRange.start <= 0 { return interval{} @@ -123,8 +128,8 @@ func (bwt BWT) lookupSkipByChar(c byte) (entry skipEntry, ok bool) { // lookupSkipByOffset looks up a skipEntry based off of an // offset of the Fist Coulmn of the BWT. func (bwt BWT) lookupSkipByOffset(offset int) skipEntry { - if offset > bwt.getLenOfOriginalString()-1 { - msg := fmt.Sprintf("offset [%d] exceeds the max bound of the BWT [%d]", offset, bwt.getLenOfOriginalString()-1) + if offset > bwt.getLenOfOriginalStringWithNullChar()-1 { + msg := fmt.Sprintf("offset [%d] exceeds the max bound of the BWT [%d]", offset, bwt.getLenOfOriginalStringWithNullChar()-1) panic(msg) } if offset < 0 { @@ -140,7 +145,7 @@ func (bwt BWT) lookupSkipByOffset(offset int) skipEntry { panic("figure out what to do here") } -func (bwt BWT) getLenOfOriginalString() int { +func (bwt BWT) getLenOfOriginalStringWithNullChar() int { return bwt.firstColumnSkipList[len(bwt.firstColumnSkipList)-1].openEndedInterval.end } diff --git a/bwt/bwt_test.go b/bwt/bwt_test.go index 58eb86819..3abe3b3b0 100644 --- a/bwt/bwt_test.go +++ b/bwt/bwt_test.go @@ -213,3 +213,35 @@ func TestBWT_Extract(t *testing.T) { } } } + +func TestBWT_Extract_DoNotAllowExtractionOfLastNullChar(t *testing.T) { + defer func() { _ = recover() }() + testStr := "banana" + + bwt, err := New(testStr) + if err != nil { + t.Fatal(err) + } + + str := bwt.Extract(0, 6) + if str != testStr { + t.Fatalf("extractRange=(%d, %d) expected=%s actual=%s", 0, 6, testStr, str) + } + + str = bwt.Extract(0, 7) + + t.Fatalf("extractRange=(%d, %d) expected panic so we do not allow access to the null character", 0, 7) +} + +func TestBWT_Len(t *testing.T) { + testStr := "banana" + + bwt, err := New(testStr) + if err != nil { + 
t.Fatal(err) + } + + if bwt.Len() != len(testStr) { + t.Fatalf("expected Len to be %d but got %d", len(testStr), bwt.Len()) + } +} diff --git a/bwt/wavelet.go b/bwt/wavelet.go index 2a9064d60..637096bf1 100644 --- a/bwt/wavelet.go +++ b/bwt/wavelet.go @@ -7,19 +7,16 @@ import ( "golang.org/x/exp/slices" ) -// waveletTree datastructure that allows us to -// conduct RSA queries on strings. -// // For the waveletTree's usage, please read the its // method documentation. To understand what it is and how -// it works, then read below. +// it works for either curiosity or maintence, then read below. // // # The WaveletTree has several imporant components // // ## The Character's Path Encoding // // one important component is a character's path encoding. -// which character we are woring with in a given path in the tree. +// which character we are working with in a given path in the tree. // For example, given the alphabet A B C D E F G, a possible encoding is: // A: 000 // B: 001 @@ -69,6 +66,8 @@ import ( // // ## RSA Intuition // +// RSA stands for (R)ank, (S)elect, (A)ccess. +// // From here you may be able to build some intuition as to how we can take RSA queries given // a characters path encoding and which character we'd like to Rank, Select, and Access. // @@ -86,21 +85,30 @@ import ( // To get WaveletTree.Select(n, 1) of banans where n's encoding is 01 // 1. Go down to n's leaf using the path encoding is 01 // 2. Go back to n's leaf's parent -// 3. child.Select(0, 1) of 001 is 1 +// 3. parent.Select(0, 1) of 001 is 1 // 4. Go to the next parent -// 5. child.Select(1, 1) of 0010101 is 4 +// 5. parent.Select(1, 1) of 0010101 is 4 // 6. return 4 since we are at the root. // // ### Access Example // // If you've reached this point, then you must really be trying to understand how the // waveletTree works. I recommend thinking through how access could work with the example -// above. HINT: it involves rank. +// above. HINT: rank might help. 
// -// NOTE: The waveletTree literlally have to be a tree. There are other forms that it may +// NOTE: The waveletTree does not literally have to be a tree. There are other forms that it may // exist in like the concatenation of order level representation of all its node's bitvectors... // as one example. Please reference the implementation if you'd like to understand how this -// specific waveletTree works +// specific waveletTree works. + +// waveletTree is datastructure that allows us to index a sequence +// in a memory efficient way that allows us to conduct RSA, (R)ank (S)elect (A)ccess +// queries on strings. This is very useful in situations where you'd like to understand +// certain aspects of a sequence like: +// * the number of times a character appears +// * counting how the frequency of a character up to certain offset +// * locating characters of certain rank within the sequence +// * accessing the character at a given position type waveletTree struct { root *node alpha []charInfo @@ -143,7 +151,7 @@ func (wt waveletTree) Rank(char byte, i int) int { return rank } -// Select allows us to get the corresponding posisiton of a character +// Select allows us to get the corresponding position of a character // in the original string given its rank. func (wt waveletTree) Select(char byte, rank int) int { curr := wt.root From f25c8511ab4903b65a3e69b1c752098a0211d17a Mon Sep 17 00:00:00 2001 From: Trenton Date: Mon, 11 Dec 2023 22:41:25 -0500 Subject: [PATCH 27/60] add bwt high level. move wavelet tree's some rsa bv docs --- bwt/bwt.go | 167 +++++++++++++++++++++++++++++++++-- bwt/rsa_bitvector.go | 12 +++ bwt/wavelet.go | 201 ++++++++++++++++++++++--------------------- 3 files changed, 275 insertions(+), 105 deletions(-) diff --git a/bwt/bwt.go b/bwt/bwt.go index f5dc91f19..bb8664069 100644 --- a/bwt/bwt.go +++ b/bwt/bwt.go @@ -7,6 +7,159 @@ import ( "golang.org/x/exp/slices" ) +/* + +For the BWT usage, please read the its +method documentation. 
To understand what it is and how +it works for either curiosity or maintenance, then read below. + +# BWT Components + +## BWT Transform + +BWT Stand for (B)urrow (W)heeler (T)ransform. This is done by: +1. Appending a null terminating character to the end of a sequence +2. Rotate the sequence so that the last character is now the first +3. Repeat 2. N times where N is the length of the sequence +4. Lexicographically sort the NxN matrix of rotated sequences where + the null termination character is always the least-valued +5. Store the first and last column of the matrix. The last column + is the output of the BWT. The first column is needed to run queries + on the BWT of the original sequence. + +Lets use banana as an example. + +banana$ $banana +$banana a$banan +a$banan ana$ban +na$bana => anana$b +ana$ban banana$ +nana$ba na$bana +anana$b nana$ba + +Output: + +Last Column (BWT): annb$aa +First Column: $aaabnn + +## LF Mapping Properties + +From now on we will refer to the Last Column as L and the First as F + +There are a few special properties here to be aware of. First, notice +how the characters of the same rank show up in the same order for each +column: + +L: a0 n0 n1 b0 $0 a1 a2 + +F: $0 a0 a1 a2 b0 n0 n1 + +That is to say the characters' rank for each column appear in ascending +order. For example: a0 < a1 < a2. This is true for all BWTs + +The other important property to observe is that since the BWT is the +result of rotating each string, each character in the L column precedes +the corresponding character in the F column. + +To best way to show this is to rebuild the original sequence +using the F and L columns. We do this by rebuilding the original string +in reverse order starting with the nullChar. 
+ +Original string: ______$0 + +F($0) -> L(a0) -> _____a0$0 +F(a0) -> L(n0) -> ____n0a0$0 +F(n0) -> L(a1) -> ___a1n0a0$0 +F(a1) -> L(n1) -> __n1a1n0a0$0 +F(n1) -> L(a2) -> _a2n1a1n0a0$0 +F(a2) -> L(b0) -> b0a2n1a1n0a0$0 +F(b0) -> L($0) -> Complete + +If we take the rank subscripts away from: b0a2n1a1n0a0$0 +We get... "banana$" ! + +## LF Mapping + +From these properties, the most important concept emerges- the LF Mapping. +The LF mapping is what enables us to query and analyze the BWT to gain +insight about the original sequence. + +For example, let's say we wanted to count the number of occurrences of the +pattern "ana" in "banana". We can do this by: + +1. Lookup the last char of the sequence, a, in the F column +2. Find that range of a's, [1, 4) +3. Take the next previous character in the pattern, n +4. Find the rank of n before the range from 2. [0, 1) = 0 +5. Find the rank of n in the range from 2. [1, 4) = 2 +6. Look up the start range of the n's in the F column, 5 +7. Add the result from 4 and 5 respectively to form the next + L search range: [5+0, 5+0+2) = [5, 7) +8. Take next previous character in the pattern, a +9. Take the rank of "a" before the range from 7. [0, 5) = 1 +10. Take the rank of "a" in the range from 7. [5, 7) = 2 +11. Lookup the a's in the F column again, but add the results + from 9 and 10 to the start range to get the next search + range = [1+1, 1+1+2) = [2, 4) +12. That is the beginning of our pattern, so we subtract the start from the end + of the search range to get our count, 4-2=2 + +Another way to look at this is that we are constantly refining our search +range for each character of the pattern we are searching for. Once we +reach the end of the pattern, our final range represents the a's which +start our pattern. If the range < 0, then at some point our search ranged +has collapsed and there is no matching pattern. + +## Suffix Array + +For other operations such as Locate and Extract, we need another auxiliary +data structure, the suffix array.
Since we could be at multiple points +within the original sequence and at any point within that sequence, we need +some kind of point of reference of where we are. We can do this by storing +the position of each original character for each of the corresponding +characters in the F column. With our banana example: + +F: $0 a0 a1 a2 b0 n0 n1 +SA: [6 5 3 1 0 4 2] + +If we take our count example for the pattern "ana" above, you'll remember +that our final search range was [2, 4). If we look up 2 and 3 in the SA, we'll +find the offsets 3 and 1, the two places where "ana" starts in the original sequence +"banana" + +## Notes on Performance + +The explanation above leads to a very naive implementation. For example, +having the full SA would take way more memory than the BWT itself. Assuming +int64, that would be 8 times the amount of memory of the BWT in its plain text +representation! In the implementation below, we may instead sample the SA +and do additional look ups as needed to find the offsets we need. + +Similarly, storing the F and L column as plain text has just doubled the +amount of memory from the original sequence... BWT is used for text +compression, not expansion! That's why in the below implementation, you +will see other data structures to actually compress the amount of memory +needed. You will also notice that we can make huge improvements by +compressing sequences like with the F column. + +Instead of: + +F: $0 a0 a1 a2 b0 n0 n1 + +Since F is lexicographically sorted, we can have: + +F: {$: [0, 1)}, {a: [1, 4)}, {b: [4, 5)}, {n: [5, 7)} + +Although these performance enhancements may look different from what is +described above, it is still just an LF mapping at the end of the day- just +with more steps. + + +NOTE: The above is just to explain what is happening at a high level.
Please +reference the implementation below to see how the BWT is actually currently +working +*/ + const nullChar = "0" // BWT Burrow Wheeler Transform @@ -18,12 +171,12 @@ const nullChar = "0" type BWT struct { // firstColumnSkipList is the first column of the BWT. It is // represented as a list of skipEntries because the first column of - // the BWT is always lexographically ordered. This saves time and memory. + // the BWT is always lexicographically ordered. This saves time and memory. firstColumnSkipList []skipEntry - // lastCoulmn last column of the BWT- the actual textual representation + // lastCoulmn is the last column of the BWT- the actual textual representation // of the BWT. lastCoulmn waveletTree - // suffixArray an array that allows us to map a posistion in the first + // suffixArray an array that allows us to map a position in the first // column to a position in the original sequence. This is needed to be // able to extract text from the BWT. suffixArray []int @@ -37,7 +190,7 @@ func (bwt BWT) Count(pattern string) int { } // Locate returns a list of offsets at which the begging -// of the provided pattern occurrs in the original +// of the provided pattern occurs in the original // sequence.
func (bwt BWT) Locate(pattern string) []int { searchRange := bwt.lfSearch(pattern) @@ -84,7 +237,7 @@ func (bwt BWT) Len() int { } // getFCharPosFromOriginalSequenceCharPos looks up mapping from the original position -// of the sequence to its corresponding posisiton in the First Column of the BWT +// of the sequence to its corresponding position in the First Column of the BWT func (bwt BWT) getFCharPosFromOriginalSequenceCharPos(originalPos int) int { for i := range bwt.suffixArray { if bwt.suffixArray[i] == originalPos { @@ -115,7 +268,7 @@ func (bwt BWT) lfSearch(pattern string) interval { return searchRange } -// lookupSkipByChar looks up a skipEntry by its character in the First Coulmn +// lookupSkipByChar looks up a skipEntry by its character in the First Column func (bwt BWT) lookupSkipByChar(c byte) (entry skipEntry, ok bool) { for i := range bwt.firstColumnSkipList { if bwt.firstColumnSkipList[i].char == c { @@ -126,7 +279,7 @@ func (bwt BWT) lookupSkipByChar(c byte) (entry skipEntry, ok bool) { } // lookupSkipByOffset looks up a skipEntry based off of an -// offset of the Fist Coulmn of the BWT. +// offset of the Fist Column of the BWT. func (bwt BWT) lookupSkipByOffset(offset int) skipEntry { if offset > bwt.getLenOfOriginalStringWithNullChar()-1 { msg := fmt.Sprintf("offset [%d] exceeds the max bound of the BWT [%d]", offset, bwt.getLenOfOriginalStringWithNullChar()-1) diff --git a/bwt/rsa_bitvector.go b/bwt/rsa_bitvector.go index c33374b39..371709f40 100644 --- a/bwt/rsa_bitvector.go +++ b/bwt/rsa_bitvector.go @@ -32,6 +32,12 @@ func newRSABitVectorFromBitVector(bv bitvector) rsaBitVector { } } +// Rank returns the rank of the given value up to, but not including +// the ith bit. We count Rank starting a 0. 
+// For Example: +// Given the bitvector 001000100001 +// Rank(true, 8) = 1 +// Rank(false, 8) = 5 func (rsa rsaBitVector) Rank(val bool, i int) int { if i > rsa.bv.len()-1 { if val { @@ -61,6 +67,11 @@ func (rsa rsaBitVector) Rank(val bool, i int) int { return (chunkPos*rsa.jrBitsPerChunk - chunk.onesCumulativeRank) + (subChunkPos*rsa.jrBitsPerSubChunk - subChunk.onesCumulativeRank) + bits.OnesCount64(remaining) } +// Select returns the position of the given value at the specified rank. +// For Example: +// Given the bitvector 001000100001 +// Select(true, 1) = 6 +// Select(false, 5) = 7 func (rsa rsaBitVector) Select(val bool, rank int) (i int, ok bool) { if val { i, ok := rsa.oneSelectMap[rank] @@ -71,6 +82,7 @@ func (rsa rsaBitVector) Select(val bool, rank int) (i int, ok bool) { } } +// Access returns the value of a bit at a given offset func (rsa rsaBitVector) Access(i int) bool { return rsa.bv.getBit(i) } diff --git a/bwt/wavelet.go b/bwt/wavelet.go index 637096bf1..386c09f05 100644 --- a/bwt/wavelet.go +++ b/bwt/wavelet.go @@ -7,101 +7,106 @@ import ( "golang.org/x/exp/slices" ) -// For the waveletTree's usage, please read the its -// method documentation. To understand what it is and how -// it works for either curiosity or maintence, then read below. -// -// # The WaveletTree has several imporant components -// -// ## The Character's Path Encoding -// -// one important component is a character's path encoding. -// which character we are working with in a given path in the tree.
-// For example, given the alphabet A B C D E F G, a possible encoding is: -// A: 000 -// B: 001 -// C: 010 -// D: 011 -// E: 100 -// F: 101 -// G: 110 -// H: 111 -// -// If we wanted to get to the leaf that represent the character D, we'd -// take the path: -// -// root -// / -// left -// \ -// right -// \ -// right -// -// ## The Data Represented at each node -// -// Let us consider the sequence "bananas" -// It has the alphabet b, a, n, s -// Let's say it has the encoding: -// a: 00 -// n: 01 -// b: 10 -// s: 11 -// and that 0 if left and 1 is right -// We can represent this tree with bitvectors: -// -// 0010101 -// / \ -// 1000 001 -// / \ / \ -// a n b s -// -// If we translate each bit vector to its corresponding string, then it becomes: -// -// bananas -// / \ -// baaa nns -// / \ / \ -// a b n s -// -// ## RSA Intuition -// -// RSA stands for (R)ank, (S)elect, (A)ccess. -// -// From here you may be able to build some intuition as to how we can take RSA queries given -// a characters path encoding and which character we'd like to Rank, Select, and Access. -// -// ### Rank Example -// -// To get WaveletTree.Rank(a, 4) of bananas where a's encoding is 00 -// 1. root.Rank(0, 4) of 0010101 is 2 -// 2. Visit Left Child -// 3. child.Rank(0, 2) of 1000 is 1 -// 4. Visit Left Child -// 5. return 1 -// -// ### Select Example -// -// To get WaveletTree.Select(n, 1) of banans where n's encoding is 01 -// 1. Go down to n's leaf using the path encoding is 01 -// 2. Go back to n's leaf's parent -// 3. parent.Select(0, 1) of 001 is 1 -// 4. Go to the next parent -// 5. parent.Select(1, 1) of 0010101 is 4 -// 6. return 4 since we are at the root. -// -// ### Access Example -// -// If you've reached this point, then you must really be trying to understand how the -// waveletTree works. I recommend thinking through how access could work with the example -// above. HINT: rank might help. -// -// NOTE: The waveletTree does not literally have to be a tree. 
There are other forms that it may -// exist in like the concatenation of order level representation of all its node's bitvectors... -// as one example. Please reference the implementation if you'd like to understand how this -// specific waveletTree works. - -// waveletTree is datastructure that allows us to index a sequence +/* + +For the waveletTree's usage, please read the its +method documentation. To understand what it is and how +it works for either curiosity or maintenance, then read below. + +# WaveletTree Components + +## The Character's Path Encoding + +One important component is a character's path encoding. +Which character we are working with in a given path in the tree. +For example, given the alphabet A B C D E F G, a possible encoding is: + +A: 000 +B: 001 +C: 010 +D: 011 +E: 100 +F: 101 +G: 110 +H: 111 + +If we wanted to get to the leaf that represent the character D, we'd +take the path: + + root + / +left + \ + right + \ + right + +## The Data Represented at each node + +Let us consider the sequence "bananas" +It has the alphabet b, a, n, s +Let's say it has the encoding: +a: 00 +n: 01 +b: 10 +s: 11 +and that 0 if left and 1 is right +We can represent this tree with bitvectors: + + 0010101 + / \ + 1000 001 + / \ / \ +a n b s + +If we translate each bit vector to its corresponding string, then it becomes: + + bananas + / \ + baaa nns + / \ / \ +a b n s + +## RSA Intuition + +RSA stands for (R)ank, (S)elect, (A)ccess. + +From here you may be able to build some intuition as to how we can take RSA queries given +a characters path encoding and which character we'd like to Rank, Select, and Access. + +### Rank Example + +To get WaveletTree.Rank(a, 4) of bananas where a's encoding is 00 +1. root.Rank(0, 4) of 0010101 is 2 +2. Visit Left Child +3. child.Rank(0, 2) of 1000 is 1 +4. Visit Left Child +5. return 1 + +### Select Example + +To get WaveletTree.Select(n, 1) of bananas where n's encoding is 01 +1. 
Go down to n's leaf using the path encoding is 01 +2. Go back to n's leaf's parent +3. parent.Select(0, 1) of 001 is 1 +4. Go to the next parent +5. parent.Select(1, 1) of 0010101 is 4 +6. return 4 since we are at the root. + +### Access Example + +If you've reached this point, then you must really be trying to understand how the +waveletTree works. I recommend thinking through how access could work with the example +above. HINT: rank might help. + +NOTE: The waveletTree does not literally have to be a tree. There are other forms that it may +exist in like the concatenation of order level representation of all its node's bitvectors... +as one example. Please reference the implementation if you'd like to understand how this +specific waveletTree works. + +*/ + +// waveletTree is a data structure that allows us to index a sequence // in a memory efficient way that allows us to conduct RSA, (R)ank (S)elect (A)ccess // queries on strings. This is very useful in situations where you'd like to understand // certain aspects of a sequence like: @@ -174,7 +179,7 @@ func (wt waveletTree) Select(char byte, rank int) int { pathBit := ci.path.getBit(ci.path.len() - 1 - level) rank, ok := curr.data.Select(pathBit, rank) if !ok { - msg := fmt.Sprintf("could not find a correspodning bit for node.Select(%t, %d) for characterInfo %+v", pathBit, rank, ci) + msg := fmt.Sprintf("could not find a corresponding bit for node.Select(%t, %d) for characterInfo %+v", pathBit, rank, ci) panic(msg) } } @@ -275,7 +280,7 @@ func isInAlpha(alpha []charInfo, b byte) bool { return false } -// partitionAlpha partitions the alaphabet in half based on whether its corresponding path bit +// partitionAlpha partitions the alphabet in half based on whether its corresponding path bit // is a 0 or 1. 0 with comprise the left tree while 1 will comprise the right. 
The alphabet // should be sorted in such a way that we remove the most amount of characters nearest to the // root of the tree to reduce the memory footprint as much as possible. @@ -294,7 +299,7 @@ func partitionAlpha(currentLevel int, alpha []charInfo) (left []charInfo, right // getCharInfoDescByRank takes in the bytes of the original // string and return a sorted list of character metadata descending // by rank. The character metadata is important for building the rest -// of the tree along with quering it later on. The sorting is important +// of the tree along with querying it later on. The sorting is important // because this allows us to build the tree in the most memory efficient // way since the characters with the greatest counts will be removed first // before build the subsequent nodes in the lower levels. From 491569551226fb067f960baf1f5e5438804ccf6f Mon Sep 17 00:00:00 2001 From: Trenton Date: Mon, 11 Dec 2023 23:17:19 -0500 Subject: [PATCH 28/60] simplify bitvector, docs for bitvector and rsaBitvector --- bwt/bitvector.go | 71 +++++++++++++------------------------------ bwt/bitvector_test.go | 45 --------------------------- bwt/rsa_bitvector.go | 33 ++++++++++++++++---- 3 files changed, 48 insertions(+), 101 deletions(-) diff --git a/bwt/bitvector.go b/bwt/bitvector.go index de6f7ae7f..b1cd72704 100644 --- a/bwt/bitvector.go +++ b/bwt/bitvector.go @@ -4,33 +4,37 @@ import ( "math" ) -// TODO: talk about why this is and why we approximate things to make them "simple enough" const wordSize = 64 +// bitvector a sequence of 1's and 0's. You can also think +// of this as an array of bits. This allows us to encode +// data in a memory efficient manner. type bitvector struct { - bits []uint64 - capacityInChunks int - numberOfBits int + bits []uint64 + numberOfBits int } +// newBitVector will return an initialized bitvector with +// the specified number of zeroed bits. 
func newBitVector(initialNumberOfBits int) bitvector { capacity := getNumOfBitSetsNeededForNumOfBits(initialNumberOfBits) bits := make([]uint64, capacity) return bitvector{ - bits: bits, - capacityInChunks: capacity, - numberOfBits: initialNumberOfBits, + bits: bits, + numberOfBits: initialNumberOfBits, } } -func (b bitvector) getNumOfBitSets() int { - return getNumOfBitSetsNeededForNumOfBits(b.len()) -} - -func (b bitvector) getBitSet(i int) uint64 { - return b.bits[i] +// getBitSet gets the while word as some offset from the +// bitvector. Useful if you'd prefer to work with the +// word rather than with individual bits. +func (b bitvector) getBitSet(bitSetPos int) uint64 { + return b.bits[bitSetPos] } +// getBit returns the value of the bit at a given offset +// True represents 1 +// False represents 0 func (b bitvector) getBit(i int) bool { b.checkBounds(i) @@ -40,6 +44,9 @@ func (b bitvector) getBit(i int) bool { return (b.bits[chunkStart] & (uint64(1) << (63 - offset))) != 0 } +// setBit sets the value of the bit at a given offset +// True represents 1 +// False represents 0 func (b bitvector) setBit(i int, val bool) { b.checkBounds(i) @@ -52,6 +59,7 @@ func (b bitvector) setBit(i int, val bool) { b.bits[chunkStart] &= ^(uint64(1) << (63 - offset)) } } + func (b bitvector) checkBounds(i int) { if i >= b.len() || i < 0 { panic("better out of bounds message") @@ -61,47 +69,10 @@ func (b bitvector) checkBounds(i int) { const factor1point2Threshold = 1e9 const factor1point5Threshold = 1e6 -func (b *bitvector) push(val bool) { - previousNumberOfBits := b.numberOfBits - nextNumberOfBits := previousNumberOfBits + 1 - if getNumOfBitSetsNeededForNumOfBits(nextNumberOfBits) <= b.capacityInChunks { - b.numberOfBits = nextNumberOfBits - b.setBit(previousNumberOfBits, val) - return - } - - var numOfBitsForNextCapacity int - switch true { - case nextNumberOfBits >= factor1point2Threshold: - numOfBitsForNextCapacity = int(math.Ceil(float64(previousNumberOfBits) * 1.2)) - 
break - case nextNumberOfBits >= factor1point5Threshold: - numOfBitsForNextCapacity = int(math.Ceil(float64(previousNumberOfBits) * 1.5)) - break - default: - numOfBitsForNextCapacity = previousNumberOfBits * 2 - } - - nextCapacity := getNumOfBitSetsNeededForNumOfBits(numOfBitsForNextCapacity) - - nextBits := make([]uint64, nextCapacity) - copy(b.bits, nextBits) - b.bits = nextBits - - b.numberOfBits = nextNumberOfBits - b.capacityInChunks = nextCapacity - - b.setBit(previousNumberOfBits, val) -} - func (b bitvector) len() int { return b.numberOfBits } -func (b bitvector) capacity() int { - return b.capacityInChunks -} - func getNumOfBitSetsNeededForNumOfBits(n int) int { return int(math.Ceil(float64(n) / wordSize)) } diff --git a/bwt/bitvector_test.go b/bwt/bitvector_test.go index 68c97ac92..c71003aab 100644 --- a/bwt/bitvector_test.go +++ b/bwt/bitvector_test.go @@ -11,14 +11,9 @@ type GetBitTestCase struct { func TestBitVector(t *testing.T) { initialNumberOfBits := wordSize*10 + 1 - expectedCapacity := 11 bv := newBitVector(initialNumberOfBits) - if bv.capacity() != expectedCapacity { - t.Fatalf("expected capacity to be %d but got %d", expectedCapacity, bv.capacity()) - } - if bv.len() != initialNumberOfBits { t.Fatalf("expected len to be %d but got %d", initialNumberOfBits, bv.len()) } @@ -115,43 +110,3 @@ func TestBitVectorBoundPanic_SetBit_Upper(t *testing.T) { bv := newBitVector(initialNumberOfBits) bv.setBit(initialNumberOfBits, true) } - -func TestBitVectorPush_NextPushLessThanCapacity_Single(t *testing.T) { - initialNumberOfBits := wordSize*10 + 1 - bv := newBitVector(initialNumberOfBits) - bv.push(true) - - expectedCapacity := 11 - if bv.capacity() != expectedCapacity { - t.Fatalf("expected capacity to be %d but got %d", expectedCapacity, bv.capacity()) - } - - expectedLength := initialNumberOfBits + 1 - if bv.len() != expectedLength { - t.Fatalf("expected len to be %d but got %d", expectedLength, bv.len()) - } - - if bv.getBit(initialNumberOfBits) != 
true { - t.Fatalf("expected %dth bit to be %t but got %t", initialNumberOfBits, true, bv.getBit(initialNumberOfBits)) - } -} - -func TestBitVectorPush_NextPushGreaterThanCapacity_Single(t *testing.T) { - initialNumberOfBits := wordSize * 10 - bv := newBitVector(initialNumberOfBits) - initialCapacity := bv.capacity() - bv.push(true) - - if bv.capacity() <= initialCapacity { - t.Fatalf("expected capacity to have grown. currently the capacity is %d and was previously %d", bv.capacity(), initialCapacity) - } - - expectedLength := initialNumberOfBits + 1 - if bv.len() != expectedLength { - t.Fatalf("expected len to be %d but got %d", expectedLength, bv.len()) - } - - if bv.getBit(initialNumberOfBits) != true { - t.Fatalf("expected %dth bit to be %t but got %t", initialNumberOfBits, true, bv.getBit(initialNumberOfBits)) - } -} diff --git a/bwt/rsa_bitvector.go b/bwt/rsa_bitvector.go index 371709f40..565060931 100644 --- a/bwt/rsa_bitvector.go +++ b/bwt/rsa_bitvector.go @@ -2,8 +2,8 @@ package bwt import "math/bits" -// TODO: doc what rsa is, why these DSAs, and why we take in a bit vector -// TODO: clarks select +// rsaBitVector allows us to perform RSA: (R)ank, (S)elect, and (A)ccess +// queries in a memory performant and memory compact way. type rsaBitVector struct { bv bitvector totalOnesRank int @@ -15,7 +15,10 @@ type rsaBitVector struct { zeroSelectMap map[int]int } -// TODO: talk about why bv should never be modidifed after building the RSA bit vector +// newRSABitVectorFromBitVector allows us to build the auxillary components +// needed to perform RSA queries on top of the provided bitvector. +// WARNING: Do not modify the underlying bitvector. The rsaBitvector will +// get out of sync with the original bitvector. 
func newRSABitVectorFromBitVector(bv bitvector) rsaBitVector { jacobsonRankChunks, jrSubChunksPerChunk, jrBitsPerSubChunk, totalOnesRank := buildJacobsonRank(bv) ones, zeros := buildSelectMaps(bv) @@ -96,7 +99,26 @@ type subChunk struct { onesCumulativeRank int } -// TODO: talk about easy to read instead vs perf +/* +buildJacobsonRank Jacobson rank is a succinct data structure. This allows us to represent something +normally would require O(N) worth of memory with less that N memory. Jacobson Rank allows for +sub linear growth. Jacobson rank also allows us to lookup rank for some value of a bitvector in O(1) +time. Theoretically, Jacobson Rank tells us to: +1. Create log(N) "Chunks" +2. Create 2log(N) "Sub Chunks" +3. Have "Sub Chunks" be 0.5log(N) in length +4. For each "Chunk", store the cumulative rank of set bits relative to the overall bitvector +5. For each "Sub Chunk", store the cumulative rank of set bits relative to the parent "Chunk" +6. We can One's count the N bit word if possible. We will only consider this possibility :) + +For simplicity and all around good results, we just have "Sub Chunks" of size 64 bits. + +It is O(1) because given some offset i, all we have to do is calculate rank is: +rank = CumulativeRank(ChunkOfi(i))) + CumulativeRank(SubChunkOfi(i))) + OnesCount(SubChunkOfi(i)) + +To understand why it is sub linear in space, you can refer to Ben Langmead and other literature that +describes this complexity. 
+*/ func buildJacobsonRank(inBv bitvector) (jacobsonRankChunks []chunk, numOfSubChunksPerChunk, numOfBitsPerSubChunk, totalRank int) { // TODO: talk about why this is probably good enough, improves as n grows, gets worse as n gets smaller, and how this fits into a machine instruction, and how this is "simple" numOfSubChunksPerChunk = 4 @@ -137,8 +159,7 @@ func buildJacobsonRank(inBv bitvector) (jacobsonRankChunks []chunk, numOfSubChun return jacobsonRankChunks, numOfSubChunksPerChunk, wordSize, totalRank } -// TODO: talk about how this could be improved memory wise. Talk about how clarks select exists, but keeping it "simple for now" but maybe worth -// making succinct later +// This is not good. We should find a better means of select- like Clark's Select func buildSelectMaps(inBv bitvector) (oneSelectMap, zeroSelectMap map[int]int) { oneSelectMap = make(map[int]int) zeroSelectMap = make(map[int]int) From 5c97fa7d02aa433262db2598f1b75e7c781447e8 Mon Sep 17 00:00:00 2001 From: Trenton Date: Mon, 11 Dec 2023 23:40:16 -0500 Subject: [PATCH 29/60] fix wavelet select --- bwt/wavelet.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/bwt/wavelet.go b/bwt/wavelet.go index 386c09f05..40cfb09a5 100644 --- a/bwt/wavelet.go +++ b/bwt/wavelet.go @@ -177,11 +177,12 @@ func (wt waveletTree) Select(char byte, rank int) int { curr = curr.parent level-- pathBit := ci.path.getBit(ci.path.len() - 1 - level) - rank, ok := curr.data.Select(pathBit, rank) + nextRank, ok := curr.data.Select(pathBit, rank) if !ok { msg := fmt.Sprintf("could not find a corresponding bit for node.Select(%t, %d) for characterInfo %+v", pathBit, rank, ci) panic(msg) } + rank = nextRank } return rank From a4548e39f6a349c535714ec0951265657cef839c Mon Sep 17 00:00:00 2001 From: Trenton Date: Mon, 11 Dec 2023 23:43:42 -0500 Subject: [PATCH 30/60] lint --- bwt/bitvector.go | 3 --- bwt/bwt_test.go | 3 +-- bwt/rsa_bitvector.go | 2 +- bwt/rsa_bitvector_test.go | 3 --- 4 files changed, 2 
insertions(+), 9 deletions(-) diff --git a/bwt/bitvector.go b/bwt/bitvector.go index b1cd72704..1d7309ffd 100644 --- a/bwt/bitvector.go +++ b/bwt/bitvector.go @@ -66,9 +66,6 @@ func (b bitvector) checkBounds(i int) { } } -const factor1point2Threshold = 1e9 -const factor1point5Threshold = 1e6 - func (b bitvector) len() int { return b.numberOfBits } diff --git a/bwt/bwt_test.go b/bwt/bwt_test.go index 3abe3b3b0..7900e2092 100644 --- a/bwt/bwt_test.go +++ b/bwt/bwt_test.go @@ -76,7 +76,6 @@ type BWTLocateTestCase struct { } func TestBWT_Locate(t *testing.T) { - inputSequence := "AACCTGCCGTCGGGGCTGCCCGTCGCGGGACGTCGAAACGTGGGGCGAAACGTG" bwt2, err := New(inputSequence) @@ -228,7 +227,7 @@ func TestBWT_Extract_DoNotAllowExtractionOfLastNullChar(t *testing.T) { t.Fatalf("extractRange=(%d, %d) expected=%s actual=%s", 0, 6, testStr, str) } - str = bwt.Extract(0, 7) + _ = bwt.Extract(0, 7) t.Fatalf("extractRange=(%d, %d) expected panic so we do not allow access to the null character", 0, 7) } diff --git a/bwt/rsa_bitvector.go b/bwt/rsa_bitvector.go index 565060931..aaa543ed8 100644 --- a/bwt/rsa_bitvector.go +++ b/bwt/rsa_bitvector.go @@ -15,7 +15,7 @@ type rsaBitVector struct { zeroSelectMap map[int]int } -// newRSABitVectorFromBitVector allows us to build the auxillary components +// newRSABitVectorFromBitVector allows us to build the auxiliary components // needed to perform RSA queries on top of the provided bitvector. // WARNING: Do not modify the underlying bitvector. The rsaBitvector will // get out of sync with the original bitvector. 
diff --git a/bwt/rsa_bitvector_test.go b/bwt/rsa_bitvector_test.go index a2c55b7b3..8c2f68951 100644 --- a/bwt/rsa_bitvector_test.go +++ b/bwt/rsa_bitvector_test.go @@ -39,7 +39,6 @@ func TestRSARank_singlePartialChunk(t *testing.T) { t.Fatalf("expected rank(%t, %d) to be %d but got %d", tc.val, tc.bitPosition, tc.expectedRank, rank) } } - } func TestRSARank_singleCompleteChunk_PastBounds_Ones(t *testing.T) { @@ -62,7 +61,6 @@ func TestRSARank_singleCompleteChunk_PastBounds_Ones(t *testing.T) { t.Fatalf("expected rank(%t, %d) to be %d but got %d", tc.val, tc.bitPosition, tc.expectedRank, rank) } } - } func TestRSARank_singleCompleteChunk_PastBounds_Zeros(t *testing.T) { @@ -85,7 +83,6 @@ func TestRSARank_singleCompleteChunk_PastBounds_Zeros(t *testing.T) { t.Fatalf("expected rank(%t, %d) to be %d but got %d", tc.val, tc.bitPosition, tc.expectedRank, rank) } } - } func TestRSARank_singleCompleteChunk(t *testing.T) { From 0168d0eb6a6c95c9e8a89ca78dcf81aeac007efc Mon Sep 17 00:00:00 2001 From: Trenton Date: Mon, 11 Dec 2023 23:45:16 -0500 Subject: [PATCH 31/60] more lint --- bwt/rsa_bitvector_test.go | 2 -- 1 file changed, 2 deletions(-) diff --git a/bwt/rsa_bitvector_test.go b/bwt/rsa_bitvector_test.go index 8c2f68951..3c1b969a6 100644 --- a/bwt/rsa_bitvector_test.go +++ b/bwt/rsa_bitvector_test.go @@ -157,7 +157,6 @@ func TestRSARank_singleCompleteChunk(t *testing.T) { t.Fatalf("expected rank(%t, %d) to be %d but got %d", tc.val, tc.bitPosition, tc.expectedRank, rank) } } - } func TestRSARank_multipleChunks(t *testing.T) { @@ -239,7 +238,6 @@ func TestRSARank_multipleChunks(t *testing.T) { t.Fatalf("expected rank(%t, %d) to be %d but got %d", tc.val, tc.bitPosition, tc.expectedRank, rank) } } - } type rsaSelectTestCase struct { From 42abe6c85e4677201df88d323d92bacc189f7655 Mon Sep 17 00:00:00 2001 From: Trenton Date: Mon, 11 Dec 2023 23:53:27 -0500 Subject: [PATCH 32/60] doc adjustments --- bwt/bwt.go | 16 ++++++++++++---- bwt/wavelet.go | 8 +++++--- 2 files 
changed, 17 insertions(+), 7 deletions(-) diff --git a/bwt/bwt.go b/bwt/bwt.go index bb8664069..0874c3ba4 100644 --- a/bwt/bwt.go +++ b/bwt/bwt.go @@ -13,11 +13,19 @@ For the BWT usage, please read the its method documentation. To understand what it is and how it works for either curiosity or maintenance, then read below. -# BWT Components +# BWT + +BWT Stand for (B)urrow (W)heeler (T)ransform. The BWT aids in +text compression and acts as a search index for any arbitrary +sequence of characters. With the BWT and some auxiliary data +structures, we can analyze a sequence in a memory and run time +efficient manner. ## BWT Transform -BWT Stand for (B)urrow (W)heeler (T)ransform. This is done by: +The first step to build the BWT is to get the BWT itself. + +This is done by: 1. Appending a null terminating character to the end of a sequence 2. Rotate the sequence so that the last character is now the first 3. Repeat 2. N times where N is the length of the sequence @@ -78,7 +86,7 @@ F(b0) -> L($0) -> Complete If we take the rank subscripts away from: b0a2n1a1n0a0$0 We get... "banana$" ! -## LF Mapping +## LF Mapping Usage From these properties, the most important concept emerges- the LF Mapping. The LF mapping is what enables us to query and analyze the BWT to gain @@ -108,7 +116,7 @@ Another way to look at this is that we are constantly refining our search range for each character of the pattern we are searching for. Once we reach the end of the pattern, our final range represents the a's which start our pattern. If the range < 0, then at some point our search ranged -has collapsed and there is no matching pattern. +has collapsed and we can conclude that there is no matching pattern. ## Suffix Array diff --git a/bwt/wavelet.go b/bwt/wavelet.go index 40cfb09a5..529b9de16 100644 --- a/bwt/wavelet.go +++ b/bwt/wavelet.go @@ -13,7 +13,11 @@ For the waveletTree's usage, please read the its method documentation. 
To understand what it is and how it works for either curiosity or maintenance, then read below. -# WaveletTree Components +# WaveletTree + +The Wavelet Tree allows us to conduct RSA queries on strings. in +a memory and run time efficient manner. +RSA stands for (R)ank, (S)elect, (A)ccess. ## The Character's Path Encoding @@ -69,8 +73,6 @@ a b n s ## RSA Intuition -RSA stands for (R)ank, (S)elect, (A)ccess. - From here you may be able to build some intuition as to how we can take RSA queries given a characters path encoding and which character we'd like to Rank, Select, and Access. From 8d661a14d86af96218b9eecbe8627dfdeb8b1e3b Mon Sep 17 00:00:00 2001 From: Trenton Date: Tue, 12 Dec 2023 00:06:20 -0500 Subject: [PATCH 33/60] changelog and ensure correct nullChar sorting --- CHANGELOG.md | 3 ++- bwt/bwt.go | 29 ++++++++++++++++++++++++----- 2 files changed, 26 insertions(+), 6 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index c2fe0c054..a809b0c5c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added - Alternative start codons can now be used in the `synthesis/codon` DNA -> protein translation package (#305) - Added a parser and writer for the `pileup` sequence alignment format (#329) +- Basic BWT for sub-sequence count and offset for sequence alignment. Only supports exact matches for now. ### Fixed - `fastq` parser no longer becomes de-aligned when reading (#325) @@ -19,4 +20,4 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 Oops, we weren't keeping a changelog before this tag! 
[unreleased]: https://github.com/TimothyStiles/poly/compare/v0.26.0...main -[0.26.0]: https://github.com/TimothyStiles/poly/releases/tag/v0.26.0 \ No newline at end of file +[0.26.0]: https://github.com/TimothyStiles/poly/releases/tag/v0.26.0 diff --git a/bwt/bwt.go b/bwt/bwt.go index 0874c3ba4..e8f0c3b96 100644 --- a/bwt/bwt.go +++ b/bwt/bwt.go @@ -2,6 +2,7 @@ package bwt import ( "fmt" + "math" "strings" "golang.org/x/exp/slices" @@ -168,7 +169,7 @@ reference the implementation below to see how the BWT is actually currently working */ -const nullChar = "0" +const nullChar = "$" // BWT Burrow Wheeler Transform // Compresses and Indexes a given sequence so that it can be @@ -337,10 +338,7 @@ func New(sequence string) (BWT, error) { prefixArray[i] = sequence[len(sequence)-i-1:] } - // TODO: at the time of writing, the nullChar is 0, this is to ensure correctness in most cases. - // Do we want to roll our own sorting so we can make sure whatever is defined as the nullChar - // will absolutely be defined as the least? 
- slices.Sort(prefixArray) + sortPrefixArray(prefixArray) suffixArray := make([]int, len(sequence)) lastColBuilder := strings.Builder{} @@ -391,3 +389,24 @@ func getBWTIndex(lenOfSequenceBeingBuilt, lenOfSuffixArrayVisited int) int { } return bwtCharIndex } + +func sortPrefixArray(prefixArray []string) { + slices.SortFunc(prefixArray, func(a, b string) bool { + minLen := int(math.Min(float64(len(a)), float64(len(b)))) + for i := 0; i < minLen; i++ { + if a[i] == b[i] { + continue + } + if a[i] == byte(nullChar[0]) { + return true + } + if b[i] == byte(nullChar[0]) { + return false + } + return a[i] < b[i] + } + + return len(a) < len(b) + }) + +} From a520093302bcbc71f203c9ac38f3b8b378c7f39d Mon Sep 17 00:00:00 2001 From: Trenton Date: Tue, 12 Dec 2023 00:29:39 -0500 Subject: [PATCH 34/60] fix changelog --- CHANGELOG.md | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index fa13f0125..a939d8615 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,17 +8,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] ### Added -- Alternative start codons can now be used in the `synthesis/codon` DNA -> protein translation package (#305) -- Added a parser and writer for the `pileup` sequence alignment format (#329) - Basic BWT for sub-sequence count and offset for sequence alignment. Only supports exact matches for now. ### Fixed -- `fastq` parser no longer becomes de-aligned when reading (#325) -- `fastq` now handles optionals correctly (#323) - Fixed bug that produced wrong overhang in linear, non-directional, single cut reactions. #408 - -## [0.26.0] - 2023-07-22 -Oops, we weren't keeping a changelog before this tag! 
- -[unreleased]: https://github.com/TimothyStiles/poly/compare/v0.26.0...main -[0.26.0]: https://github.com/TimothyStiles/poly/releases/tag/v0.26.0 From f6c2bb9dfcbf1524c1d65784baf12ba2fd497ed0 Mon Sep 17 00:00:00 2001 From: Trenton Date: Tue, 12 Dec 2023 00:33:23 -0500 Subject: [PATCH 35/60] golanglintci fixes --- bwt/bwt.go | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/bwt/bwt.go b/bwt/bwt.go index e8f0c3b96..7b8cbabb7 100644 --- a/bwt/bwt.go +++ b/bwt/bwt.go @@ -397,10 +397,10 @@ func sortPrefixArray(prefixArray []string) { if a[i] == b[i] { continue } - if a[i] == byte(nullChar[0]) { + if a[i] == nullChar[0] { return true } - if b[i] == byte(nullChar[0]) { + if b[i] == nullChar[0] { return false } return a[i] < b[i] @@ -408,5 +408,4 @@ func sortPrefixArray(prefixArray []string) { return len(a) < len(b) }) - } From f5b459b07749bf1b4f45f7f2d034d23cd97aaf5b Mon Sep 17 00:00:00 2001 From: Trenton Date: Wed, 13 Dec 2023 17:50:47 -0500 Subject: [PATCH 36/60] bubble up errs instead of panics --- bwt/bwt.go | 37 +++++++++++++++++++++++++------------ bwt/bwt_test.go | 45 ++++++++++++++++++++++++++++----------------- 2 files changed, 53 insertions(+), 29 deletions(-) diff --git a/bwt/bwt.go b/bwt/bwt.go index 7b8cbabb7..606828fb7 100644 --- a/bwt/bwt.go +++ b/bwt/bwt.go @@ -193,27 +193,31 @@ type BWT struct { // Count represents the number of times the provided pattern // shows up in the original sequence. -func (bwt BWT) Count(pattern string) int { +func (bwt BWT) Count(pattern string) (count int, err error) { + // defer func() { BWTRecoverAPIBoundary("Count", *err) }() + searchRange := bwt.lfSearch(pattern) - return searchRange.end - searchRange.start + return searchRange.end - searchRange.start, nil } // Locate returns a list of offsets at which the begging // of the provided pattern occurs in the original // sequence. 
-func (bwt BWT) Locate(pattern string) []int { +func (bwt BWT) Locate(pattern string) (offsets []int, err error) { + // defer func() { BWTRecoverAPIBoundary("Locate") }() + searchRange := bwt.lfSearch(pattern) if searchRange.start >= searchRange.end { - return nil + return nil, nil } numOfOffsets := searchRange.end - searchRange.start - offsets := make([]int, numOfOffsets) + offsets = make([]int, numOfOffsets) for i := 0; i < numOfOffsets; i++ { offsets[i] = bwt.suffixArray[searchRange.start+i] } - return offsets + return offsets, nil } // Extract this allows us to extract parts of the original @@ -221,14 +225,15 @@ func (bwt BWT) Locate(pattern string) []int { // start is the begging of the range of text to extract inclusive. // end is the end of the range of text to extract exclusive. // If either start or end are out of bounds, Extract will panic. -func (bwt BWT) Extract(start, end int) string { +func (bwt BWT) Extract(start, end int) (extracted string, err error) { + defer bwtRecovery("Extract", &err) + if end > bwt.getLenOfOriginalStringWithNullChar()-1 { - msg := fmt.Sprintf("end [%d] exceeds the max range of the BWT [%d]", end, bwt.getLenOfOriginalStringWithNullChar()-1) - panic(msg) + return "", fmt.Errorf("end [%d] exceeds the max range of the BWT [%d]", end, bwt.getLenOfOriginalStringWithNullChar()-1) } + if start < 0 { - msg := fmt.Sprintf("start [%d] exceeds the min range of the BWT [0]", start) - panic(msg) + return "", fmt.Errorf("start [%d] exceeds the min range of the BWT [0]", start) } strB := strings.Builder{} @@ -237,7 +242,8 @@ func (bwt BWT) Extract(start, end int) string { skip := bwt.lookupSkipByOffset(fPos) strB.WriteByte(skip.char) } - return strB.String() + + return strB.String(), nil } // Len return the length of the sequence used to build the BWT @@ -409,3 +415,10 @@ func sortPrefixArray(prefixArray []string) { return len(a) < len(b) }) } + +func bwtRecovery(operation string, err *error) { + if r := recover(); r != nil { + rErr := 
fmt.Errorf("BWT %s InternalError=%s", operation, r) + *err = rErr + } +} diff --git a/bwt/bwt_test.go b/bwt/bwt_test.go index 7900e2092..37ba60f1c 100644 --- a/bwt/bwt_test.go +++ b/bwt/bwt_test.go @@ -49,7 +49,10 @@ func TestBWT_Count(t *testing.T) { } for _, v := range testTable { - count := bwt.Count(v.seq) + count, err := bwt.Count(v.seq) + if err != nil { + t.Fatalf("seq=%s unexpectedError=%s", v.seq, err) + } if count != v.expected { t.Fatalf("seq=%s expectedCount=%v actualCount=%v", v.seq, v.expected, count) } @@ -64,7 +67,10 @@ func ExampleBWT_Locate() { log.Fatal(err) } - offsets := bwt.Locate("CG") + offsets, err := bwt.Locate("CG") + if err != nil { + log.Fatal(err) + } slices.Sort(offsets) fmt.Println(offsets) // Output: [7 10 20 23 25 30 33 38 45 50] @@ -76,15 +82,6 @@ type BWTLocateTestCase struct { } func TestBWT_Locate(t *testing.T) { - inputSequence := "AACCTGCCGTCGGGGCTGCCCGTCGCGGGACGTCGAAACGTGGGGCGAAACGTG" - - bwt2, err := New(inputSequence) - if err != nil { - log.Fatal(err) - } - - offsets := bwt2.Locate("CG") - slices.Sort(offsets) baseTestStr := "thequickbrownfoxjumpsoverthelazydogwithanovertfrownafterfumblingitsparallelogramshapedbananagramallarounddowntown" // len == 112 testStr := strings.Join([]string{baseTestStr, baseTestStr, baseTestStr}, "") @@ -107,7 +104,10 @@ func TestBWT_Locate(t *testing.T) { } for _, v := range testTable { - offsets := bwt.Locate(v.seq) + offsets, err := bwt.Locate(v.seq) + if err != nil { + t.Fatalf("seq=%s unexpectedError=%s", v.seq, err) + } slices.Sort(offsets) if len(offsets) != len(v.expected) { t.Fatalf("seq=%s expectedOffsets=%v actualOffsets=%v", v.seq, v.expected, offsets) @@ -206,7 +206,10 @@ func TestBWT_Extract(t *testing.T) { } for _, v := range testTable { - str := bwt.Extract(v.start, v.end) + str, err := bwt.Extract(v.start, v.end) + if err != nil { + t.Fatalf("extractRange=(%d, %d) unexpectedError=%s", v.start, v.end, err) + } if str != v.expected { t.Fatalf("extractRange=(%d, %d) expected=%s 
actual=%s", v.start, v.end, v.expected, str) } @@ -214,7 +217,6 @@ func TestBWT_Extract(t *testing.T) { } func TestBWT_Extract_DoNotAllowExtractionOfLastNullChar(t *testing.T) { - defer func() { _ = recover() }() testStr := "banana" bwt, err := New(testStr) @@ -222,14 +224,23 @@ func TestBWT_Extract_DoNotAllowExtractionOfLastNullChar(t *testing.T) { t.Fatal(err) } - str := bwt.Extract(0, 6) + str, err := bwt.Extract(0, 6) + if err != nil { + t.Fatalf("extractRange=(%d, %d) unexpectedError=%s", 0, 6, err) + } if str != testStr { t.Fatalf("extractRange=(%d, %d) expected=%s actual=%s", 0, 6, testStr, str) } - _ = bwt.Extract(0, 7) + _, err = bwt.Extract(0, 7) + + if err == nil { + t.Fatalf("extractRange=(%d, %d) expected err but was nil", 0, 7) + } - t.Fatalf("extractRange=(%d, %d) expected panic so we do not allow access to the null character", 0, 7) + if !strings.Contains(err.Error(), "exceeds the max range") { + t.Fatalf("expected error to contain \"exceeds the max range\" but received \"%s\"", err) + } } func TestBWT_Len(t *testing.T) { From dcd5aff7a781156b89f9c55c230d11e8fdc1eec3 Mon Sep 17 00:00:00 2001 From: Trenton Date: Wed, 13 Dec 2023 17:57:05 -0500 Subject: [PATCH 37/60] fix examples --- bwt/bwt_test.go | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/bwt/bwt_test.go b/bwt/bwt_test.go index 37ba60f1c..cdaa6ecc3 100644 --- a/bwt/bwt_test.go +++ b/bwt/bwt_test.go @@ -17,7 +17,11 @@ func ExampleBWT_Count() { log.Fatal(err) } - fmt.Println(bwt.Count("CG")) + count, err := bwt.Count("CG") + if err != nil { + log.Fatal(err) + } + fmt.Println(count) // Output: 10 } @@ -128,7 +132,11 @@ func ExampleBWT_Extract() { log.Fatal(err) } - fmt.Println(bwt.Extract(48, 54)) + extracted, err := bwt.Extract(48, 54) + if err != nil { + log.Fatal(err) + } + fmt.Println(extracted) // Output: AACGTG } From 4dde9017517d93627c08f5622669a0e610bbea96 Mon Sep 17 00:00:00 2001 From: Trenton Date: Thu, 14 Dec 2023 01:38:25 -0500 Subject: [PATCH 38/60] 
add recovery to other public API --- bwt/bwt.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bwt/bwt.go b/bwt/bwt.go index 606828fb7..e713a6440 100644 --- a/bwt/bwt.go +++ b/bwt/bwt.go @@ -194,7 +194,7 @@ type BWT struct { // Count represents the number of times the provided pattern // shows up in the original sequence. func (bwt BWT) Count(pattern string) (count int, err error) { - // defer func() { BWTRecoverAPIBoundary("Count", *err) }() + defer bwtRecovery("Count", &err) searchRange := bwt.lfSearch(pattern) return searchRange.end - searchRange.start, nil @@ -204,7 +204,7 @@ func (bwt BWT) Count(pattern string) (count int, err error) { // of the provided pattern occurs in the original // sequence. func (bwt BWT) Locate(pattern string) (offsets []int, err error) { - // defer func() { BWTRecoverAPIBoundary("Locate") }() + defer bwtRecovery("Locate", &err) searchRange := bwt.lfSearch(pattern) if searchRange.start >= searchRange.end { From ecc78d3b4d133329fb73fea95c2ae029609eb8a5 Mon Sep 17 00:00:00 2001 From: Trenton Date: Wed, 20 Dec 2023 23:08:45 -0500 Subject: [PATCH 39/60] Cite Ben Langmead. --- bwt/bwt.go | 3 +++ 1 file changed, 3 insertions(+) diff --git a/bwt/bwt.go b/bwt/bwt.go index e713a6440..55e70d187 100644 --- a/bwt/bwt.go +++ b/bwt/bwt.go @@ -167,6 +167,9 @@ with more steps. NOTE: The above is just to explain what is happening at a high level. Please reference the implementation below to see how the BWT is actually currently working + +Many of the Idea's come from Ben Langmead. 
+He has a whole YouTube playlist about BWT Indexing: https://www.youtube.com/watch?v=5G2Db41pSHE&list=PL2mpR0RYFQsADmYpW2YWBrXJZ_6EL_3nu */ const nullChar = "$" From 5b1ce0bf304cff0babf3349a89dea5f5af8aec98 Mon Sep 17 00:00:00 2001 From: Trenton Date: Wed, 20 Dec 2023 23:12:01 -0500 Subject: [PATCH 40/60] fix typo --- bwt/bwt.go | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/bwt/bwt.go b/bwt/bwt.go index 55e70d187..75e46c41c 100644 --- a/bwt/bwt.go +++ b/bwt/bwt.go @@ -187,7 +187,7 @@ type BWT struct { firstColumnSkipList []skipEntry // Column last column of the BWT- the actual textual representation // of the BWT. - lastCoulmn waveletTree + lastColumn waveletTree // suffixArray an array that allows us to map a position in the first // column to a position in the original sequence. This is needed to be // able to extract text from the BWT. @@ -280,8 +280,8 @@ func (bwt BWT) lfSearch(pattern string) interval { if !ok { return interval{} } - searchRange.start = skip.openEndedInterval.start + bwt.lastCoulmn.Rank(c, searchRange.start) - searchRange.end = skip.openEndedInterval.start + bwt.lastCoulmn.Rank(c, searchRange.end) + searchRange.start = skip.openEndedInterval.start + bwt.lastColumn.Rank(c, searchRange.start) + searchRange.end = skip.openEndedInterval.start + bwt.lastColumn.Rank(c, searchRange.end) } return searchRange } @@ -364,7 +364,7 @@ func New(sequence string) (BWT, error) { return BWT{ firstColumnSkipList: buildSkipList(prefixArray), - lastCoulmn: NewWaveletTreeFromString(lastColBuilder.String()), + lastColumn: NewWaveletTreeFromString(lastColBuilder.String()), suffixArray: suffixArray, }, nil } From 660dbdb31ee0151949b5fcb94b527780875cff0b Mon Sep 17 00:00:00 2001 From: Trenton w Fleming Date: Wed, 20 Dec 2023 23:12:57 -0500 Subject: [PATCH 41/60] Update bwt/bwt.go Typo Co-authored-by: Willow Carretero Chavez --- bwt/bwt.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bwt/bwt.go b/bwt/bwt.go index 
75e46c41c..5e0c37021 100644 --- a/bwt/bwt.go +++ b/bwt/bwt.go @@ -133,7 +133,7 @@ SA: [6 5 3 1 0 4 2] If we take our count example for the pattern "ana" above, you'll remember that our final search range was [1, 2). If we look up 1 in the SA, we'll -fund that there is only one offset at position 3 in the original sequence +find that there is only one offset at position 3 in the original sequence "banana" ## Notes on Performance From e4fef76fe97df0f0b20a40272a43b2c96be1105b Mon Sep 17 00:00:00 2001 From: Trenton w Fleming Date: Wed, 20 Dec 2023 23:17:05 -0500 Subject: [PATCH 42/60] Update bwt/wavelet.go doc correction Co-authored-by: Willow Carretero Chavez --- bwt/wavelet.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bwt/wavelet.go b/bwt/wavelet.go index 529b9de16..6ea3f4859 100644 --- a/bwt/wavelet.go +++ b/bwt/wavelet.go @@ -23,7 +23,7 @@ RSA stands for (R)ank, (S)elect, (A)ccess. One important component is a character's path encoding. Which character we are working with in a given path in the tree. 
-For example, given the alphabet A B C D E F G, a possible encoding is: +For example, given the alphabet A B C D E F G H, a possible encoding is: A: 000 B: 001 From dd29d3e3eb512afafc2204133973a24e7c01180a Mon Sep 17 00:00:00 2001 From: Trenton w Fleming Date: Wed, 20 Dec 2023 23:17:28 -0500 Subject: [PATCH 43/60] Update bwt/wavelet.go typo Co-authored-by: Willow Carretero Chavez --- bwt/wavelet.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bwt/wavelet.go b/bwt/wavelet.go index 6ea3f4859..10c33b41c 100644 --- a/bwt/wavelet.go +++ b/bwt/wavelet.go @@ -54,7 +54,7 @@ a: 00 n: 01 b: 10 s: 11 -and that 0 if left and 1 is right +and that 0 is left and 1 is right We can represent this tree with bitvectors: 0010101 From a69d8c54dbab1b16df17372904ba0895ed92c2de Mon Sep 17 00:00:00 2001 From: Trenton w Fleming Date: Wed, 20 Dec 2023 23:18:15 -0500 Subject: [PATCH 44/60] Update bwt/wavelet.go doc clarity Co-authored-by: Willow Carretero Chavez --- bwt/wavelet.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/bwt/wavelet.go b/bwt/wavelet.go index 10c33b41c..bcab8cde3 100644 --- a/bwt/wavelet.go +++ b/bwt/wavelet.go @@ -35,7 +35,8 @@ G: 110 H: 111 If we wanted to get to the leaf that represent the character D, we'd -take the path: +take the path that corresponds to the character's encoding, considering a 0 as choosing the left +child of a node and a 1 as choosing the right child of a node: root / From 63e10a4cb8135b4ad23397414854bcf850c7db2e Mon Sep 17 00:00:00 2001 From: Trenton w Fleming Date: Wed, 20 Dec 2023 23:19:57 -0500 Subject: [PATCH 45/60] Update bwt/wavelet.go Link to additional explanation for Wavelet Trees. Co-authored-by: Willow Carretero Chavez --- bwt/wavelet.go | 3 +++ 1 file changed, 3 insertions(+) diff --git a/bwt/wavelet.go b/bwt/wavelet.go index bcab8cde3..678ca6e26 100644 --- a/bwt/wavelet.go +++ b/bwt/wavelet.go @@ -19,6 +19,9 @@ The Wavelet Tree allows us to conduct RSA queries on strings. 
in a memory and run time efficient manner. RSA stands for (R)ank, (S)elect, (A)ccess. +See this blog post by Alex Bowe for an additional explanation: +https://www.alexbowe.com/wavelet-trees/ + ## The Character's Path Encoding One important component is a character's path encoding. From 9e02f5d9481e2199d0f588dac402b78d273f7d11 Mon Sep 17 00:00:00 2001 From: Trenton w Fleming Date: Wed, 20 Dec 2023 23:20:39 -0500 Subject: [PATCH 46/60] Update bwt/wavelet.go doc improvement Co-authored-by: Willow Carretero Chavez --- bwt/wavelet.go | 1 + 1 file changed, 1 insertion(+) diff --git a/bwt/wavelet.go b/bwt/wavelet.go index 678ca6e26..74428bb86 100644 --- a/bwt/wavelet.go +++ b/bwt/wavelet.go @@ -62,6 +62,7 @@ and that 0 is left and 1 is right We can represent this tree with bitvectors: 0010101 + bananas / \ 1000 001 / \ / \ From 92ed9dafe772a042c3bd63369514a59be0c7a2bf Mon Sep 17 00:00:00 2001 From: Trenton w Fleming Date: Wed, 20 Dec 2023 23:24:27 -0500 Subject: [PATCH 47/60] Update bwt/wavelet.go Doc improvement. Co-authored-by: Willow Carretero Chavez --- bwt/wavelet.go | 3 +++ 1 file changed, 3 insertions(+) diff --git a/bwt/wavelet.go b/bwt/wavelet.go index 74428bb86..baa8ee1a6 100644 --- a/bwt/wavelet.go +++ b/bwt/wavelet.go @@ -83,6 +83,9 @@ a characters path encoding and which character we'd like to Rank, Select, and Ac ### Rank Example +WaveletTree.Rank(c, n) returns the rank of character c at index n in a sequence, i.e. how many +times c has occurred in a sequence before index n. + To get WaveletTree.Rank(a, 4) of bananas where a's encoding is 00 1. root.Rank(0, 4) of 0010101 is 2 2. 
Visit Left Child From 08749e3f0eb2b99bdaf472e0eab7143471ba4b0f Mon Sep 17 00:00:00 2001 From: Trenton w Fleming Date: Wed, 20 Dec 2023 23:25:03 -0500 Subject: [PATCH 48/60] Update bwt/wavelet.go Doc Improvement Co-authored-by: Willow Carretero Chavez --- bwt/wavelet.go | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/bwt/wavelet.go b/bwt/wavelet.go index baa8ee1a6..c7c8488e8 100644 --- a/bwt/wavelet.go +++ b/bwt/wavelet.go @@ -76,6 +76,10 @@ If we translate each bit vector to its corresponding string, then it becomes: / \ / \ a b n s +Each node of the tree consists of a bitvector whose values indicate whether +the character at a particular index is in the left (0) or right (1) child of the +tree. + ## RSA Intuition From here you may be able to build some intuition as to how we can take RSA queries given From a4eb771cb2264313c178c1af52586133fa9a6002 Mon Sep 17 00:00:00 2001 From: Trenton Date: Wed, 20 Dec 2023 23:23:30 -0500 Subject: [PATCH 49/60] doc improvement --- bwt/wavelet.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bwt/wavelet.go b/bwt/wavelet.go index c7c8488e8..faa42ba0c 100644 --- a/bwt/wavelet.go +++ b/bwt/wavelet.go @@ -95,7 +95,7 @@ To get WaveletTree.Rank(a, 4) of bananas where a's encoding is 00 2. Visit Left Child 3. child.Rank(0, 2) of 1000 is 1 4. Visit Left Child -5. return 1 +5. 
We are at a leaf node, so return our last recorded rank: 1 ### Select Example From 0791c492888ac1ee0632152397d537ae950fed03 Mon Sep 17 00:00:00 2001 From: Trenton Date: Wed, 20 Dec 2023 23:31:26 -0500 Subject: [PATCH 50/60] doc improvement --- bwt/wavelet.go | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/bwt/wavelet.go b/bwt/wavelet.go index faa42ba0c..0dca8d977 100644 --- a/bwt/wavelet.go +++ b/bwt/wavelet.go @@ -109,9 +109,22 @@ To get WaveletTree.Select(n, 1) of bananas where n's encoding is 01 ### Access Example -If you've reached this point, then you must really be trying to understand how the -waveletTree works. I recommend thinking through how access could work with the example -above. HINT: rank might help. +Take the tree we constructed earlier to represent the sequence "bananas". + + 0010101 + / \ + 1000 001 + / \ / \ +a n b s + +To access the 4th character of the sequence, we would call WaveletTree.Access(3), +which performs the following operations: + +1. root[3] is 0 and root.Rank(0, 3) is 2 +2. Since root[3] is 0, visit left child +3. child[2] is 0 and child.Rank(0, 2) is 1 +4. Since child[2] is 0, visit left child +5. Left child is a leaf, so we've found our value (a)! NOTE: The waveletTree does not literally have to be a tree. There are other forms that it may exist in like the concatenation of order level representation of all its node's bitvectors... 
From e52ba57728cdf3484f01c756ef76616c253b9b6b Mon Sep 17 00:00:00 2001 From: Trenton Date: Thu, 21 Dec 2023 10:37:47 -0500 Subject: [PATCH 51/60] requested changes, fix edgcases, test edgecases --- bwt/bitvector_test.go | 7 ++ bwt/bwt.go | 53 ++++++++++++-- bwt/bwt_test.go | 142 +++++++++++++++++++++++++++----------- bwt/example_test.go | 58 ++++++++++++++++ bwt/rsa_bitvector_test.go | 4 ++ bwt/wavelet.go | 52 ++++++++++++-- bwt/wavelet_test.go | 76 ++++++++++++++++++-- 7 files changed, 336 insertions(+), 56 deletions(-) create mode 100644 bwt/example_test.go diff --git a/bwt/bitvector_test.go b/bwt/bitvector_test.go index c71003aab..44a1ac118 100644 --- a/bwt/bitvector_test.go +++ b/bwt/bitvector_test.go @@ -31,6 +31,8 @@ func TestBitVector(t *testing.T) { bv.setBit(42, false) bv.setBit(63, false) bv.setBit(64, false) + bv.setBit(255, false) + bv.setBit(256, false) getBitTestCases := []GetBitTestCase{ {0, true}, @@ -54,10 +56,15 @@ func TestBitVector(t *testing.T) { {62, true}, {63, false}, {64, false}, + // Test past the first word {65, true}, {72, true}, {79, true}, {80, true}, + {255, false}, + {256, false}, + {511, true}, + {512, true}, } for _, v := range getBitTestCases { diff --git a/bwt/bwt.go b/bwt/bwt.go index 5e0c37021..283bba61d 100644 --- a/bwt/bwt.go +++ b/bwt/bwt.go @@ -1,6 +1,7 @@ package bwt import ( + "errors" "fmt" "math" "strings" @@ -198,16 +199,24 @@ type BWT struct { // shows up in the original sequence. func (bwt BWT) Count(pattern string) (count int, err error) { defer bwtRecovery("Count", &err) + err = isValidPattern(pattern) + if err != nil { + return 0, err + } searchRange := bwt.lfSearch(pattern) return searchRange.end - searchRange.start, nil } -// Locate returns a list of offsets at which the begging +// Locate returns a list of offsets at which the beginning // of the provided pattern occurs in the original // sequence. 
func (bwt BWT) Locate(pattern string) (offsets []int, err error) { defer bwtRecovery("Locate", &err) + err = isValidPattern(pattern) + if err != nil { + return nil, err + } searchRange := bwt.lfSearch(pattern) if searchRange.start >= searchRange.end { @@ -225,11 +234,15 @@ func (bwt BWT) Locate(pattern string) (offsets []int, err error) { // Extract this allows us to extract parts of the original // sequence from the BWT. -// start is the begging of the range of text to extract inclusive. +// start is the beginning of the range of text to extract inclusive. // end is the end of the range of text to extract exclusive. // If either start or end are out of bounds, Extract will panic. func (bwt BWT) Extract(start, end int) (extracted string, err error) { defer bwtRecovery("Extract", &err) + err = validateRange(start, end) + if err != nil { + return "", err + } if end > bwt.getLenOfOriginalStringWithNullChar()-1 { return "", fmt.Errorf("end [%d] exceeds the max range of the BWT [%d]", end, bwt.getLenOfOriginalStringWithNullChar()-1) @@ -336,8 +349,9 @@ type skipEntry struct { // defined in this package. If it does, New will return // an error. func New(sequence string) (BWT, error) { - if strings.Contains(sequence, nullChar) { - return BWT{}, fmt.Errorf("Provided sequence contains the nullChar %s. 
BWT cannot be constructed", nullChar) + err := validateSequenceBeforeTransforming(&sequence) + if err != nil { + return BWT{}, err } sequence += nullChar @@ -362,9 +376,14 @@ func New(sequence string) (BWT, error) { fb.WriteByte(prefixArray[i][0]) } + wt, err := newWaveletTreeFromString(lastColBuilder.String()) + if err != nil { + return BWT{}, err + } + return BWT{ firstColumnSkipList: buildSkipList(prefixArray), - lastColumn: NewWaveletTreeFromString(lastColBuilder.String()), + lastColumn: wt, suffixArray: suffixArray, }, nil } @@ -425,3 +444,27 @@ func bwtRecovery(operation string, err *error) { *err = rErr } } + +func isValidPattern(s string) (err error) { + if len(s) == 0 { + return errors.New("Pattern can not be empty") + } + return nil +} + +func validateRange(start, end int) (err error) { + if start >= end { + return errors.New("Start must be strictly less than end") + } + return nil +} + +func validateSequenceBeforeTransforming(sequence *string) (err error) { + if len(*sequence) == 0 { + return fmt.Errorf("Provided sequence must not by empty. BWT cannot be constructed") + } + if strings.Contains(*sequence, nullChar) { + return fmt.Errorf("Provided sequence contains the nullChar %s. 
BWT cannot be constructed", nullChar) + } + return nil +} diff --git a/bwt/bwt_test.go b/bwt/bwt_test.go index cdaa6ecc3..c3fc7c163 100644 --- a/bwt/bwt_test.go +++ b/bwt/bwt_test.go @@ -1,30 +1,12 @@ package bwt import ( - "fmt" - "log" "strings" "testing" "golang.org/x/exp/slices" ) -func ExampleBWT_Count() { - inputSequence := "AACCTGCCGTCGGGGCTGCCCGTCGCGGGACGTCGAAACGTGGGGCGAAACGTG" - - bwt, err := New(inputSequence) - if err != nil { - log.Fatal(err) - } - - count, err := bwt.Count("CG") - if err != nil { - log.Fatal(err) - } - fmt.Println(count) - // Output: 10 -} - type BWTCountTestCase struct { seq string expected int @@ -49,7 +31,13 @@ func TestBWT_Count(t *testing.T) { {"na", 9}, {"rown", 6}, {"townthe", 2}, + + // patterns that should not exist {"zzz", 0}, + {"clown", 0}, + {"crown", 0}, + {"spark", 0}, + {"brawn", 0}, } for _, v := range testTable { @@ -63,21 +51,16 @@ func TestBWT_Count(t *testing.T) { } } -func ExampleBWT_Locate() { - inputSequence := "AACCTGCCGTCGGGGCTGCCCGTCGCGGGACGTCGAAACGTGGGGCGAAACGTG" - - bwt, err := New(inputSequence) +func TestBWT_Count_EmptyPattern(t *testing.T) { + testStr := "banana" + bwt, err := New(testStr) if err != nil { - log.Fatal(err) + t.Fatal(err) } - - offsets, err := bwt.Locate("CG") - if err != nil { - log.Fatal(err) + _, err = bwt.Count("") + if err == nil { + t.Fatal("Expected error for empty pattern but got nil") } - slices.Sort(offsets) - fmt.Println(offsets) - // Output: [7 10 20 23 25 30 33 38 45 50] } type BWTLocateTestCase struct { @@ -104,7 +87,13 @@ func TestBWT_Locate(t *testing.T) { {"na", []int{50, 88, 90, 163, 201, 203, 276, 314, 316}}, {"rown", []int{9, 47, 122, 160, 235, 273}}, {"townthe", []int{109, 222}}, + + // patterns that should not exist {"zzz", nil}, + {"clown", nil}, + {"crown", nil}, + {"spark", nil}, + {"brawn", nil}, } for _, v := range testTable { @@ -124,20 +113,16 @@ func TestBWT_Locate(t *testing.T) { } } -func ExampleBWT_Extract() { - inputSequence := 
"AACCTGCCGTCGGGGCTGCCCGTCGCGGGACGTCGAAACGTGGGGCGAAACGTG" - - bwt, err := New(inputSequence) +func TestBWT_Locate_EmptyPattern(t *testing.T) { + testStr := "banana" + bwt, err := New(testStr) if err != nil { - log.Fatal(err) + t.Fatal(err) } - - extracted, err := bwt.Extract(48, 54) - if err != nil { - log.Fatal(err) + _, err = bwt.Locate("") + if err == nil { + t.Fatal("Expected error for empty pattern but got nil") } - fmt.Println(extracted) - // Output: AACGTG } type BWTExtractTestCase struct { @@ -224,6 +209,22 @@ func TestBWT_Extract(t *testing.T) { } } +func TestBWT_Extract_InvalidRanges(t *testing.T) { + testStr := "banana" + bwt, err := New(testStr) + if err != nil { + t.Fatal(err) + } + _, err = bwt.Extract(5, 4) + if err == nil { + t.Fatal("Expected error for invalid range but got nil") + } + _, err = bwt.Extract(4, 4) + if err == nil { + t.Fatal("Expected error for invalid range but got nil") + } +} + func TestBWT_Extract_DoNotAllowExtractionOfLastNullChar(t *testing.T) { testStr := "banana" @@ -263,3 +264,62 @@ func TestBWT_Len(t *testing.T) { t.Fatalf("expected Len to be %d but got %d", len(testStr), bwt.Len()) } } + +func TestNewBWTWithSequenceContainingNullChar(t *testing.T) { + nc := nullChar + testStr := "banana" + nc + + _, err := New(testStr) + if err == nil { + t.Fatal("expected error but got nil") + } +} + +func TestNewBWTEmptySequence(t *testing.T) { + testStr := "" + + _, err := New(testStr) + if err == nil { + t.Fatal("expected error but got nil") + } +} + +// TestBWTReconstruction this helps us ensure that the LF mapping is correct and that the suffix array lookup +// must be well formed. Otherwise, we would not be able to recreate the original sequence. 
+func TestBWTReconstruction(t *testing.T) { + baseTestStr := "thequickbrownfoxjumpsoverthelazydogwithanovertfrownafterfumblingitsparallelogramshapedbananagramallarounddowntown" + testStr := strings.Join([]string{baseTestStr, baseTestStr, baseTestStr}, "") + + bwt, err := New(testStr) + if err != nil { + t.Fatal(err) + } + + extracted, err := bwt.Extract(0, bwt.Len()) + if err != nil { + t.Fatal(err) + } + if extracted != testStr { + t.Log("Reconstruction failed") + t.Log("Expected:\t", testStr) + t.Log("Actual:\t", extracted) + t.Fail() + } + + // This will either result in an even or all alphabet. The alphabet matters. + testStrWithOneMoreAlpha := testStr + "!" + bwt, err = New(testStrWithOneMoreAlpha) + if err != nil { + t.Fatal(err) + } + extracted, err = bwt.Extract(0, bwt.Len()) + if err != nil { + t.Fatal(err) + } + if extracted != testStrWithOneMoreAlpha { + t.Log("Reconstruction failed with extra alpha character") + t.Log("Expected:\t", testStr) + t.Log("Actual:\t", extracted) + t.Fail() + } +} diff --git a/bwt/example_test.go b/bwt/example_test.go new file mode 100644 index 000000000..7da37ef20 --- /dev/null +++ b/bwt/example_test.go @@ -0,0 +1,58 @@ +package bwt_test + +import ( + "fmt" + "log" + + "github.com/bebop/poly/bwt" + "golang.org/x/exp/slices" +) + +func ExampleBWT_Count() { + inputSequence := "AACCTGCCGTCGGGGCTGCCCGTCGCGGGACGTCGAAACGTGGGGCGAAACGTG" + + bwt, err := bwt.New(inputSequence) + if err != nil { + log.Fatal(err) + } + + count, err := bwt.Count("CG") + if err != nil { + log.Fatal(err) + } + fmt.Println(count) + // Output: 10 +} + +func ExampleBWT_Locate() { + inputSequence := "AACCTGCCGTCGGGGCTGCCCGTCGCGGGACGTCGAAACGTGGGGCGAAACGTG" + + bwt, err := bwt.New(inputSequence) + if err != nil { + log.Fatal(err) + } + + offsets, err := bwt.Locate("CG") + if err != nil { + log.Fatal(err) + } + slices.Sort(offsets) + fmt.Println(offsets) + // Output: [7 10 20 23 25 30 33 38 45 50] +} + +func ExampleBWT_Extract() { + inputSequence := 
"AACCTGCCGTCGGGGCTGCCCGTCGCGGGACGTCGAAACGTGGGGCGAAACGTG" + + bwt, err := bwt.New(inputSequence) + if err != nil { + log.Fatal(err) + } + + extracted, err := bwt.Extract(48, 54) + if err != nil { + log.Fatal(err) + } + fmt.Println(extracted) + // Output: AACGTG +} diff --git a/bwt/rsa_bitvector_test.go b/bwt/rsa_bitvector_test.go index 3c1b969a6..7275ac16a 100644 --- a/bwt/rsa_bitvector_test.go +++ b/bwt/rsa_bitvector_test.go @@ -181,6 +181,8 @@ func TestRSARank_multipleChunks(t *testing.T) { 0xffffffffffffffff, 0x0000000000000000, + // If Jacobson rank is still there, this should go past the first + // chunk 0xffffffffffffffff, 0x0000000000000000, 0xffffffffffffffff, @@ -201,6 +203,8 @@ func TestRSARank_multipleChunks(t *testing.T) { 0xffffffffffffffff, 0x0000000000000000, + // If Jacobson rank is still there, this should go past the second + // chunk 0xffffffffffffffff, 0x0000000000000000, 0xffffffffffffffff, diff --git a/bwt/wavelet.go b/bwt/wavelet.go index 0dca8d977..5b84f5afc 100644 --- a/bwt/wavelet.go +++ b/bwt/wavelet.go @@ -1,6 +1,7 @@ package bwt import ( + "errors" "fmt" "math" @@ -149,6 +150,10 @@ type waveletTree struct { // Access will return the ith character of the original // string used to build the waveletTree func (wt waveletTree) Access(i int) byte { + if wt.root.isLeaf() { + return *wt.root.char + } + curr := wt.root for !curr.isLeaf() { bit := curr.data.Access(i) @@ -159,12 +164,16 @@ func (wt waveletTree) Access(i int) byte { curr = curr.left } } - return curr.char + return *curr.char } // Rank allows us to get the rank of a specified character in // the original string func (wt waveletTree) Rank(char byte, i int) int { + if wt.root.isLeaf() { + return wt.root.data.Rank(true, i) + } + curr := wt.root ci := wt.lookupCharInfo(char) level := 0 @@ -186,6 +195,15 @@ func (wt waveletTree) Rank(char byte, i int) int { // Select allows us to get the corresponding position of a character // in the original string given its rank. 
func (wt waveletTree) Select(char byte, rank int) int { + if wt.root.isLeaf() { + s, ok := wt.root.data.Select(true, rank) + if !ok { + msg := fmt.Sprintf("could not find a corresponding bit for node.Select(true, %d) root as leaf node", rank) + panic(msg) + } + return s + } + curr := wt.root ci := wt.lookupCharInfo(char) level := 0 @@ -227,14 +245,14 @@ func (wt waveletTree) lookupCharInfo(char byte) charInfo { type node struct { data rsaBitVector - char byte + char *byte parent *node left *node right *node } func (n node) isLeaf() bool { - return n.char != 0 + return n.char != nil } type charInfo struct { @@ -243,16 +261,31 @@ type charInfo struct { path bitvector } -func NewWaveletTreeFromString(str string) waveletTree { +func newWaveletTreeFromString(str string) (waveletTree, error) { + err := validateWaveletTreeBuildInput(&str) + if err != nil { + return waveletTree{}, err + } + bytes := []byte(str) alpha := getCharInfoDescByRank(bytes) root := buildWaveletTree(0, alpha, bytes) + // Handle the case where the provided sequence only has an alphabet + // of size 1 + if root.isLeaf() { + bv := newBitVector(len(bytes)) + for i := 0; i < bv.len(); i++ { + bv.setBit(i, true) + } + root.data = newRSABitVectorFromBitVector(bv) + } + return waveletTree{ root: root, alpha: alpha, - } + }, nil } func buildWaveletTree(currentLevel int, alpha []charInfo, bytes []byte) *node { @@ -261,7 +294,7 @@ func buildWaveletTree(currentLevel int, alpha []charInfo, bytes []byte) *node { } if len(alpha) == 1 { - return &node{char: alpha[0].char} + return &node{char: &alpha[0].char} } leftAlpha, rightAlpha := partitionAlpha(currentLevel, alpha) @@ -379,3 +412,10 @@ func encodeCharPathIntoBitVector(bv bitvector, n uint64) { func getTreeHeight(alpha []charInfo) int { return int(math.Log2(float64(len(alpha)))) + 1 } + +func validateWaveletTreeBuildInput(sequence *string) error { + if len(*sequence) == 0 { + return errors.New("Sequence can not be empty") + } + return nil +} diff --git 
a/bwt/wavelet_test.go b/bwt/wavelet_test.go index e6d00745c..039ae7002 100644 --- a/bwt/wavelet_test.go +++ b/bwt/wavelet_test.go @@ -12,7 +12,10 @@ type WaveletTreeAccessTestCase struct { func TestWaveletTree_Access(t *testing.T) { testStr := "AAAACCCCTTTTGGGG" + "ACTG" + "TGCA" + "TTAA" + "CCGG" + "GGGGTTTTCCCCAAAA" - wt := NewWaveletTreeFromString(testStr) + wt, err := newWaveletTreeFromString(testStr) + if err != nil { + t.Fatal(err) + } testCases := []WaveletTreeAccessTestCase{ {0, "A"}, @@ -74,7 +77,10 @@ type WaveletTreeRankTestCase struct { func TestWaveletTree_Rank_Genomic(t *testing.T) { testStr := "AAAACCCCTTTTGGGG" + "ACTG" + "TGCA" + "TTAA" + "CCGG" + "GGGGTTTTCCCCAAAA" - wt := NewWaveletTreeFromString(testStr) + wt, err := newWaveletTreeFromString(testStr) + if err != nil { + t.Fatal(err) + } testCases := []WaveletTreeRankTestCase{ {"A", 0, 0}, @@ -129,7 +135,10 @@ type WaveletTreeSelectTestCase struct { func TestWaveletTree_Select(t *testing.T) { testStr := "AAAACCCCTTTTGGGG" + "ACTG" + "TGCA" + "TTAA" + "CCGG" + "GGGGTTTTCCCCAAAA" - wt := NewWaveletTreeFromString(testStr) + wt, err := newWaveletTreeFromString(testStr) + if err != nil { + t.Fatal(err) + } testCases := []WaveletTreeSelectTestCase{ {"A", 0, 0}, @@ -169,9 +178,14 @@ func TestWaveletTree_Select(t *testing.T) { } } +// TestWaveletTree_Access_Reconstruction these tests are to ensure that the wavelet tree is formed correctly. If we can reconstruct the string, we can be +// fairly confident that the WaveletTree is well formed. 
func TestWaveletTree_Access_Reconstruction(t *testing.T) { + // Build with a fair sized alphabet enhancedQuickBrownFox := "the quick brown fox jumps over the lazy dog with an overt frown after fumbling its parallelogram shaped bananagram all around downtown" enhancedQuickBrownFoxRepeated := strings.Join([]string{enhancedQuickBrownFox, enhancedQuickBrownFox, enhancedQuickBrownFox, enhancedQuickBrownFox, enhancedQuickBrownFox}, " ") + // Make it very large to account for any succinct data structures being used under the hood. For example, this helped uncover and + // diagnose issues with the Jacobson's Rank used under the hood. enhancedQuickBrownFoxSuperLarge := "" for i := 0; i < 100; i++ { enhancedQuickBrownFoxSuperLarge += enhancedQuickBrownFoxRepeated @@ -179,13 +193,17 @@ func TestWaveletTree_Access_Reconstruction(t *testing.T) { testCases := []string{ "the quick brown fox jumped over the lazy dog", + "the quick brown fox jumped over the lazy dog!", // odd numbered alphabet enhancedQuickBrownFox, enhancedQuickBrownFoxRepeated, enhancedQuickBrownFoxSuperLarge, } for _, str := range testCases { - wt := NewWaveletTreeFromString(str) + wt, err := newWaveletTreeFromString(str) + if err != nil { + t.Fatal(err) + } actual := "" for i := 0; i < len(str); i++ { actual += string(wt.Access(i)) @@ -195,3 +213,53 @@ func TestWaveletTree_Access_Reconstruction(t *testing.T) { } } } + +func TestWaveletTreeEmptyStr(t *testing.T) { + str := "" + _, err := newWaveletTreeFromString(str) + if err == nil { + t.Fatal("expected error but got nil") + } +} + +func TestWaveletTreeSingleChar(t *testing.T) { + char := "l" + wt, err := newWaveletTreeFromString(char) + if err != nil { + t.Fatal(err) + } + r := wt.Rank(char[0], 1) + s := wt.Select(char[0], 0) + a := wt.Access(0) + + if r != 1 { + t.Fatalf("expected Rank(%s, %d) to be %d but got %d", char, 1, 1, r) + } + if s != 0 { + t.Fatalf("expected Select(%s, %d) to be %d but got %d", char, 0, 0, s) + } + if a != char[0] { +
t.Fatalf("expected Access(%d) to be %d but got %d", 1, 1, s) + } +} + +func TestWaveletTreeSingleAlpha(t *testing.T) { + str := "lll" + wt, err := newWaveletTreeFromString(str) + if err != nil { + t.Fatal(err) + } + r := wt.Rank(str[0], 1) + s := wt.Select(str[0], 1) + a := wt.Access(0) + + if r != 1 { + t.Fatalf("expected Rank(%s, %d) to be %d but got %d", str, 1, 1, r) + } + if s != 1 { + t.Fatalf("expected Select(%s, %d) to be %d but got %d", str, 1, 1, s) + } + if a != str[0] { + t.Fatalf("expected Access(%d) to be %d but got %d", 1, 1, s) + } +} From 6269764045098bd0e59b6a26b9586de54f287df8 Mon Sep 17 00:00:00 2001 From: Trenton Date: Thu, 21 Dec 2023 22:47:35 -0500 Subject: [PATCH 52/60] Fix BWT Locate explanation. Typos and English. --- bwt/bwt.go | 70 +++++++++++++++++++++++++++++------------------------- 1 file changed, 38 insertions(+), 32 deletions(-) diff --git a/bwt/bwt.go b/bwt/bwt.go index 283bba61d..10a6fba59 100644 --- a/bwt/bwt.go +++ b/bwt/bwt.go @@ -11,13 +11,13 @@ import ( /* -For the BWT usage, please read the its -method documentation. To understand what it is and how +For the BWT usage, please read the BWT methods +below. To understand what it is and how it works for either curiosity or maintenance, then read below. # BWT -BWT Stand for (B)urrow (W)heeler (T)ransform. The BWT aids in +BWT Stands for (B)urrows-(W)heeler (T)ransform. The BWT aids in text compression and acts as a search index for any arbitrary sequence of characters. With the BWT and some auxiliary data structures, we can analyze a sequence in a memory and run time @@ -75,8 +75,7 @@ To best way to show this is to rebuild the original sequence using the F and L columns. We do this by rebuilding the original string in reverse order starting with the nullChar. -Original string: ______$0 - +Original string: ______$0 F($0) -> L(a0) -> _____a0$0 F(a0) -> L(n0) -> ____n0a0$0 F(n0) -> L(a1) -> ___a1n0a0$0 @@ -101,18 +100,18 @@ pattern "ana" in "banana". We can do this by: 2. 
Find that range of a's, [1, 4) 3. Take the next previous character in the pattern, n 4. Find the rank of n before the range from 2. [0, 1) = 0 -5. Find the rank of n in the range from 2. [1, 4) = 1 +5. Find the rank of n in the range from 2. [1, 4) = 2 6. Look up the start range of the n's in the F column, 5 7. Add the result from 4 and 5 respectively to form the next - L search range: [5+0, 5+1) = [5, 6) + L search range: [5+0, 5+2) = [5, 7) 8. Take next previous character in the pattern, a -9. Take the rank of "a" before, 0 -10. Take the rank of "a" within, 1 +9. Take the rank of "a" before position 5, which is 1 +10. Take the rank of "a" before position 7, which is 3 11. Lookup the a's in the F column again, but add the results from 9 and 10 to the start range to get the next search - range = [1+0, 1+1) = [1, 2) -12. That is beginning of out pattern, we sub subtract the end and start - of the search range to get out count, 2-1=1 + range = [1+1, 1+3) = [2, 4) +12. That is the beginning of our pattern, so we subtract the start of the + search range from the end to get our count, 4-2=2 Another way to look at this is that we are constantly refining our search range for each character of the pattern we are searching for. Once we @@ -123,19 +122,21 @@ has collapsed and we can conclude that there is no matching pattern. ## Suffix Array For other operations such as Locate and Extract, we need another auxiliary -data structure, the suffix array. Since we could be at multiple points -within the original sequence and at any point within that sequence, we need -some kind of point of reference of where we are. We can do this by storing -the position of each original character for each of the corresponding -characters in the F column. With our banana example: +data structure, the suffix array. Since rows of the BWT can map to any +position within the original sequence, we need some kind of reference as to +which BWT rows map to which positions in the original sequence.
We can do this by storing +the positions of each character from the original sequence to each of the corresponding +rows in the BWT column. With our banana example: F: $0 a0 a1 a2 b0 n0 n1 SA: [6 5 3 1 0 4 2] If we take our count example for the pattern "ana" above, you'll remember -that our final search range was [1, 2). If we look up 1 in the SA, we'll -find that there is only one offset at position 3 in the original sequence -"banana" +that our final search range was [2, 4). You'll also remember that we counted +2 occurrences of "ana" by subtracting the start of the range from the end, 4-2=2. +If we iterate from 2 to 4, we can look up the corresponding SA entry for the BWT rows 2 and 3. +If we look up 2 in the SA, we'll find that our first offset is at position 3 in the original sequence ban"ana" +If we look up 3 in the SA, we'll find that our second offset is at position 1 in the original sequence b"ana"na ## Notes on Performance @@ -145,12 +146,12 @@ int64, that would 8 times the amount of memory of the BWT in its plain text representation! In the implementation below, we may instead sample the SA and do additional look ups as needed to find the offsets we need. -Similarly, storing the F and L column as plain text has just doubled the -amount of memory from the original sequence... BWT is used for text +Similarly, storing both the F and L column as plain text would take double the +amount of memory to store the original sequence... BWT is used for text compression, not expansion! That's why in the below implementation, you -will see other data structures to actually compress the amount of memory +will see other data structures that lower the amount of memory needed. You will also notice that we can make huge improvements by -compressing sequences like with the F column. +compressing sequences by runs of characters like with the F column.
Instead of: @@ -160,22 +161,21 @@ Since F is lexicographically sorted, we can have: F: {$: [0, 1)}, {a: [1, 4)}, {b: [4, 5)} {n: [5, 7)} -Although these performance enhancements may look different from what is -described above, it is still just an FL mapping at the end of the day- just -with more steps. +Although these performance enhancements may lead to a different implementation to what is +described above, any implementation will just be an LF mapping- just with a few more steps. NOTE: The above is just to explain what is happening at a high level. Please reference the implementation below to see how the BWT is actually currently working -Many of the Idea's come from Ben Langmead. +Many of the Ideas come from Ben Langmead. He has a whole YouTube playlist about BWT Indexing: https://www.youtube.com/watch?v=5G2Db41pSHE&list=PL2mpR0RYFQsADmYpW2YWBrXJZ_6EL_3nu */ const nullChar = "$" -// BWT Burrow Wheeler Transform +// BWT Burrows-Wheeler Transform // Compresses and Indexes a given sequence so that it can be // be used for search, alignment, and text extraction. This is // useful for sequences so large that it would be beneficial @@ -269,6 +269,8 @@ func (bwt BWT) Len() int { // getFCharPosFromOriginalSequenceCharPos looks up mapping from the original position // of the sequence to its corresponding position in the First Column of the BWT +// NOTE: This clearly isn't ideal. Instead of improving this implementation, this will be replaced with +// something like r-index in the near future. 
func (bwt BWT) getFCharPosFromOriginalSequenceCharPos(originalPos int) int { for i := range bwt.suffixArray { if bwt.suffixArray[i] == originalPos { @@ -407,9 +409,13 @@ func buildSkipList(prefixArray []string) []skipEntry { return skipList } -// getBWTIndex returns the position of the character from the sequence used to build the BWT -// that corresponds the last character that would exist in the entry of the prefixArray that -// would be the last character if we were actually doing full rotations +// getBWTIndex helps us calculate the corresponding character that would +// be in the L column without having to rotate the full string. +// For example: +// Original string: banana$ +// Rotation: ana$___ +// Position: 7-4-1= 2 +// Original[2]: n func getBWTIndex(lenOfSequenceBeingBuilt, lenOfSuffixArrayVisited int) int { bwtCharIndex := lenOfSequenceBeingBuilt - lenOfSuffixArrayVisited - 1 if bwtCharIndex == -1 { From 307d8817139baf6f86cf971394e2967b0afc6f5e Mon Sep 17 00:00:00 2001 From: Trenton Date: Thu, 21 Dec 2023 23:42:22 -0500 Subject: [PATCH 53/60] fix wavelet tree example and docs --- bwt/wavelet.go | 36 +++++++++++++++++++----------------- 1 file changed, 19 insertions(+), 17 deletions(-) diff --git a/bwt/wavelet.go b/bwt/wavelet.go index 5b84f5afc..a100df8fd 100644 --- a/bwt/wavelet.go +++ b/bwt/wavelet.go @@ -10,13 +10,13 @@ import ( /* -For the waveletTree's usage, please read the its +For the waveletTree's usage, please read its method documentation. To understand what it is and how it works for either curiosity or maintenance, then read below. # WaveletTree -The Wavelet Tree allows us to conduct RSA queries on strings. in +The Wavelet Tree allows us to conduct RSA queries on strings in a memory and run time efficient manner. RSA stands for (R)ank, (S)elect, (A)ccess. @@ -25,8 +25,9 @@ https://www.alexbowe.com/wavelet-trees/ ## The Character's Path Encoding -One important component is a character's path encoding.
-Which character we are working with in a given path in the tree. +Each character from a sequence's alphabet will be assigned a path. +This path encoding represents a path from the Wavelet Tree's root to some +leaf node that represents a character. For example, given the alphabet A B C D E F G H, a possible encoding is: A: 000 @@ -38,9 +39,10 @@ F: 101 G: 110 H: 111 -If we wanted to get to the leaf that represent the character D, we'd -take the path that corresponds to the character's encoding, considering a 0 as choosing the left -child of a node and a 1 as choosing the right child of a node: +If we wanted to get to the leaf that represents the character D, we'd have +to use D's path encoding to traverse the tree. +Consider 0 as the left and 1 as the right. +If we follow D's encoding, 011, then we'd take a path that looks like: root / @@ -66,6 +68,7 @@ We can represent this tree with bitvectors: bananas / \ 1000 001 + baaa nns / \ / \ a n b s @@ -81,10 +84,9 @@ Each node of the tree consists of a bitvector whose values indicate whether the character at a particular index is in the left (0) or right (1) child of the tree. -## RSA Intuition +## RSA -From here you may be able to build some intuition as to how we can take RSA queries given -a characters path encoding and which character we'd like to Rank, Select, and Access. +At this point, we can talk about RSA. RSA stands for (R)ank, (S)elect, (A)ccess. ### Rank Example @@ -92,21 +94,21 @@ WaveletTree.Rank(c, n) returns the rank of character c at index n in a sequence, times c has occurred in a sequence before index n. To get WaveletTree.Rank(a, 4) of bananas where a's encoding is 00 -1. root.Rank(0, 4) of 0010101 is 2 +1. root.Rank(0, 4) of 0010101 is 3 2. Visit Left Child -3. child.Rank(0, 2) of 1000 is 1 +3. child.Rank(0, 3) of 1000 is 2 4. Visit Left Child -5. We are at a leaf node, so return our last recorded rank: 1 +5. 
We are at a leaf node, so return our last recorded rank: 2 ### Select Example To get WaveletTree.Select(n, 1) of bananas where n's encoding is 01 1. Go down to n's leaf using the path encoding is 01 2. Go back to n's leaf's parent -3. parent.Select(0, 1) of 001 is 1 +3. parent.Select(0, 1) of 001 is 0 4. Go to the next parent -5. parent.Select(1, 1) of 0010101 is 4 -6. return 4 since we are at the root. +5. parent.Select(1, 0) of 0010101 is 2 +6. return 2 since we are at the root. ### Access Example @@ -342,7 +344,7 @@ func isInAlpha(alpha []charInfo, b byte) bool { } // partitionAlpha partitions the alphabet in half based on whether its corresponding path bit -// is a 0 or 1. 0 with comprise the left tree while 1 will comprise the right. The alphabet +// is a 0 or 1. 0 will comprise the left tree while 1 will comprise the right. The alphabet // should be sorted in such a way that we remove the most amount of characters nearest to the // root of the tree to reduce the memory footprint as much as possible. func partitionAlpha(currentLevel int, alpha []charInfo) (left []charInfo, right []charInfo) { From b3f27f4a43425761c9d82299f2cc942714fe3faf Mon Sep 17 00:00:00 2001 From: Trenton Date: Thu, 21 Dec 2023 23:52:46 -0500 Subject: [PATCH 54/60] fix rsa docs and provide better examples --- bwt/rsa_bitvector.go | 33 ++++++++++++++++++++------------- 1 file changed, 20 insertions(+), 13 deletions(-) diff --git a/bwt/rsa_bitvector.go b/bwt/rsa_bitvector.go index aaa543ed8..0cb9da468 100644 --- a/bwt/rsa_bitvector.go +++ b/bwt/rsa_bitvector.go @@ -4,6 +4,8 @@ import "math/bits" // rsaBitVector allows us to perform RSA: (R)ank, (S)elect, and (A)ccess // queries in a memory performant and memory compact way. +// To learn about how Rank, Select, and Access work, take a look at the +// examples in each respective method. 
type rsaBitVector struct { bv bitvector totalOnesRank int @@ -36,11 +38,14 @@ func newRSABitVectorFromBitVector(bv bitvector) rsaBitVector { } // Rank returns the rank of the given value up to, but not including -// the ith bit. We count Rank starting a 0. +// the ith bit. // For Example: // Given the bitvector 001000100001 -// Rank(true, 8) = 1 -// Rank(false, 8) = 5 +// Rank(true, 1) = 0 +// Rank(true, 2) = 0 +// Rank(true, 3) = 1 +// Rank(true, 8) = 2 +// Rank(false, 8) = 6 func (rsa rsaBitVector) Rank(val bool, i int) int { if i > rsa.bv.len()-1 { if val { @@ -70,11 +75,13 @@ func (rsa rsaBitVector) Rank(val bool, i int) int { return (chunkPos*rsa.jrBitsPerChunk - chunk.onesCumulativeRank) + (subChunkPos*rsa.jrBitsPerSubChunk - subChunk.onesCumulativeRank) + bits.OnesCount64(remaining) } -// Select returns the the position of the given value of a specified Rank +// Select returns the position of the given value with the provided Rank // For Example: // Given the bitvector 001000100001 -// Select(true, 1) = 6 -// Rank(false, 5) = 7 +// Select(true, 1) = 2 +// Rank(false, 5) = 5 +// Rank(false, 1) = 1 +// Rank(false, 0) = 0 func (rsa rsaBitVector) Select(val bool, rank int) (i int, ok bool) { if val { i, ok := rsa.oneSelectMap[rank] @@ -103,24 +110,24 @@ type subChunk struct { buildJacobsonRank Jacobson rank is a succinct data structure. This allows us to represent something normally would require O(N) worth of memory with less that N memory. Jacobson Rank allows for sub linear growth. Jacobson rank also allows us to lookup rank for some value of a bitvector in O(1) -time. Theoretically, Jacobson Rank tells us to: -1. Create log(N) "Chunks" -2. Create 2log(N) "Sub Chunks" -3. Have "Sub Chunks" be 0.5log(N) in length +time. Theoretically, Jacobson Rank Requires: +1. Creating log(N) "Chunks" +2. Creating 2log(N) "Sub Chunks" +3. Having "Sub Chunks" be 0.5log(N) in length 4. For each "Chunk", store the cumulative rank of set bits relative to the overall bitvector 5. 
For each "Sub Chunk", store the cumulative rank of set bits relative to the parent "Chunk" 6. We can One's count the N bit word if possible. We will only consider this possibility :) -For simplicity and all around good results, we just have "Sub Chunks" of size 64 bits. +For simplicity and all around decent results, we just have "Sub Chunks" of size 64 bits. It is O(1) because given some offset i, all we have to do is calculate rank is: rank = CumulativeRank(ChunkOfi(i))) + CumulativeRank(SubChunkOfi(i))) + OnesCount(SubChunkOfi(i)) To understand why it is sub linear in space, you can refer to Ben Langmead and other literature that -describes this complexity. +describes the space complexity. +https://www.youtube.com/watch?v=M1sUZxXVjG8&list=PL2mpR0RYFQsADmYpW2YWBrXJZ_6EL_3nu&index=7 */ func buildJacobsonRank(inBv bitvector) (jacobsonRankChunks []chunk, numOfSubChunksPerChunk, numOfBitsPerSubChunk, totalRank int) { - // TODO: talk about why this is probably good enough, improves as n grows, gets worse as n gets smaller, and how this fits into a machine instruction, and how this is "simple" numOfSubChunksPerChunk = 4 totalRank = 0 From 3e834ddf903886db2afb6cf6b14ff032478148e3 Mon Sep 17 00:00:00 2001 From: Trenton Date: Fri, 22 Dec 2023 00:38:09 -0500 Subject: [PATCH 55/60] Fix select. Problems appeared when it started actually getting used in another branch --- bwt/rsa_bitvector.go | 22 ++++++++-------------- bwt/rsa_bitvector_test.go | 20 ++++++++++++++------ 2 files changed, 22 insertions(+), 20 deletions(-) diff --git a/bwt/rsa_bitvector.go b/bwt/rsa_bitvector.go index 0cb9da468..7dd2d0843 100644 --- a/bwt/rsa_bitvector.go +++ b/bwt/rsa_bitvector.go @@ -8,7 +8,6 @@ import "math/bits" // examples in each respective method. type rsaBitVector struct { bv bitvector - totalOnesRank int jrc []chunk jrSubChunksPerChunk int jrBitsPerChunk int @@ -22,12 +21,11 @@ type rsaBitVector struct { // WARNING: Do not modify the underlying bitvector. 
The rsaBitvector will // get out of sync with the original bitvector. func newRSABitVectorFromBitVector(bv bitvector) rsaBitVector { - jacobsonRankChunks, jrSubChunksPerChunk, jrBitsPerSubChunk, totalOnesRank := buildJacobsonRank(bv) + jacobsonRankChunks, jrSubChunksPerChunk, jrBitsPerSubChunk := buildJacobsonRank(bv) ones, zeros := buildSelectMaps(bv) return rsaBitVector{ bv: bv, - totalOnesRank: totalOnesRank, jrc: jacobsonRankChunks, jrSubChunksPerChunk: jrSubChunksPerChunk, jrBitsPerChunk: jrSubChunksPerChunk * jrBitsPerSubChunk, @@ -47,13 +45,6 @@ func newRSABitVectorFromBitVector(bv bitvector) rsaBitVector { // Rank(true, 8) = 2 // Rank(false, 8) = 6 func (rsa rsaBitVector) Rank(val bool, i int) int { - if i > rsa.bv.len()-1 { - if val { - return rsa.totalOnesRank - } - return rsa.bv.len() - rsa.totalOnesRank - } - chunkPos := (i / rsa.jrBitsPerChunk) chunk := rsa.jrc[chunkPos] @@ -127,10 +118,9 @@ To understand why it is sub linear in space, you can refer to Ben Langmead and o describes the space complexity. https://www.youtube.com/watch?v=M1sUZxXVjG8&list=PL2mpR0RYFQsADmYpW2YWBrXJZ_6EL_3nu&index=7 */ -func buildJacobsonRank(inBv bitvector) (jacobsonRankChunks []chunk, numOfSubChunksPerChunk, numOfBitsPerSubChunk, totalRank int) { +func buildJacobsonRank(inBv bitvector) (jacobsonRankChunks []chunk, numOfSubChunksPerChunk, numOfBitsPerSubChunk int) { numOfSubChunksPerChunk = 4 - totalRank = 0 chunkCumulativeRank := 0 subChunkCumulativeRank := 0 @@ -153,7 +143,6 @@ func buildJacobsonRank(inBv bitvector) (jacobsonRankChunks []chunk, numOfSubChun onesCount := bits.OnesCount64(inBv.getBitSet(i)) subChunkCumulativeRank += onesCount - totalRank += onesCount } if currSubChunks != nil { @@ -163,7 +152,7 @@ func buildJacobsonRank(inBv bitvector) (jacobsonRankChunks []chunk, numOfSubChun }) } - return jacobsonRankChunks, numOfSubChunksPerChunk, wordSize, totalRank + return jacobsonRankChunks, numOfSubChunksPerChunk, wordSize } // This is not good. 
We should find a better means of select- like Clark's Select @@ -183,5 +172,10 @@ func buildSelectMaps(inBv bitvector) (oneSelectMap, zeroSelectMap map[int]int) { } } + // Account for the case where we need to find the + // position for the max rank for both 0's and 1's + oneSelectMap[oneCount] = inBv.len() + zeroSelectMap[zeroCount] = inBv.len() + return oneSelectMap, zeroSelectMap } diff --git a/bwt/rsa_bitvector_test.go b/bwt/rsa_bitvector_test.go index 7275ac16a..d09a9eb2b 100644 --- a/bwt/rsa_bitvector_test.go +++ b/bwt/rsa_bitvector_test.go @@ -254,10 +254,10 @@ func TestRSASelect(t *testing.T) { bitsToTruncate := 17 initialNumberOfBits := wordSize*4 - bitsToTruncate rsa := newTestRSAFromWords(initialNumberOfBits, - 0x8010000000010000, - 0xfff1ffffffffffff, - 0x0000010000000000, - 0xffffffffffffffff, + 0x8010000000010000, // 1Count = 3 + 0xfff1ffffffffffff, // 1Count = 63 + 0x0000010000000000, // 1Count = 1 + 0xffffffffffffffff, // Possible 1Count = 47 ) testCases := []rsaSelectTestCase{ @@ -285,11 +285,19 @@ func TestRSASelect(t *testing.T) { {false, 63, 78}, {true, 64, 151}, + {true, 65, 192}, + {true, 111, 238}, {false, 64, 128}, + {false, 126, 191}, - {true, 65, 192}, - {true, 111, 238}, + // Select of penultimate ranks should be the positions at which they appear. 
+ {true, 111, rsa.bv.len() - 1}, + {false, 126, 191}, + + // Max bitvector positions for the max rank should be at the ends of the bitvector + {true, 112, rsa.bv.len()}, + {false, 127, rsa.bv.len()}, } for _, tc := range testCases { From ff74a1effc9d5c60c29380b1d3be00c989ccd4e4 Mon Sep 17 00:00:00 2001 From: Trenton Date: Fri, 22 Dec 2023 00:46:31 -0500 Subject: [PATCH 56/60] put back the shortcut for rank of char at max position in bv --- bwt/rsa_bitvector.go | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/bwt/rsa_bitvector.go b/bwt/rsa_bitvector.go index 7dd2d0843..fc2e9ceb8 100644 --- a/bwt/rsa_bitvector.go +++ b/bwt/rsa_bitvector.go @@ -8,6 +8,7 @@ import "math/bits" // examples in each respective method. type rsaBitVector struct { bv bitvector + totalOnesRank int jrc []chunk jrSubChunksPerChunk int jrBitsPerChunk int @@ -21,11 +22,12 @@ type rsaBitVector struct { // WARNING: Do not modify the underlying bitvector. The rsaBitvector will // get out of sync with the original bitvector. func newRSABitVectorFromBitVector(bv bitvector) rsaBitVector { - jacobsonRankChunks, jrSubChunksPerChunk, jrBitsPerSubChunk := buildJacobsonRank(bv) + jacobsonRankChunks, jrSubChunksPerChunk, jrBitsPerSubChunk, totalOnesRank := buildJacobsonRank(bv) ones, zeros := buildSelectMaps(bv) return rsaBitVector{ bv: bv, + totalOnesRank: totalOnesRank, jrc: jacobsonRankChunks, jrSubChunksPerChunk: jrSubChunksPerChunk, jrBitsPerChunk: jrSubChunksPerChunk * jrBitsPerSubChunk, @@ -45,6 +47,13 @@ func newRSABitVectorFromBitVector(bv bitvector) rsaBitVector { // Rank(true, 8) = 2 // Rank(false, 8) = 6 func (rsa rsaBitVector) Rank(val bool, i int) int { + if i == rsa.bv.len() { + if val { + return rsa.totalOnesRank + } + return rsa.bv.len() - rsa.totalOnesRank + } + chunkPos := (i / rsa.jrBitsPerChunk) chunk := rsa.jrc[chunkPos] @@ -118,9 +127,10 @@ To understand why it is sub linear in space, you can refer to Ben Langmead and o describes the space complexity. 
https://www.youtube.com/watch?v=M1sUZxXVjG8&list=PL2mpR0RYFQsADmYpW2YWBrXJZ_6EL_3nu&index=7 */ -func buildJacobsonRank(inBv bitvector) (jacobsonRankChunks []chunk, numOfSubChunksPerChunk, numOfBitsPerSubChunk int) { +func buildJacobsonRank(inBv bitvector) (jacobsonRankChunks []chunk, numOfSubChunksPerChunk, numOfBitsPerSubChunk, totalRank int) { numOfSubChunksPerChunk = 4 + totalRank = 0 chunkCumulativeRank := 0 subChunkCumulativeRank := 0 @@ -143,6 +153,7 @@ func buildJacobsonRank(inBv bitvector) (jacobsonRankChunks []chunk, numOfSubChun onesCount := bits.OnesCount64(inBv.getBitSet(i)) subChunkCumulativeRank += onesCount + totalRank += onesCount } if currSubChunks != nil { @@ -152,7 +163,7 @@ func buildJacobsonRank(inBv bitvector) (jacobsonRankChunks []chunk, numOfSubChun }) } - return jacobsonRankChunks, numOfSubChunksPerChunk, wordSize + return jacobsonRankChunks, numOfSubChunksPerChunk, wordSize, totalRank } // This is not good. We should find a better means of select- like Clark's Select From 69c50893e4f9c92d67858387d96d7cc159470d7e Mon Sep 17 00:00:00 2001 From: Trenton Date: Thu, 28 Dec 2023 21:15:28 -0500 Subject: [PATCH 57/60] wt reconstruct and bwt GetTransform with example --- bwt/bwt.go | 5 +++++ bwt/example_test.go | 12 ++++++++++++ bwt/wavelet.go | 18 ++++++++++++++---- bwt/wavelet_test.go | 5 +---- 4 files changed, 32 insertions(+), 8 deletions(-) diff --git a/bwt/bwt.go b/bwt/bwt.go index 10a6fba59..577bb485c 100644 --- a/bwt/bwt.go +++ b/bwt/bwt.go @@ -267,6 +267,11 @@ func (bwt BWT) Len() int { return bwt.getLenOfOriginalStringWithNullChar() - 1 } +// GetTransform returns the last column of the BWT transform of the original sequence. +func (bwt BWT) GetTransform() string { + return bwt.lastColumn.reconstruct() +} + // getFCharPosFromOriginalSequenceCharPos looks up mapping from the original position // of the sequence to its corresponding position in the First Column of the BWT // NOTE: This clearly isn't ideal. 
Instead of improving this implementation, this will be replaced with diff --git a/bwt/example_test.go b/bwt/example_test.go index 7da37ef20..40b631066 100644 --- a/bwt/example_test.go +++ b/bwt/example_test.go @@ -56,3 +56,15 @@ func ExampleBWT_Extract() { fmt.Println(extracted) // Output: AACGTG } + +func ExampleBWT_GetTransform() { + inputSequence := "banana" + + bwt, err := bwt.New(inputSequence) + if err != nil { + log.Fatal(err) + } + + fmt.Println(bwt.GetTransform()) + // Output: annb$aa +} diff --git a/bwt/wavelet.go b/bwt/wavelet.go index a100df8fd..af20d5c36 100644 --- a/bwt/wavelet.go +++ b/bwt/wavelet.go @@ -145,8 +145,9 @@ specific waveletTree works. // * locating characters of certain rank within the sequence // * accessing the character at a given position type waveletTree struct { - root *node - alpha []charInfo + root *node + alpha []charInfo + length int } // Access will return the ith character of the original @@ -245,6 +246,14 @@ func (wt waveletTree) lookupCharInfo(char byte) charInfo { panic(msg) } +func (wt waveletTree) reconstruct() string { + str := "" + for i := 0; i < wt.length; i++ { + str += string(wt.Access(i)) + } + return str +} + type node struct { data rsaBitVector char *byte @@ -285,8 +294,9 @@ func newWaveletTreeFromString(str string) (waveletTree, error) { } return waveletTree{ - root: root, - alpha: alpha, + root: root, + alpha: alpha, + length: len(str), }, nil } diff --git a/bwt/wavelet_test.go b/bwt/wavelet_test.go index 039ae7002..ac0289881 100644 --- a/bwt/wavelet_test.go +++ b/bwt/wavelet_test.go @@ -204,10 +204,7 @@ func TestWaveletTree_Access_Reconstruction(t *testing.T) { if err != nil { t.Fatal(err) } - actual := "" - for i := 0; i < len(str); i++ { - actual += string(wt.Access(i)) - } + actual := wt.reconstruct() if actual != str { t.Fatalf("expected to rebuild:\n%s\nbut instead got:\n%s", str, actual) } From 94f2a151a90cd7ac43fcaad77f7dba84d0ee8f64 Mon Sep 17 00:00:00 2001 From: Timothy Stiles Date: Tue, 2 Jan 2024 
13:53:24 -0800 Subject: [PATCH 58/60] added unit tests for reachable panics. --- bwt/bwt.go | 6 ++-- bwt/bwt_test.go | 87 +++++++++++++++++++++++++++++++++++++++++++++ bwt/wavelet_test.go | 23 ++++++++++++ 3 files changed, 113 insertions(+), 3 deletions(-) diff --git a/bwt/bwt.go b/bwt/bwt.go index 577bb485c..447443a44 100644 --- a/bwt/bwt.go +++ b/bwt/bwt.go @@ -328,9 +328,9 @@ func (bwt BWT) lookupSkipByOffset(offset int) skipEntry { panic(msg) } - for i := range bwt.firstColumnSkipList { - if bwt.firstColumnSkipList[i].openEndedInterval.start <= offset && offset < bwt.firstColumnSkipList[i].openEndedInterval.end { - return bwt.firstColumnSkipList[i] + for skipIndex := range bwt.firstColumnSkipList { + if bwt.firstColumnSkipList[skipIndex].openEndedInterval.start <= offset && offset < bwt.firstColumnSkipList[skipIndex].openEndedInterval.end { + return bwt.firstColumnSkipList[skipIndex] } } panic("figure out what to do here") diff --git a/bwt/bwt_test.go b/bwt/bwt_test.go index c3fc7c163..cb835c05c 100644 --- a/bwt/bwt_test.go +++ b/bwt/bwt_test.go @@ -1,6 +1,7 @@ package bwt import ( + "fmt" "strings" "testing" @@ -323,3 +324,89 @@ func TestBWTReconstruction(t *testing.T) { t.Fail() } } + +func TestBWTStartError(t *testing.T) { + testStr := "banana" + + bwt, err := New(testStr) + if err != nil { + t.Fatal(err) + } + + _, err = bwt.Extract(-1, 6) + if err == nil { + t.Fatal("expected error but got nil") + } +} +func TestBWT_GetFCharPosFromOriginalSequenceCharPos_Panic(t *testing.T) { + testStr := "banana" + bwt, err := New(testStr) + if err != nil { + t.Fatal(err) + } + + // Call the function with an invalid original position + originalPos := -1 + defer func() { + if r := recover(); r == nil { + t.Errorf("Expected panic, but it did not occur") + } + }() + bwt.getFCharPosFromOriginalSequenceCharPos(originalPos) +} +func TestBWT_LFSearch_InvalidChar(t *testing.T) { + testStr := "banana" + bwt, err := New(testStr) + if err != nil { + t.Fatal(err) + } + + pattern := 
"x" // Invalid character + + result := bwt.lfSearch(pattern) + + if result.start != 0 || result.end != 0 { + t.Fatalf("Expected search range to be (0, 0), but got (%d, %d)", result.start, result.end) + } +} +func TestBWT_LookupSkipByOffset_PanicOffsetExceedsMaxBound(t *testing.T) { + testStr := "banana" + bwt, err := New(testStr) + if err != nil { + t.Fatal(err) + } + + offset := bwt.getLenOfOriginalStringWithNullChar() + defer func() { + if r := recover(); r == nil { + t.Errorf("Expected panic, but it did not occur") + } + }() + bwt.lookupSkipByOffset(offset) +} + +func TestBWT_LookupSkipByOffset_PanicOffsetExceedsMinBound(t *testing.T) { + testStr := "banana" + bwt, err := New(testStr) + if err != nil { + t.Fatal(err) + } + + offset := -1 + defer func() { + if r := recover(); r == nil { + t.Errorf("Expected panic, but it did not occur") + } + }() + bwt.lookupSkipByOffset(offset) +} + +func TestBWTRecovery(t *testing.T) { + // Test panic recovery for bwtRecovery function + err := fmt.Errorf("test error") + operation := "test operation" + defer bwtRecovery(operation, &err) + + panic("test panic") + t.Errorf("Expected panic, but it did not occur") +} diff --git a/bwt/wavelet_test.go b/bwt/wavelet_test.go index ac0289881..432f4a85e 100644 --- a/bwt/wavelet_test.go +++ b/bwt/wavelet_test.go @@ -260,3 +260,26 @@ func TestWaveletTreeSingleAlpha(t *testing.T) { t.Fatalf("expected Access(%d) to be %d but got %d", 1, 1, s) } } +func TestBuildWaveletTree_ZeroAlpha(t *testing.T) { + bytes := []byte("AAAACCCCTTTTGGGG") + alpha := []charInfo{} + + root := buildWaveletTree(0, alpha, bytes) + + if root != nil { + t.Fatalf("expected root to be nil but got %v", root) + } +} +func TestWaveletTree_LookupCharInfo_Panic(t *testing.T) { + wt := waveletTree{ + alpha: []charInfo{}, + } + + defer func() { + if r := recover(); r == nil { + t.Errorf("expected panic but got nil") + } + }() + + wt.lookupCharInfo('B') +} From 1cc37557ed1da0cd9b1ef350466d5844ef375f11 Mon Sep 17 00:00:00 2001 
From: Trenton Date: Tue, 2 Jan 2024 21:00:00 -0500 Subject: [PATCH 59/60] fix err messages, add basic example --- bwt/bitvector.go | 4 +++- bwt/bwt.go | 3 ++- bwt/example_test.go | 22 ++++++++++++++++++++++ 3 files changed, 27 insertions(+), 2 deletions(-) diff --git a/bwt/bitvector.go b/bwt/bitvector.go index 1d7309ffd..d6cbf8144 100644 --- a/bwt/bitvector.go +++ b/bwt/bitvector.go @@ -1,6 +1,7 @@ package bwt import ( + "fmt" "math" ) @@ -62,7 +63,8 @@ func (b bitvector) setBit(i int, val bool) { func (b bitvector) checkBounds(i int) { if i >= b.len() || i < 0 { - panic("better out of bounds message") + msg := fmt.Sprintf("access of %d is out of bounds for bitvector with length %d", i, b.len()) + panic(msg) } } diff --git a/bwt/bwt.go b/bwt/bwt.go index 447443a44..66020166e 100644 --- a/bwt/bwt.go +++ b/bwt/bwt.go @@ -333,7 +333,8 @@ func (bwt BWT) lookupSkipByOffset(offset int) skipEntry { return bwt.firstColumnSkipList[skipIndex] } } - panic("figure out what to do here") + msg := fmt.Sprintf("could not find the skip entry that falls within the range of the skip column at a given offset. range: [0, %d) offset: %d", bwt.getLenOfOriginalStringWithNullChar(), offset) + panic(msg) } func (bwt BWT) getLenOfOriginalStringWithNullChar() int { diff --git a/bwt/example_test.go b/bwt/example_test.go index 40b631066..2d1e1b718 100644 --- a/bwt/example_test.go +++ b/bwt/example_test.go @@ -8,6 +8,28 @@ import ( "golang.org/x/exp/slices" ) +// This example shows how BWT can be used for exact pattern +// matching by returning the offsets at which the pattern exists. +// This can be useful for alignment when you need to reduce +// the memory footprint of a reference sequence without losing +// any data since BWT is a lossless compression. 
+func ExampleBWT_basic() { + inputSequence := "AACCTGCCGTCGGGGCTGCCCGTCGCGGGACGTCGAAACGTGGGGCGAAACGTG" + + bwt, err := bwt.New(inputSequence) + if err != nil { + log.Fatal(err) + } + + offsets, err := bwt.Locate("GCC") + if err != nil { + log.Fatal(err) + } + slices.Sort(offsets) + fmt.Println(offsets) + // Output: [5 17] +} + func ExampleBWT_Count() { inputSequence := "AACCTGCCGTCGGGGCTGCCCGTCGCGGGACGTCGAAACGTGGGGCGAAACGTG" From 77ca52d917221c10749a837c7d4aeccb05846fec Mon Sep 17 00:00:00 2001 From: Trenton Date: Tue, 2 Jan 2024 21:08:48 -0500 Subject: [PATCH 60/60] fix test --- bwt/bwt_test.go | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/bwt/bwt_test.go b/bwt/bwt_test.go index cb835c05c..7b5512ffe 100644 --- a/bwt/bwt_test.go +++ b/bwt/bwt_test.go @@ -1,7 +1,6 @@ package bwt import ( - "fmt" "strings" "testing" @@ -403,10 +402,18 @@ func TestBWT_LookupSkipByOffset_PanicOffsetExceedsMinBound(t *testing.T) { func TestBWTRecovery(t *testing.T) { // Test panic recovery for bwtRecovery function - err := fmt.Errorf("test error") + var err error operation := "test operation" + + defer func() { + if err == nil { + t.Fatal("expected bwtRecovery to recover from the panic and set an error message, but got nil") + } + }() defer bwtRecovery(operation, &err) + doPanic() +} +func doPanic() { panic("test panic") - t.Errorf("Expected panic, but it did not occur") }