From 99e280d8ac749ea11442fee2d2c0b6f20dba47a7 Mon Sep 17 00:00:00 2001 From: Keoni Gandall Date: Thu, 9 Nov 2023 22:59:43 -0800 Subject: [PATCH 1/7] Add seqhash v2 --- seqhash/example_test.go | 15 ++- seqhash/seqhash.go | 238 ++++++++++++++++++++++++++++++++++++++-- seqhash/seqhash_test.go | 35 ++++++ 3 files changed, 277 insertions(+), 11 deletions(-) diff --git a/seqhash/example_test.go b/seqhash/example_test.go index 02629c857..13274bfeb 100644 --- a/seqhash/example_test.go +++ b/seqhash/example_test.go @@ -14,9 +14,9 @@ func Example_basic() { circular := false doubleStranded := true - sequenceSeqhash, _ := seqhash.Hash(sequence, sequenceType, circular, doubleStranded) + sequenceSeqhash, _ := seqhash.EncodeHash2(seqhash.Hash2(sequence, sequenceType, circular, doubleStranded)) fmt.Println(sequenceSeqhash) - // Output: v1_DLD_f4028f93e08c5c23cbb8daa189b0a9802b378f1a1c919dcbcf1608a615f46350 + // Output: C_JPQCj5PgjFwjy7jaoYmwqQ== } func ExampleHash() { @@ -38,3 +38,14 @@ func ExampleRotateSequence() { fmt.Println(seqhash.RotateSequence(sequence.Sequence) == seqhash.RotateSequence(testSequence)) // output: true } + +func ExampleHash2() { + sequence := "ATGC" + sequenceType := seqhash.DNA + circular := false + doubleStranded := true + + sequenceSeqhash, _ := seqhash.Hash2(sequence, sequenceType, circular, doubleStranded) + fmt.Println(sequenceSeqhash) + // Output: [36 244 2 143 147 224 140 92 35 203 184 218 161 137 176 169] +} diff --git a/seqhash/seqhash.go b/seqhash/seqhash.go index 335a92545..8a1b383ad 100644 --- a/seqhash/seqhash.go +++ b/seqhash/seqhash.go @@ -39,7 +39,9 @@ only ACDEFGHIKLMNPQRSTVWYUO*BXZ characters are allowed in sequences. Selenocyste (Pyl; O) are included in the protein character set - usually U and O don't occur within protein sequences, but for certain organisms they do, and it is certainly a relevant amino acid for those particular proteins. -A Seqhash is separated into 3 different elements divided by underscores. It looks like the following: +# Seqhash version 1 + +A version 1 seqhash is separated into 3 different elements divided by underscores. It looks like the following: v1_DCD_4b0616d1b3fc632e42d78521deb38b44fba95cca9fde159e01cd567fa996ceb9 @@ -50,12 +52,38 @@ not the sequence is circular (C for Circular, L for Linear). The final letter co sequence is double stranded (D for Double stranded, S for Single stranded). The final element is the blake3 hash of the sequence (once rotated and complemented, as stated above). -Seqhash is a simple algorithm that allows for much better indexing of genetic sequences than what is -currently available. +# Seqhash version 2 + +Version 1 seqhashes are rather long, and version 2 seqhashes are built to be +much shorter. The intended use case are for handling sequences with LLM systems +since these system's context window is a value resource, and smaller references +allows the system to be more focused. Seqhash version 2 are approximately 3x +smaller than version 1 seqhashes. Officially, they are [16]byte arrays, but can +be also encoded with base64 to get a hash that can be used as a string across +different systems. Here is a length comparison: + + version 1: v1_DLD_f4028f93e08c5c23cbb8daa189b0a9802b378f1a1c919dcbcf1608a615f46350 + version 2: C_JPQCj5PgjFwjy7jaoYmwqQ== + +The metadata is now encoded in a 1 byte flag rather than a metadata string, +instead of 7 rune like in version 1. Rather than use 256 bits for encoding +the hash, we use 120 bits. Since seqhashes are not meant for security, this +is good enough (50% collision with 1.3x10^18 hashes), while making them +conveniently only 16 btyes long. Additionally, encoded prefixes are added +to the front of the base64 encoded hash as a heuristic device for LLMs while +processing batches of seqhashes. + +In addition, seqhashes can now encode fragments. Fragments are double stranded +DNA that are the result of restriction digestion, with single stranded +overhangs flanking both sides. These fragments can encode genetic parts - and +an important part of any vector containing these parts would be the part +seqhash, rather than the vector seqhash. This enhancement allows you to +identify genetic parts irregardless of their context. */ package seqhash import ( + "encoding/base64" "encoding/hex" "errors" "sort" @@ -69,9 +97,10 @@ import ( type SequenceType string const ( - DNA SequenceType = "DNA" - RNA SequenceType = "RNA" - PROTEIN SequenceType = "PROTEIN" + DNA SequenceType = "DNA" + RNA SequenceType = "RNA" + PROTEIN SequenceType = "PROTEIN" + FRAGMENT SequenceType = "FRAGMENT" ) // boothLeastRotation gets the least rotation of a circular string. @@ -137,8 +166,11 @@ func RotateSequence(sequence string) string { return sequence } -// Hash is a function to create Seqhashes, a specific kind of identifier. -func Hash(sequence string, sequenceType SequenceType, circular bool, doubleStranded bool) (string, error) { +// prepareDeterministicSequence prepares input data to be hashed by first running +// all of the checks for sequence typing, then by applying sequence +// manipulations to make a consistent hash for circular and double stranded +// sequences. +func prepareDeterministicSequence(sequence string, sequenceType SequenceType, circular bool, doubleStranded bool) (string, error) { // By definition, Seqhashes are of uppercase sequences sequence = strings.ToUpper(sequence) // If RNA, convert to a DNA sequence. The hash itself between a DNA and RNA sequence will not @@ -174,7 +206,6 @@ func Hash(sequence string, sequenceType SequenceType, circular bool, doubleStran if sequenceType == PROTEIN && doubleStranded { return "", errors.New("Proteins cannot be double stranded") } - // Gets Deterministic sequence based off of metadata + sequence var deterministicSequence string switch { @@ -191,6 +222,15 @@ func Hash(sequence string, sequenceType SequenceType, circular bool, doubleStran case !circular && !doubleStranded: deterministicSequence = sequence } + return deterministicSequence, nil +} + +// Hash creates a version 1 seqhash. +func Hash(sequence string, sequenceType SequenceType, circular bool, doubleStranded bool) (string, error) { + deterministicSequence, err := prepareDeterministicSequence(sequence, sequenceType, circular, doubleStranded) + if err != nil { + return "", err + } // Build 3 letter metadata var sequenceTypeLetter string @@ -222,3 +262,183 @@ func Hash(sequence string, sequenceType SequenceType, circular bool, doubleStran seqhash := "v1" + "_" + sequenceTypeLetter + circularLetter + doubleStrandedLetter + "_" + hex.EncodeToString(newhash[:]) return seqhash, nil } + +// The following consts are for seqhash version 2 +const ( + // Define bit masks for each part of the flag + hash2versionMask byte = 0b11110000 // Version occupies the first 4 bits + hash2circularityMask byte = 0b00001000 // Circularity occupies the 5th bit + hash2doubleStrandedMask byte = 0b00000100 // Double-strandedness occupies the 6th bit + hash2typeMask byte = 0b00000011 // DNA/RNA/PROTEIN occupies the last 2 bits + + // Define shift counts for each part + hash2versionShift = 4 + hash2circularityShift = 3 + hash2doubleStrandedShift = 2 + + // Define enum values for DNA/RNA/PROTEIN + hash2DNA byte = 0b00 + hash2RNA byte = 0b01 + hash2PROTEIN byte = 0b10 + // Other is represented by 0b11 +) + +var ( + // sequenceTypeStringToByteFlagMap converts a sequenceType to a byte + sequenceTypeStringToByteFlagMap = map[SequenceType]byte{ + DNA: 0b00, + RNA: 0b01, + PROTEIN: 0b10, + FRAGMENT: 0b11, + } + // sequenceTypeByteToStringFlagMap converts a byte to a sequenceType + sequenceTypeByteToStringFlagMap = map[byte]SequenceType{ + 0b00: DNA, + 0b01: RNA, + 0b10: PROTEIN, + 0b11: FRAGMENT, + } +) + +// EncodeFlag encodes the version, circularity, double-strandedness, and type into a single byte flag. +func EncodeFlag(version int, sequenceType SequenceType, circularity bool, doubleStranded bool) byte { + var flag byte + + // Encode the version (assuming version is in the range 0-15) + flag |= (byte(version) << hash2versionShift) + + // Encode the circularity + if circularity { + flag |= (1 << hash2circularityShift) + } + + // Encode the double-strandedness + if doubleStranded { + flag |= (1 << hash2doubleStrandedShift) + } + + // Encode the DNA/RNA/PROTEIN + dnaRnaProtein := sequenceTypeStringToByteFlagMap[sequenceType] + flag |= (dnaRnaProtein & hash2typeMask) + + return flag +} + +// DecodeFlag decodes the single byte flag into its constituent parts. +// Outputs: version, circularity, doubleStranded, dnaRnaProtein +func DecodeFlag(flag byte) (int, SequenceType, bool, bool) { + version := int((flag & hash2versionMask) >> hash2versionShift) + circularity := (flag & hash2circularityMask) != 0 + doubleStranded := (flag & hash2doubleStrandedMask) != 0 + dnaRnaProtein := flag & hash2typeMask + sequenceType := sequenceTypeByteToStringFlagMap[dnaRnaProtein] + + return version, sequenceType, circularity, doubleStranded +} + +// Hash2 creates a version 2 seqhash. +func Hash2(sequence string, sequenceType SequenceType, circular bool, doubleStranded bool) ([16]byte, error) { + var result [16]byte + + // First, get the determistic sequence of the hash + deterministicSequence, err := prepareDeterministicSequence(sequence, sequenceType, circular, doubleStranded) + if err != nil { + return result, err + } + + // Build our byte flag + flag := EncodeFlag(2, sequenceType, circular, doubleStranded) + result[0] = flag + + // Compute BLAKE3, then copy those to the remaining 15 bytes + newhash := blake3.Sum256([]byte(deterministicSequence)) + copy(result[1:], newhash[:15]) + + return result, nil +} + +// Hash2Fragment creates a version 2 fragment seqhash. Fragment seqhashes are +// a special kind of seqhash that are used to identify fragments, usually +// released by restriction enzyme digestion, rather than complete DNA +// sequences. This is very useful for tracking genetic parts in a database: as +// abstractions away from their container vectors, so that many fragments in +// different vectors can be identified consistently. +// +// fwdOverhangLength and revOverhangLength are the lengths of both overhangs. +// Hashed sequences are hashed with their overhangs attached. Most of the time, +// both of these will equal 4. +// +// threePrimeOverhangFwd and threePrimeOverhangRev are booleans for the +// directionality of the overhangs. The majority of restriction enzymes cut +// leaving 5prime overhangs, so these should nearly always be false, except +// in very special cases. threePrimeOverhangFwd replaces circular (since +// fragments will always be linear) and threePrimeOverhangRev replaces +// doubleStranded (since fragments will always be double stranded). +// +// In order to make sure fwdOverhangLength and revOverhangLength fit in the +// hash, the hash is truncated at 13 bytes rather than 15, and both uint8 are +// inserted. So the bytes would be: +// +// flag + fwdOverhangLength + revOverhangLength + [13]byte(hash) +// +// 13 bytes is considered enough, because the number of fragments is limited +// by our ability to physically produce them, while other other sequence types +// can be found in nature. +func Hash2Fragment(sequence string, fwdOverhangLength uint8, revOverhangLength uint8, fwdThreePrimeOverhang bool, revThreePrimeOverhang bool) ([16]byte, error) { + var result [16]byte + + // First, run checks and get the determistic sequence of the hash + for _, char := range sequence { + if !strings.Contains("ATUGCYRSWKMBDHVNZ", string(char)) { + return result, errors.New("Only letters ATUGCYRSWKMBDHVNZ are allowed for DNA/RNA. Got letter: " + string(char)) + } + } + potentialSequences := []string{sequence, transform.ReverseComplement(sequence)} + sort.Strings(potentialSequences) + deterministicSequence := potentialSequences[0] + + // Build our byte flag and copy length flags + flag := EncodeFlag(2, FRAGMENT, fwdThreePrimeOverhang, revThreePrimeOverhang) + result[0] = flag + result[1] = fwdOverhangLength + result[2] = revOverhangLength + + // Compute BLAKE3, then copy those to the remaining 13 bytes + newhash := blake3.Sum256([]byte(deterministicSequence)) + copy(result[3:], newhash[:13]) + + return result, nil +} + +type Hash2MetadataKey struct { + SequenceType SequenceType + Circular bool + DoubleStranded bool +} + +var Hash2Metadata = map[Hash2MetadataKey]rune{ + Hash2MetadataKey{DNA, true, true}: 'A', + Hash2MetadataKey{DNA, true, false}: 'B', + Hash2MetadataKey{DNA, false, true}: 'C', + Hash2MetadataKey{DNA, false, false}: 'D', + Hash2MetadataKey{RNA, true, true}: 'E', + Hash2MetadataKey{RNA, true, false}: 'F', + Hash2MetadataKey{RNA, false, true}: 'G', + Hash2MetadataKey{RNA, false, false}: 'H', + Hash2MetadataKey{PROTEIN, false, false}: 'I', + Hash2MetadataKey{PROTEIN, true, false}: 'J', + Hash2MetadataKey{FRAGMENT, false, false}: 'K', + Hash2MetadataKey{FRAGMENT, true, false}: 'L', + Hash2MetadataKey{FRAGMENT, false, true}: 'M', + Hash2MetadataKey{FRAGMENT, true, true}: 'N', +} + +// EncodeHash2 encodes Hash2 as a base64 string. It also adds a single +// letter metadata tag that can be used as an easy heuristic for an LLM to +// identify misbehaving code. +func EncodeHash2(hash [16]byte, err error) (string, error) { + _, sequenceType, circularity, doubleStranded := DecodeFlag(hash[0]) + encoded := base64.StdEncoding.EncodeToString(hash[:]) + + return string(Hash2Metadata[Hash2MetadataKey{sequenceType, circularity, doubleStranded}]) + "_" + encoded, err +} diff --git a/seqhash/seqhash_test.go b/seqhash/seqhash_test.go index 5787cc36b..5181a593e 100644 --- a/seqhash/seqhash_test.go +++ b/seqhash/seqhash_test.go @@ -89,3 +89,38 @@ func TestLeastRotation(t *testing.T) { } } } + +func TestFlagEncoding(t *testing.T) { + version := 2 + sequenceType := DNA + circularity := true + doubleStranded := true + flag := EncodeFlag(version, sequenceType, circularity, doubleStranded) + decodedVersion, decodedSequenceType, decodedCircularity, decodedDoubleStranded := DecodeFlag(flag) + if (decodedVersion != version) || (decodedSequenceType != sequenceType) || (decodedCircularity != circularity) || (decodedDoubleStranded != doubleStranded) { + t.Errorf("Got different decoded flag.") + } +} + +func TestHash2(t *testing.T) { + // Test TNA as sequenceType + _, err := Hash2("ATGGGCTAA", "TNA", true, true) + if err == nil { + t.Errorf("TestHash2() has failed. TNA is not a valid sequenceType.") + } +} + +func TestHash2Fragment(t *testing.T) { + // Test X failure + _, err := Hash2Fragment("ATGGGCTAX", 4, 4, false, false) + if err == nil { + t.Errorf("TestHash2Fragment() has failed. X is not a valid sequenceType.") + } + // Test actual hash + sqHash, _ := EncodeHash2(Hash2Fragment("ATGGGCTAA", 4, 4, false, false)) + expectedHash := "K_IwQEwsn8RN9yA1CCoVLpSw==" + if sqHash != expectedHash { + t.Errorf("Expected %s, Got: %s", expectedHash, sqHash) + } + +} From a5820ef105b20f7ce5ccef7d7c26a332d6e6b2db Mon Sep 17 00:00:00 2001 From: Keoni Gandall Date: Thu, 9 Nov 2023 23:06:46 -0800 Subject: [PATCH 2/7] updated changelog --- CHANGELOG.md | 3 ++- seqhash/seqhash.go | 6 +++++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index c2fe0c054..80a40f380 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added - Alternative start codons can now be used in the `synthesis/codon` DNA -> protein translation package (#305) - Added a parser and writer for the `pileup` sequence alignment format (#329) +- Added seqhash v2 (#398) ### Fixed - `fastq` parser no longer becomes de-aligned when reading (#325) @@ -19,4 +20,4 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 Oops, we weren't keeping a changelog before this tag! [unreleased]: https://github.com/TimothyStiles/poly/compare/v0.26.0...main -[0.26.0]: https://github.com/TimothyStiles/poly/releases/tag/v0.26.0 \ No newline at end of file +[0.26.0]: https://github.com/TimothyStiles/poly/releases/tag/v0.26.0 diff --git a/seqhash/seqhash.go b/seqhash/seqhash.go index 8a1b383ad..e005fa676 100644 --- a/seqhash/seqhash.go +++ b/seqhash/seqhash.go @@ -301,6 +301,7 @@ var ( ) // EncodeFlag encodes the version, circularity, double-strandedness, and type into a single byte flag. +// Used for seqhash v2 func EncodeFlag(version int, sequenceType SequenceType, circularity bool, doubleStranded bool) byte { var flag byte @@ -325,7 +326,8 @@ func EncodeFlag(version int, sequenceType SequenceType, circularity bool, double } // DecodeFlag decodes the single byte flag into its constituent parts. -// Outputs: version, circularity, doubleStranded, dnaRnaProtein +// Outputs: version, circularity, doubleStranded, dnaRnaProtein. +// Used for seqhash v2 func DecodeFlag(flag byte) (int, SequenceType, bool, bool) { version := int((flag & hash2versionMask) >> hash2versionShift) circularity := (flag & hash2circularityMask) != 0 @@ -410,12 +412,14 @@ func Hash2Fragment(sequence string, fwdOverhangLength uint8, revOverhangLength u return result, nil } +// Hash2MetadataKey is a key for a seqhash v2 single letter metadata tag. type Hash2MetadataKey struct { SequenceType SequenceType Circular bool DoubleStranded bool } +// Hash2Metadata contains the seqhash v2 single letter metadata tags. var Hash2Metadata = map[Hash2MetadataKey]rune{ Hash2MetadataKey{DNA, true, true}: 'A', Hash2MetadataKey{DNA, true, false}: 'B', From 36de8b8ae53f05870614bc640f2ded825b814e7b Mon Sep 17 00:00:00 2001 From: Keoni Gandall Date: Thu, 9 Nov 2023 23:09:19 -0800 Subject: [PATCH 3/7] make linter happy --- seqhash/seqhash.go | 34 ++++++++++++++-------------------- seqhash/seqhash_test.go | 1 - 2 files changed, 14 insertions(+), 21 deletions(-) diff --git a/seqhash/seqhash.go b/seqhash/seqhash.go index e005fa676..20619edb2 100644 --- a/seqhash/seqhash.go +++ b/seqhash/seqhash.go @@ -275,12 +275,6 @@ const ( hash2versionShift = 4 hash2circularityShift = 3 hash2doubleStrandedShift = 2 - - // Define enum values for DNA/RNA/PROTEIN - hash2DNA byte = 0b00 - hash2RNA byte = 0b01 - hash2PROTEIN byte = 0b10 - // Other is represented by 0b11 ) var ( @@ -421,20 +415,20 @@ type Hash2MetadataKey struct { // Hash2Metadata contains the seqhash v2 single letter metadata tags. var Hash2Metadata = map[Hash2MetadataKey]rune{ - Hash2MetadataKey{DNA, true, true}: 'A', - Hash2MetadataKey{DNA, true, false}: 'B', - Hash2MetadataKey{DNA, false, true}: 'C', - Hash2MetadataKey{DNA, false, false}: 'D', - Hash2MetadataKey{RNA, true, true}: 'E', - Hash2MetadataKey{RNA, true, false}: 'F', - Hash2MetadataKey{RNA, false, true}: 'G', - Hash2MetadataKey{RNA, false, false}: 'H', - Hash2MetadataKey{PROTEIN, false, false}: 'I', - Hash2MetadataKey{PROTEIN, true, false}: 'J', - Hash2MetadataKey{FRAGMENT, false, false}: 'K', - Hash2MetadataKey{FRAGMENT, true, false}: 'L', - Hash2MetadataKey{FRAGMENT, false, true}: 'M', - Hash2MetadataKey{FRAGMENT, true, true}: 'N', + {DNA, true, true}: 'A', + {DNA, true, false}: 'B', + {DNA, false, true}: 'C', + {DNA, false, false}: 'D', + {RNA, true, true}: 'E', + {RNA, true, false}: 'F', + {RNA, false, true}: 'G', + {RNA, false, false}: 'H', + {PROTEIN, false, false}: 'I', + {PROTEIN, true, false}: 'J', + {FRAGMENT, false, false}: 'K', + {FRAGMENT, true, false}: 'L', + {FRAGMENT, false, true}: 'M', + {FRAGMENT, true, true}: 'N', } // EncodeHash2 encodes Hash2 as a base64 string. It also adds a single diff --git a/seqhash/seqhash_test.go b/seqhash/seqhash_test.go index 5181a593e..71fd3ea72 100644 --- a/seqhash/seqhash_test.go +++ b/seqhash/seqhash_test.go @@ -122,5 +122,4 @@ func TestHash2Fragment(t *testing.T) { if sqHash != expectedHash { t.Errorf("Expected %s, Got: %s", expectedHash, sqHash) } - } From f716e5a72aa091b0c0eedad8b5bada1eeb57861e Mon Sep 17 00:00:00 2001 From: Keoni Gandall Date: Fri, 17 Nov 2023 11:10:19 -0800 Subject: [PATCH 4/7] seqhash --- seqhash/seqhash.go | 49 +++++++++++++++++++++++++++-------------- seqhash/seqhash_test.go | 4 ++-- 2 files changed, 35 insertions(+), 18 deletions(-) diff --git a/seqhash/seqhash.go b/seqhash/seqhash.go index 20619edb2..7c9fe0ee9 100644 --- a/seqhash/seqhash.go +++ b/seqhash/seqhash.go @@ -362,25 +362,31 @@ func Hash2(sequence string, sequenceType SequenceType, circular bool, doubleStra // // fwdOverhangLength and revOverhangLength are the lengths of both overhangs. // Hashed sequences are hashed with their overhangs attached. Most of the time, -// both of these will equal 4. -// -// threePrimeOverhangFwd and threePrimeOverhangRev are booleans for the -// directionality of the overhangs. The majority of restriction enzymes cut -// leaving 5prime overhangs, so these should nearly always be false, except -// in very special cases. threePrimeOverhangFwd replaces circular (since -// fragments will always be linear) and threePrimeOverhangRev replaces -// doubleStranded (since fragments will always be double stranded). +// both of these will equal 4, as they are released by TypeIIS restriction +// enzymes. // // In order to make sure fwdOverhangLength and revOverhangLength fit in the -// hash, the hash is truncated at 13 bytes rather than 15, and both uint8 are +// hash, the hash is truncated at 13 bytes rather than 15, and both int8 are // inserted. So the bytes would be: // // flag + fwdOverhangLength + revOverhangLength + [13]byte(hash) // +// fwdOverhangLength and revOverhangLength are both int8, and their negatives +// are considered if the the overhang is on the 3prime strand, rather than the +// 5prime strand. +// // 13 bytes is considered enough, because the number of fragments is limited // by our ability to physically produce them, while other other sequence types // can be found in nature. -func Hash2Fragment(sequence string, fwdOverhangLength uint8, revOverhangLength uint8, fwdThreePrimeOverhang bool, revThreePrimeOverhang bool) ([16]byte, error) { +// +// The fwdOverhang and revOverhang are the lengths of the overhangs of the +// input sequence. The hash, however, contains the forward and reverse overhang +// lengths of the deterministic sequence - ie, the alphabetically less-than +// strand, when comparing the uppercase forward and reverse complement strand. +// This means if the input sequence is not less than its reverse complement (for +// example, GTT is greater than AAC), then the output hash will have the forward +// and reverse overhang lengths of the reverse complement, not the input strand. +func Hash2Fragment(sequence string, fwdOverhangLength int8, revOverhangLength int8) ([16]byte, error) { var result [16]byte // First, run checks and get the determistic sequence of the hash @@ -389,15 +395,26 @@ func Hash2Fragment(sequence string, fwdOverhangLength uint8, revOverhangLength u return result, errors.New("Only letters ATUGCYRSWKMBDHVNZ are allowed for DNA/RNA. Got letter: " + string(char)) } } - potentialSequences := []string{sequence, transform.ReverseComplement(sequence)} - sort.Strings(potentialSequences) - deterministicSequence := potentialSequences[0] + sequence = strings.ToUpper(sequence) + var rev, fwd int8 + var deterministicSequence string + reverseComplement := transform.ReverseComplement(sequence) + if sequence > reverseComplement { + // If the reverse complement is smaller, reverse the overhangs fwd and rev + rev = fwdOverhangLength + fwd = revOverhangLength + deterministicSequence = reverseComplement + } else { + fwd = fwdOverhangLength + rev = revOverhangLength + deterministicSequence = sequence + } // Build our byte flag and copy length flags - flag := EncodeFlag(2, FRAGMENT, fwdThreePrimeOverhang, revThreePrimeOverhang) + flag := EncodeFlag(2, FRAGMENT, false, false) result[0] = flag - result[1] = fwdOverhangLength - result[2] = revOverhangLength + result[1] = byte(fwd) + result[2] = byte(rev) // Compute BLAKE3, then copy those to the remaining 13 bytes newhash := blake3.Sum256([]byte(deterministicSequence)) diff --git a/seqhash/seqhash_test.go b/seqhash/seqhash_test.go index 71fd3ea72..352f30c9c 100644 --- a/seqhash/seqhash_test.go +++ b/seqhash/seqhash_test.go @@ -112,12 +112,12 @@ func TestHash2(t *testing.T) { func TestHash2Fragment(t *testing.T) { // Test X failure - _, err := Hash2Fragment("ATGGGCTAX", 4, 4, false, false) + _, err := Hash2Fragment("ATGGGCTAX", 4, 4) if err == nil { t.Errorf("TestHash2Fragment() has failed. X is not a valid sequenceType.") } // Test actual hash - sqHash, _ := EncodeHash2(Hash2Fragment("ATGGGCTAA", 4, 4, false, false)) + sqHash, _ := EncodeHash2(Hash2Fragment("ATGGGCTAA", 4, 4)) expectedHash := "K_IwQEwsn8RN9yA1CCoVLpSw==" if sqHash != expectedHash { t.Errorf("Expected %s, Got: %s", expectedHash, sqHash) From f12dba0a590c0bb7cf7f5402f988d5b40a69573d Mon Sep 17 00:00:00 2001 From: Keoni Gandall Date: Wed, 29 Nov 2023 09:55:40 -0800 Subject: [PATCH 5/7] Hash2->HashV2 --- seqhash/example_test.go | 6 +++--- seqhash/seqhash.go | 22 +++++++++++----------- seqhash/seqhash_test.go | 14 +++++++------- 3 files changed, 21 insertions(+), 21 deletions(-) diff --git a/seqhash/example_test.go b/seqhash/example_test.go index 13274bfeb..a19679c98 100644 --- a/seqhash/example_test.go +++ b/seqhash/example_test.go @@ -14,7 +14,7 @@ func Example_basic() { circular := false doubleStranded := true - sequenceSeqhash, _ := seqhash.EncodeHash2(seqhash.Hash2(sequence, sequenceType, circular, doubleStranded)) + sequenceSeqhash, _ := seqhash.EncodeHashV2(seqhash.HashV2(sequence, sequenceType, circular, doubleStranded)) fmt.Println(sequenceSeqhash) // Output: C_JPQCj5PgjFwjy7jaoYmwqQ== } @@ -39,13 +39,13 @@ func ExampleRotateSequence() { // output: true } -func ExampleHash2() { +func ExampleHashV2() { sequence := "ATGC" sequenceType := seqhash.DNA circular := false doubleStranded := true - sequenceSeqhash, _ := seqhash.Hash2(sequence, sequenceType, circular, doubleStranded) + sequenceSeqhash, _ := seqhash.HashV2(sequence, sequenceType, circular, doubleStranded) fmt.Println(sequenceSeqhash) // Output: [36 244 2 143 147 224 140 92 35 203 184 218 161 137 176 169] } diff --git a/seqhash/seqhash.go b/seqhash/seqhash.go index 7c9fe0ee9..21daa9e73 100644 --- a/seqhash/seqhash.go +++ b/seqhash/seqhash.go @@ -332,8 +332,8 @@ func DecodeFlag(flag byte) (int, SequenceType, bool, bool) { return version, sequenceType, circularity, doubleStranded } -// Hash2 creates a version 2 seqhash. -func Hash2(sequence string, sequenceType SequenceType, circular bool, doubleStranded bool) ([16]byte, error) { +// HashV2 creates a version 2 seqhash. +func HashV2(sequence string, sequenceType SequenceType, circular bool, doubleStranded bool) ([16]byte, error) { var result [16]byte // First, get the determistic sequence of the hash @@ -353,7 +353,7 @@ func Hash2(sequence string, sequenceType SequenceType, circular bool, doubleStra return result, nil } -// Hash2Fragment creates a version 2 fragment seqhash. Fragment seqhashes are +// HashV2Fragment creates a version 2 fragment seqhash. Fragment seqhashes are // a special kind of seqhash that are used to identify fragments, usually // released by restriction enzyme digestion, rather than complete DNA // sequences. This is very useful for tracking genetic parts in a database: as @@ -386,7 +386,7 @@ func Hash2(sequence string, sequenceType SequenceType, circular bool, doubleStra // This means if the input sequence is not less than its reverse complement (for // example, GTT is greater than AAC), then the output hash will have the forward // and reverse overhang lengths of the reverse complement, not the input strand. -func Hash2Fragment(sequence string, fwdOverhangLength int8, revOverhangLength int8) ([16]byte, error) { +func HashV2Fragment(sequence string, fwdOverhangLength int8, revOverhangLength int8) ([16]byte, error) { var result [16]byte // First, run checks and get the determistic sequence of the hash @@ -423,15 +423,15 @@ func Hash2Fragment(sequence string, fwdOverhangLength int8, revOverhangLength in return result, nil } -// Hash2MetadataKey is a key for a seqhash v2 single letter metadata tag. -type Hash2MetadataKey struct { +// HashV2MetadataKey is a key for a seqhash v2 single letter metadata tag. +type HashV2MetadataKey struct { SequenceType SequenceType Circular bool DoubleStranded bool } -// Hash2Metadata contains the seqhash v2 single letter metadata tags. -var Hash2Metadata = map[Hash2MetadataKey]rune{ +// HashV2Metadata contains the seqhash v2 single letter metadata tags. +var HashV2Metadata = map[HashV2MetadataKey]rune{ {DNA, true, true}: 'A', {DNA, true, false}: 'B', {DNA, false, true}: 'C', @@ -448,12 +448,12 @@ var Hash2Metadata = map[Hash2MetadataKey]rune{ {FRAGMENT, true, true}: 'N', } -// EncodeHash2 encodes Hash2 as a base64 string. It also adds a single +// EncodeHashV2 encodes HashV2 as a base64 string. It also adds a single // letter metadata tag that can be used as an easy heuristic for an LLM to // identify misbehaving code. -func EncodeHash2(hash [16]byte, err error) (string, error) { +func EncodeHashV2(hash [16]byte, err error) (string, error) { _, sequenceType, circularity, doubleStranded := DecodeFlag(hash[0]) encoded := base64.StdEncoding.EncodeToString(hash[:]) - return string(Hash2Metadata[Hash2MetadataKey{sequenceType, circularity, doubleStranded}]) + "_" + encoded, err + return string(HashV2Metadata[HashV2MetadataKey{sequenceType, circularity, doubleStranded}]) + "_" + encoded, err } diff --git a/seqhash/seqhash_test.go b/seqhash/seqhash_test.go index 352f30c9c..f4da6648d 100644 --- a/seqhash/seqhash_test.go +++ b/seqhash/seqhash_test.go @@ -102,22 +102,22 @@ func TestFlagEncoding(t *testing.T) { } } -func TestHash2(t *testing.T) { +func TestHashV2(t *testing.T) { // Test TNA as sequenceType - _, err := Hash2("ATGGGCTAA", "TNA", true, true) + _, err := HashV2("ATGGGCTAA", "TNA", true, true) if err == nil { - t.Errorf("TestHash2() has failed. TNA is not a valid sequenceType.") + t.Errorf("TestHashV2() has failed. TNA is not a valid sequenceType.") } } -func TestHash2Fragment(t *testing.T) { +func TestHashV2Fragment(t *testing.T) { // Test X failure - _, err := Hash2Fragment("ATGGGCTAX", 4, 4) + _, err := HashV2Fragment("ATGGGCTAX", 4, 4) if err == nil { - t.Errorf("TestHash2Fragment() has failed. X is not a valid sequenceType.") + t.Errorf("TestHashV2Fragment() has failed. X is not a valid sequenceType.") } // Test actual hash - sqHash, _ := EncodeHash2(Hash2Fragment("ATGGGCTAA", 4, 4)) + sqHash, _ := EncodeHashV2(HashV2Fragment("ATGGGCTAA", 4, 4)) expectedHash := "K_IwQEwsn8RN9yA1CCoVLpSw==" if sqHash != expectedHash { t.Errorf("Expected %s, Got: %s", expectedHash, sqHash) From 9ae38fab9b8a93af7d16d96901378c48e2505cd5 Mon Sep 17 00:00:00 2001 From: Timothy Stiles Date: Wed, 29 Nov 2023 12:45:06 -0800 Subject: [PATCH 6/7] renamed fwd and rev --- seqhash/seqhash.go | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/seqhash/seqhash.go b/seqhash/seqhash.go index 21daa9e73..f4ec1a82e 100644 --- a/seqhash/seqhash.go +++ b/seqhash/seqhash.go @@ -396,25 +396,25 @@ func HashV2Fragment(sequence string, fwdOverhangLength int8, revOverhangLength i } } sequence = strings.ToUpper(sequence) - var rev, fwd int8 + var forward, reverse int8 var deterministicSequence string reverseComplement := transform.ReverseComplement(sequence) if sequence > reverseComplement { - // If the reverse complement is smaller, reverse the overhangs fwd and rev - rev = fwdOverhangLength - fwd = revOverhangLength + // If the reverse complement is smaller, reverse the overhangs forward and reverse + forward = revOverhangLength + reverse = fwdOverhangLength deterministicSequence = reverseComplement } else { - fwd = fwdOverhangLength - rev = revOverhangLength + forward = fwdOverhangLength + reverse = revOverhangLength deterministicSequence = sequence } // Build our byte flag and copy length flags flag := EncodeFlag(2, FRAGMENT, false, false) result[0] = flag - result[1] = byte(fwd) - result[2] = byte(rev) + result[1] = byte(forward) + result[2] = byte(reverse) // Compute BLAKE3, then copy those to the remaining 13 bytes newhash := blake3.Sum256([]byte(deterministicSequence)) From 44e9cced105e857ce8266558ebb01817f05da69a Mon Sep 17 00:00:00 2001 From: Keoni Gandall Date: Wed, 6 Dec 2023 08:26:31 -0800 Subject: [PATCH 7/7] Add top level comment --- seqhash/seqhash.go | 3 +++ 1 file changed, 3 insertions(+) diff --git a/seqhash/seqhash.go b/seqhash/seqhash.go index f4ec1a82e..5613ff0a5 100644 --- a/seqhash/seqhash.go +++ b/seqhash/seqhash.go @@ -3,6 +3,9 @@ Package seqhash contains the seqhash algorithm. This package contains the reference seqhash algorithm. +If you are new to using seqhash, use V2. V1 should only be used in situations +where full 256 rather than 120 bit hashing is needed. + There is a big problem with current sequence databases - they all use different identifiers and accession numbers. This means cross-referencing databases is a complicated exercise, especially as the quantity of databases increases, or if