From 99e280d8ac749ea11442fee2d2c0b6f20dba47a7 Mon Sep 17 00:00:00 2001
From: Keoni Gandall <koeng101@gmail.com>
Date: Thu, 9 Nov 2023 22:59:43 -0800
Subject: [PATCH 1/7] Add seqhash v2

---
 seqhash/example_test.go |  15 ++-
 seqhash/seqhash.go      | 238 ++++++++++++++++++++++++++++++++++++++--
 seqhash/seqhash_test.go |  35 ++++++
 3 files changed, 277 insertions(+), 11 deletions(-)

diff --git a/seqhash/example_test.go b/seqhash/example_test.go
index 02629c857..13274bfeb 100644
--- a/seqhash/example_test.go
+++ b/seqhash/example_test.go
@@ -14,9 +14,9 @@ func Example_basic() {
 	circular := false
 	doubleStranded := true
 
-	sequenceSeqhash, _ := seqhash.Hash(sequence, sequenceType, circular, doubleStranded)
+	sequenceSeqhash, _ := seqhash.EncodeHash2(seqhash.Hash2(sequence, sequenceType, circular, doubleStranded))
 	fmt.Println(sequenceSeqhash)
-	// Output: v1_DLD_f4028f93e08c5c23cbb8daa189b0a9802b378f1a1c919dcbcf1608a615f46350
+	// Output: C_JPQCj5PgjFwjy7jaoYmwqQ==
 }
 
 func ExampleHash() {
@@ -38,3 +38,14 @@ func ExampleRotateSequence() {
 	fmt.Println(seqhash.RotateSequence(sequence.Sequence) == seqhash.RotateSequence(testSequence))
 	// output: true
 }
+
+func ExampleHash2() {
+	sequence := "ATGC"
+	sequenceType := seqhash.DNA
+	circular := false
+	doubleStranded := true
+
+	sequenceSeqhash, _ := seqhash.Hash2(sequence, sequenceType, circular, doubleStranded)
+	fmt.Println(sequenceSeqhash)
+	// Output: [36 244 2 143 147 224 140 92 35 203 184 218 161 137 176 169]
+}
diff --git a/seqhash/seqhash.go b/seqhash/seqhash.go
index 335a92545..8a1b383ad 100644
--- a/seqhash/seqhash.go
+++ b/seqhash/seqhash.go
@@ -39,7 +39,9 @@ only ACDEFGHIKLMNPQRSTVWYUO*BXZ characters are allowed in sequences. Selenocyste
 (Pyl; O) are included in the protein character set - usually U and O don't occur within protein sequences,
 but for certain organisms they do, and it is certainly a relevant amino acid for those particular proteins.
 
-A Seqhash is separated into 3 different elements divided by underscores. It looks like the following:
+# Seqhash version 1
+
+A version 1 seqhash is separated into 3 different elements divided by underscores. It looks like the following:
 
 v1_DCD_4b0616d1b3fc632e42d78521deb38b44fba95cca9fde159e01cd567fa996ceb9
 
@@ -50,12 +52,38 @@ not the sequence is circular (C for Circular, L for Linear). The final letter co
 sequence is double stranded (D for Double stranded, S for Single stranded). The final element is the blake3
 hash of the sequence (once rotated and complemented, as stated above).
 
-Seqhash is a simple algorithm that allows for much better indexing of genetic sequences than what is
-currently available.
+# Seqhash version 2
+
+Version 1 seqhashes are rather long, and version 2 seqhashes are built to be
+much shorter. The intended use case is handling sequences with LLM systems,
+since these systems' context window is a valuable resource, and smaller
+references allow the system to be more focused. Version 2 seqhashes are
+approximately 3x smaller than version 1 seqhashes. Officially, they are
+[16]byte arrays, but they can also be encoded with base64 to get a hash that
+can be used as a string across different systems. Here is a length comparison:
+
+	version 1: v1_DLD_f4028f93e08c5c23cbb8daa189b0a9802b378f1a1c919dcbcf1608a615f46350
+	version 2: C_JPQCj5PgjFwjy7jaoYmwqQ==
+
+The metadata is now encoded in a 1 byte flag, rather than in a 7 rune
+metadata string like in version 1. Rather than use 256 bits for encoding
+the hash, we use 120 bits. Since seqhashes are not meant for security, this
+is good enough (50% collision with 1.3x10^18 hashes), while making them
+conveniently only 16 bytes long. Additionally, encoded prefixes are added
+to the front of the base64 encoded hash as a heuristic device for LLMs while
+processing batches of seqhashes.
+
+In addition, seqhashes can now encode fragments. Fragments are double stranded
+DNA that are the result of restriction digestion, with single stranded
+overhangs flanking both sides. These fragments can encode genetic parts - and
+an important part of any vector containing these parts would be the part
+seqhash, rather than the vector seqhash. This enhancement allows you to
+identify genetic parts regardless of their context.
 */
 package seqhash
 
 import (
+	"encoding/base64"
 	"encoding/hex"
 	"errors"
 	"sort"
@@ -69,9 +97,10 @@ import (
 type SequenceType string
 
 const (
-	DNA     SequenceType = "DNA"
-	RNA     SequenceType = "RNA"
-	PROTEIN SequenceType = "PROTEIN"
+	DNA      SequenceType = "DNA"
+	RNA      SequenceType = "RNA"
+	PROTEIN  SequenceType = "PROTEIN"
+	FRAGMENT SequenceType = "FRAGMENT"
 )
 
 // boothLeastRotation gets the least rotation of a circular string.
@@ -137,8 +166,11 @@ func RotateSequence(sequence string) string {
 	return sequence
 }
 
-// Hash is a function to create Seqhashes, a specific kind of identifier.
-func Hash(sequence string, sequenceType SequenceType, circular bool, doubleStranded bool) (string, error) {
+// prepareDeterministicSequence prepares input data to be hashed by first running
+// all of the checks for sequence typing, then by applying sequence
+// manipulations to make a consistent hash for circular and double stranded
+// sequences.
+func prepareDeterministicSequence(sequence string, sequenceType SequenceType, circular bool, doubleStranded bool) (string, error) {
 	// By definition, Seqhashes are of uppercase sequences
 	sequence = strings.ToUpper(sequence)
 	// If RNA, convert to a DNA sequence. The hash itself between a DNA and RNA sequence will not
@@ -174,7 +206,6 @@ func Hash(sequence string, sequenceType SequenceType, circular bool, doubleStran
 	if sequenceType == PROTEIN && doubleStranded {
 		return "", errors.New("Proteins cannot be double stranded")
 	}
-
 	// Gets Deterministic sequence based off of metadata + sequence
 	var deterministicSequence string
 	switch {
@@ -191,6 +222,15 @@ func Hash(sequence string, sequenceType SequenceType, circular bool, doubleStran
 	case !circular && !doubleStranded:
 		deterministicSequence = sequence
 	}
+	return deterministicSequence, nil
+}
+
+// Hash creates a version 1 seqhash.
+func Hash(sequence string, sequenceType SequenceType, circular bool, doubleStranded bool) (string, error) {
+	deterministicSequence, err := prepareDeterministicSequence(sequence, sequenceType, circular, doubleStranded)
+	if err != nil {
+		return "", err
+	}
 
 	// Build 3 letter metadata
 	var sequenceTypeLetter string
@@ -222,3 +262,183 @@ func Hash(sequence string, sequenceType SequenceType, circular bool, doubleStran
 	seqhash := "v1" + "_" + sequenceTypeLetter + circularLetter + doubleStrandedLetter + "_" + hex.EncodeToString(newhash[:])
 	return seqhash, nil
 }
+
+// The following consts are for seqhash version 2
+const (
+	// Define bit masks for each part of the flag
+	hash2versionMask        byte = 0b11110000 // Version occupies the first 4 bits
+	hash2circularityMask    byte = 0b00001000 // Circularity occupies the 5th bit
+	hash2doubleStrandedMask byte = 0b00000100 // Double-strandedness occupies the 6th bit
+	hash2typeMask           byte = 0b00000011 // DNA/RNA/PROTEIN occupies the last 2 bits
+
+	// Define shift counts for each part
+	hash2versionShift        = 4
+	hash2circularityShift    = 3
+	hash2doubleStrandedShift = 2
+
+	// Define enum values for DNA/RNA/PROTEIN
+	hash2DNA     byte = 0b00
+	hash2RNA     byte = 0b01
+	hash2PROTEIN byte = 0b10
+	// Other is represented by 0b11
+)
+
+var (
+	// sequenceTypeStringToByteFlagMap converts a sequenceType to a byte
+	sequenceTypeStringToByteFlagMap = map[SequenceType]byte{
+		DNA:      0b00,
+		RNA:      0b01,
+		PROTEIN:  0b10,
+		FRAGMENT: 0b11,
+	}
+	// sequenceTypeByteToStringFlagMap converts a byte to a sequenceType
+	sequenceTypeByteToStringFlagMap = map[byte]SequenceType{
+		0b00: DNA,
+		0b01: RNA,
+		0b10: PROTEIN,
+		0b11: FRAGMENT,
+	}
+)
+
+// EncodeFlag encodes the version, circularity, double-strandedness, and type into a single byte flag.
+func EncodeFlag(version int, sequenceType SequenceType, circularity bool, doubleStranded bool) byte {
+	var flag byte
+
+	// Encode the version (assuming version is in the range 0-15)
+	flag |= (byte(version) << hash2versionShift)
+
+	// Encode the circularity
+	if circularity {
+		flag |= (1 << hash2circularityShift)
+	}
+
+	// Encode the double-strandedness
+	if doubleStranded {
+		flag |= (1 << hash2doubleStrandedShift)
+	}
+
+	// Encode the DNA/RNA/PROTEIN
+	dnaRnaProtein := sequenceTypeStringToByteFlagMap[sequenceType]
+	flag |= (dnaRnaProtein & hash2typeMask)
+
+	return flag
+}
+
+// DecodeFlag decodes the single byte flag into its constituent parts.
+// Outputs: version, circularity, doubleStranded, dnaRnaProtein
+func DecodeFlag(flag byte) (int, SequenceType, bool, bool) {
+	version := int((flag & hash2versionMask) >> hash2versionShift)
+	circularity := (flag & hash2circularityMask) != 0
+	doubleStranded := (flag & hash2doubleStrandedMask) != 0
+	dnaRnaProtein := flag & hash2typeMask
+	sequenceType := sequenceTypeByteToStringFlagMap[dnaRnaProtein]
+
+	return version, sequenceType, circularity, doubleStranded
+}
+
+// Hash2 creates a version 2 seqhash.
+func Hash2(sequence string, sequenceType SequenceType, circular bool, doubleStranded bool) ([16]byte, error) {
+	var result [16]byte
+
+	// First, get the determistic sequence of the hash
+	deterministicSequence, err := prepareDeterministicSequence(sequence, sequenceType, circular, doubleStranded)
+	if err != nil {
+		return result, err
+	}
+
+	// Build our byte flag
+	flag := EncodeFlag(2, sequenceType, circular, doubleStranded)
+	result[0] = flag
+
+	// Compute BLAKE3, then copy those to the remaining 15 bytes
+	newhash := blake3.Sum256([]byte(deterministicSequence))
+	copy(result[1:], newhash[:15])
+
+	return result, nil
+}
+
+// Hash2Fragment creates a version 2 fragment seqhash. Fragment seqhashes are
+// a special kind of seqhash that are used to identify fragments, usually
+// released by restriction enzyme digestion, rather than complete DNA
+// sequences. This is very useful for tracking genetic parts in a database: as
+// abstractions away from their container vectors, so that many fragments in
+// different vectors can be identified consistently.
+//
+// fwdOverhangLength and revOverhangLength are the lengths of both overhangs.
+// Hashed sequences are hashed with their overhangs attached. Most of the time,
+// both of these will equal 4.
+//
+// threePrimeOverhangFwd and threePrimeOverhangRev are booleans for the
+// directionality of the overhangs. The majority of restriction enzymes cut
+// leaving 5prime overhangs, so these should nearly always be false, except
+// in very special cases. threePrimeOverhangFwd replaces circular (since
+// fragments will always be linear) and threePrimeOverhangRev replaces
+// doubleStranded (since fragments will always be double stranded).
+//
+// In order to make sure fwdOverhangLength and revOverhangLength fit in the
+// hash, the hash is truncated at 13 bytes rather than 15, and both uint8 are
+// inserted. So the bytes would be:
+//
+//	flag + fwdOverhangLength + revOverhangLength + [13]byte(hash)
+//
+// 13 bytes is considered enough, because the number of fragments is limited
+// by our ability to physically produce them, while other other sequence types
+// can be found in nature.
+func Hash2Fragment(sequence string, fwdOverhangLength uint8, revOverhangLength uint8, fwdThreePrimeOverhang bool, revThreePrimeOverhang bool) ([16]byte, error) {
+	var result [16]byte
+
+	// First, run checks and get the determistic sequence of the hash
+	for _, char := range sequence {
+		if !strings.Contains("ATUGCYRSWKMBDHVNZ", string(char)) {
+			return result, errors.New("Only letters ATUGCYRSWKMBDHVNZ are allowed for DNA/RNA. Got letter: " + string(char))
+		}
+	}
+	potentialSequences := []string{sequence, transform.ReverseComplement(sequence)}
+	sort.Strings(potentialSequences)
+	deterministicSequence := potentialSequences[0]
+
+	// Build our byte flag and copy length flags
+	flag := EncodeFlag(2, FRAGMENT, fwdThreePrimeOverhang, revThreePrimeOverhang)
+	result[0] = flag
+	result[1] = fwdOverhangLength
+	result[2] = revOverhangLength
+
+	// Compute BLAKE3, then copy those to the remaining 13 bytes
+	newhash := blake3.Sum256([]byte(deterministicSequence))
+	copy(result[3:], newhash[:13])
+
+	return result, nil
+}
+
+type Hash2MetadataKey struct {
+	SequenceType   SequenceType
+	Circular       bool
+	DoubleStranded bool
+}
+
+var Hash2Metadata = map[Hash2MetadataKey]rune{
+	Hash2MetadataKey{DNA, true, true}:        'A',
+	Hash2MetadataKey{DNA, true, false}:       'B',
+	Hash2MetadataKey{DNA, false, true}:       'C',
+	Hash2MetadataKey{DNA, false, false}:      'D',
+	Hash2MetadataKey{RNA, true, true}:        'E',
+	Hash2MetadataKey{RNA, true, false}:       'F',
+	Hash2MetadataKey{RNA, false, true}:       'G',
+	Hash2MetadataKey{RNA, false, false}:      'H',
+	Hash2MetadataKey{PROTEIN, false, false}:  'I',
+	Hash2MetadataKey{PROTEIN, true, false}:   'J',
+	Hash2MetadataKey{FRAGMENT, false, false}: 'K',
+	Hash2MetadataKey{FRAGMENT, true, false}:  'L',
+	Hash2MetadataKey{FRAGMENT, false, true}:  'M',
+	Hash2MetadataKey{FRAGMENT, true, true}:   'N',
+}
+
+// EncodeHash2 encodes Hash2 as a base64 string. It also adds a single
+// letter metadata tag that can be used as an easy heuristic for an LLM to
+// identify misbehaving code.
+func EncodeHash2(hash [16]byte, err error) (string, error) {
+	_, sequenceType, circularity, doubleStranded := DecodeFlag(hash[0])
+	encoded := base64.StdEncoding.EncodeToString(hash[:])
+
+	return string(Hash2Metadata[Hash2MetadataKey{sequenceType, circularity, doubleStranded}]) + "_" + encoded, err
+}
diff --git a/seqhash/seqhash_test.go b/seqhash/seqhash_test.go
index 5787cc36b..5181a593e 100644
--- a/seqhash/seqhash_test.go
+++ b/seqhash/seqhash_test.go
@@ -89,3 +89,38 @@ func TestLeastRotation(t *testing.T) {
 		}
 	}
 }
+
+func TestFlagEncoding(t *testing.T) {
+	version := 2
+	sequenceType := DNA
+	circularity := true
+	doubleStranded := true
+	flag := EncodeFlag(version, sequenceType, circularity, doubleStranded)
+	decodedVersion, decodedSequenceType, decodedCircularity, decodedDoubleStranded := DecodeFlag(flag)
+	if (decodedVersion != version) || (decodedSequenceType != sequenceType) || (decodedCircularity != circularity) || (decodedDoubleStranded != doubleStranded) {
+		t.Errorf("Got different decoded flag.")
+	}
+}
+
+func TestHash2(t *testing.T) {
+	// Test TNA as sequenceType
+	_, err := Hash2("ATGGGCTAA", "TNA", true, true)
+	if err == nil {
+		t.Errorf("TestHash2() has failed. TNA is not a valid sequenceType.")
+	}
+}
+
+func TestHash2Fragment(t *testing.T) {
+	// Test X failure
+	_, err := Hash2Fragment("ATGGGCTAX", 4, 4, false, false)
+	if err == nil {
+		t.Errorf("TestHash2Fragment() has failed. X is not a valid sequenceType.")
+	}
+	// Test actual hash
+	sqHash, _ := EncodeHash2(Hash2Fragment("ATGGGCTAA", 4, 4, false, false))
+	expectedHash := "K_IwQEwsn8RN9yA1CCoVLpSw=="
+	if sqHash != expectedHash {
+		t.Errorf("Expected %s, Got: %s", expectedHash, sqHash)
+	}
+
+}

From a5820ef105b20f7ce5ccef7d7c26a332d6e6b2db Mon Sep 17 00:00:00 2001
From: Keoni Gandall <koeng101@gmail.com>
Date: Thu, 9 Nov 2023 23:06:46 -0800
Subject: [PATCH 2/7] updated changelog

---
 CHANGELOG.md       | 3 ++-
 seqhash/seqhash.go | 6 +++++-
 2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index c2fe0c054..80a40f380 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -10,6 +10,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ### Added
 - Alternative start codons can now be used in the `synthesis/codon` DNA -> protein translation package (#305)
 - Added a parser and writer for the `pileup` sequence alignment format (#329)
+- Added seqhash v2 (#398)
 
 ### Fixed
 - `fastq` parser no longer becomes de-aligned when reading (#325)
@@ -19,4 +20,4 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 Oops, we weren't keeping a changelog before this tag!
 
 [unreleased]: https://github.com/TimothyStiles/poly/compare/v0.26.0...main
-[0.26.0]: https://github.com/TimothyStiles/poly/releases/tag/v0.26.0
\ No newline at end of file
+[0.26.0]: https://github.com/TimothyStiles/poly/releases/tag/v0.26.0
diff --git a/seqhash/seqhash.go b/seqhash/seqhash.go
index 8a1b383ad..e005fa676 100644
--- a/seqhash/seqhash.go
+++ b/seqhash/seqhash.go
@@ -301,6 +301,7 @@ var (
 )
 
 // EncodeFlag encodes the version, circularity, double-strandedness, and type into a single byte flag.
+// Used for seqhash v2
 func EncodeFlag(version int, sequenceType SequenceType, circularity bool, doubleStranded bool) byte {
 	var flag byte
 
@@ -325,7 +326,8 @@ func EncodeFlag(version int, sequenceType SequenceType, circularity bool, double
 }
 
 // DecodeFlag decodes the single byte flag into its constituent parts.
-// Outputs: version, circularity, doubleStranded, dnaRnaProtein
+// Outputs: version, sequenceType, circularity, doubleStranded.
+// Used for seqhash v2
 func DecodeFlag(flag byte) (int, SequenceType, bool, bool) {
 	version := int((flag & hash2versionMask) >> hash2versionShift)
 	circularity := (flag & hash2circularityMask) != 0
@@ -410,12 +412,14 @@ func Hash2Fragment(sequence string, fwdOverhangLength uint8, revOverhangLength u
 	return result, nil
 }
 
+// Hash2MetadataKey is a key for a seqhash v2 single letter metadata tag.
 type Hash2MetadataKey struct {
 	SequenceType   SequenceType
 	Circular       bool
 	DoubleStranded bool
 }
 
+// Hash2Metadata contains the seqhash v2 single letter metadata tags.
 var Hash2Metadata = map[Hash2MetadataKey]rune{
 	Hash2MetadataKey{DNA, true, true}:        'A',
 	Hash2MetadataKey{DNA, true, false}:       'B',

From 36de8b8ae53f05870614bc640f2ded825b814e7b Mon Sep 17 00:00:00 2001
From: Keoni Gandall <koeng101@gmail.com>
Date: Thu, 9 Nov 2023 23:09:19 -0800
Subject: [PATCH 3/7] make linter happy

---
 seqhash/seqhash.go      | 34 ++++++++++++++--------------------
 seqhash/seqhash_test.go |  1 -
 2 files changed, 14 insertions(+), 21 deletions(-)

diff --git a/seqhash/seqhash.go b/seqhash/seqhash.go
index e005fa676..20619edb2 100644
--- a/seqhash/seqhash.go
+++ b/seqhash/seqhash.go
@@ -275,12 +275,6 @@ const (
 	hash2versionShift        = 4
 	hash2circularityShift    = 3
 	hash2doubleStrandedShift = 2
-
-	// Define enum values for DNA/RNA/PROTEIN
-	hash2DNA     byte = 0b00
-	hash2RNA     byte = 0b01
-	hash2PROTEIN byte = 0b10
-	// Other is represented by 0b11
 )
 
 var (
@@ -421,20 +415,20 @@ type Hash2MetadataKey struct {
 
 // Hash2Metadata contains the seqhash v2 single letter metadata tags.
 var Hash2Metadata = map[Hash2MetadataKey]rune{
-	Hash2MetadataKey{DNA, true, true}:        'A',
-	Hash2MetadataKey{DNA, true, false}:       'B',
-	Hash2MetadataKey{DNA, false, true}:       'C',
-	Hash2MetadataKey{DNA, false, false}:      'D',
-	Hash2MetadataKey{RNA, true, true}:        'E',
-	Hash2MetadataKey{RNA, true, false}:       'F',
-	Hash2MetadataKey{RNA, false, true}:       'G',
-	Hash2MetadataKey{RNA, false, false}:      'H',
-	Hash2MetadataKey{PROTEIN, false, false}:  'I',
-	Hash2MetadataKey{PROTEIN, true, false}:   'J',
-	Hash2MetadataKey{FRAGMENT, false, false}: 'K',
-	Hash2MetadataKey{FRAGMENT, true, false}:  'L',
-	Hash2MetadataKey{FRAGMENT, false, true}:  'M',
-	Hash2MetadataKey{FRAGMENT, true, true}:   'N',
+	{DNA, true, true}:        'A',
+	{DNA, true, false}:       'B',
+	{DNA, false, true}:       'C',
+	{DNA, false, false}:      'D',
+	{RNA, true, true}:        'E',
+	{RNA, true, false}:       'F',
+	{RNA, false, true}:       'G',
+	{RNA, false, false}:      'H',
+	{PROTEIN, false, false}:  'I',
+	{PROTEIN, true, false}:   'J',
+	{FRAGMENT, false, false}: 'K',
+	{FRAGMENT, true, false}:  'L',
+	{FRAGMENT, false, true}:  'M',
+	{FRAGMENT, true, true}:   'N',
 }
 
 // EncodeHash2 encodes Hash2 as a base64 string. It also adds a single
diff --git a/seqhash/seqhash_test.go b/seqhash/seqhash_test.go
index 5181a593e..71fd3ea72 100644
--- a/seqhash/seqhash_test.go
+++ b/seqhash/seqhash_test.go
@@ -122,5 +122,4 @@ func TestHash2Fragment(t *testing.T) {
 	if sqHash != expectedHash {
 		t.Errorf("Expected %s, Got: %s", expectedHash, sqHash)
 	}
-
 }

From f716e5a72aa091b0c0eedad8b5bada1eeb57861e Mon Sep 17 00:00:00 2001
From: Keoni Gandall <koeng101@gmail.com>
Date: Fri, 17 Nov 2023 11:10:19 -0800
Subject: [PATCH 4/7] seqhash

---
 seqhash/seqhash.go      | 49 +++++++++++++++++++++++++++--------------
 seqhash/seqhash_test.go |  4 ++--
 2 files changed, 35 insertions(+), 18 deletions(-)

diff --git a/seqhash/seqhash.go b/seqhash/seqhash.go
index 20619edb2..7c9fe0ee9 100644
--- a/seqhash/seqhash.go
+++ b/seqhash/seqhash.go
@@ -362,25 +362,31 @@ func Hash2(sequence string, sequenceType SequenceType, circular bool, doubleStra
 //
 // fwdOverhangLength and revOverhangLength are the lengths of both overhangs.
 // Hashed sequences are hashed with their overhangs attached. Most of the time,
-// both of these will equal 4.
-//
-// threePrimeOverhangFwd and threePrimeOverhangRev are booleans for the
-// directionality of the overhangs. The majority of restriction enzymes cut
-// leaving 5prime overhangs, so these should nearly always be false, except
-// in very special cases. threePrimeOverhangFwd replaces circular (since
-// fragments will always be linear) and threePrimeOverhangRev replaces
-// doubleStranded (since fragments will always be double stranded).
+// both of these will equal 4, as they are released by TypeIIS restriction
+// enzymes.
 //
 // In order to make sure fwdOverhangLength and revOverhangLength fit in the
-// hash, the hash is truncated at 13 bytes rather than 15, and both uint8 are
+// hash, the hash is truncated at 13 bytes rather than 15, and both int8 are
 // inserted. So the bytes would be:
 //
 //	flag + fwdOverhangLength + revOverhangLength + [13]byte(hash)
 //
+// fwdOverhangLength and revOverhangLength are both int8, and their negatives
+// are considered if the overhang is on the 3prime strand, rather than the
+// 5prime strand.
+//
 // 13 bytes is considered enough, because the number of fragments is limited
 // by our ability to physically produce them, while other other sequence types
 // can be found in nature.
-func Hash2Fragment(sequence string, fwdOverhangLength uint8, revOverhangLength uint8, fwdThreePrimeOverhang bool, revThreePrimeOverhang bool) ([16]byte, error) {
+//
+// The fwdOverhang and revOverhang are the lengths of the overhangs of the
+// input sequence. The hash, however, contains the forward and reverse overhang
+// lengths of the deterministic sequence - ie, the alphabetically less-than
+// strand, when comparing the uppercase forward and reverse complement strand.
+// This means if the input sequence is not less than its reverse complement (for
+// example, GTT is greater than AAC), then the output hash will have the forward
+// and reverse overhang lengths of the reverse complement, not the input strand.
+func Hash2Fragment(sequence string, fwdOverhangLength int8, revOverhangLength int8) ([16]byte, error) {
 	var result [16]byte
 
 	// First, run checks and get the determistic sequence of the hash
@@ -389,15 +395,26 @@ func Hash2Fragment(sequence string, fwdOverhangLength uint8, revOverhangLength u
 			return result, errors.New("Only letters ATUGCYRSWKMBDHVNZ are allowed for DNA/RNA. Got letter: " + string(char))
 		}
 	}
-	potentialSequences := []string{sequence, transform.ReverseComplement(sequence)}
-	sort.Strings(potentialSequences)
-	deterministicSequence := potentialSequences[0]
+	sequence = strings.ToUpper(sequence)
+	var rev, fwd int8
+	var deterministicSequence string
+	reverseComplement := transform.ReverseComplement(sequence)
+	if sequence > reverseComplement {
+		// If the reverse complement is smaller, reverse the overhangs fwd and rev
+		rev = fwdOverhangLength
+		fwd = revOverhangLength
+		deterministicSequence = reverseComplement
+	} else {
+		fwd = fwdOverhangLength
+		rev = revOverhangLength
+		deterministicSequence = sequence
+	}
 
 	// Build our byte flag and copy length flags
-	flag := EncodeFlag(2, FRAGMENT, fwdThreePrimeOverhang, revThreePrimeOverhang)
+	flag := EncodeFlag(2, FRAGMENT, false, false)
 	result[0] = flag
-	result[1] = fwdOverhangLength
-	result[2] = revOverhangLength
+	result[1] = byte(fwd)
+	result[2] = byte(rev)
 
 	// Compute BLAKE3, then copy those to the remaining 13 bytes
 	newhash := blake3.Sum256([]byte(deterministicSequence))
diff --git a/seqhash/seqhash_test.go b/seqhash/seqhash_test.go
index 71fd3ea72..352f30c9c 100644
--- a/seqhash/seqhash_test.go
+++ b/seqhash/seqhash_test.go
@@ -112,12 +112,12 @@ func TestHash2(t *testing.T) {
 
 func TestHash2Fragment(t *testing.T) {
 	// Test X failure
-	_, err := Hash2Fragment("ATGGGCTAX", 4, 4, false, false)
+	_, err := Hash2Fragment("ATGGGCTAX", 4, 4)
 	if err == nil {
 		t.Errorf("TestHash2Fragment() has failed. X is not a valid sequenceType.")
 	}
 	// Test actual hash
-	sqHash, _ := EncodeHash2(Hash2Fragment("ATGGGCTAA", 4, 4, false, false))
+	sqHash, _ := EncodeHash2(Hash2Fragment("ATGGGCTAA", 4, 4))
 	expectedHash := "K_IwQEwsn8RN9yA1CCoVLpSw=="
 	if sqHash != expectedHash {
 		t.Errorf("Expected %s, Got: %s", expectedHash, sqHash)

From f12dba0a590c0bb7cf7f5402f988d5b40a69573d Mon Sep 17 00:00:00 2001
From: Keoni Gandall <koeng101@gmail.com>
Date: Wed, 29 Nov 2023 09:55:40 -0800
Subject: [PATCH 5/7] Hash2->HashV2

---
 seqhash/example_test.go |  6 +++---
 seqhash/seqhash.go      | 22 +++++++++++-----------
 seqhash/seqhash_test.go | 14 +++++++-------
 3 files changed, 21 insertions(+), 21 deletions(-)

diff --git a/seqhash/example_test.go b/seqhash/example_test.go
index 13274bfeb..a19679c98 100644
--- a/seqhash/example_test.go
+++ b/seqhash/example_test.go
@@ -14,7 +14,7 @@ func Example_basic() {
 	circular := false
 	doubleStranded := true
 
-	sequenceSeqhash, _ := seqhash.EncodeHash2(seqhash.Hash2(sequence, sequenceType, circular, doubleStranded))
+	sequenceSeqhash, _ := seqhash.EncodeHashV2(seqhash.HashV2(sequence, sequenceType, circular, doubleStranded))
 	fmt.Println(sequenceSeqhash)
 	// Output: C_JPQCj5PgjFwjy7jaoYmwqQ==
 }
@@ -39,13 +39,13 @@ func ExampleRotateSequence() {
 	// output: true
 }
 
-func ExampleHash2() {
+func ExampleHashV2() {
 	sequence := "ATGC"
 	sequenceType := seqhash.DNA
 	circular := false
 	doubleStranded := true
 
-	sequenceSeqhash, _ := seqhash.Hash2(sequence, sequenceType, circular, doubleStranded)
+	sequenceSeqhash, _ := seqhash.HashV2(sequence, sequenceType, circular, doubleStranded)
 	fmt.Println(sequenceSeqhash)
 	// Output: [36 244 2 143 147 224 140 92 35 203 184 218 161 137 176 169]
 }
diff --git a/seqhash/seqhash.go b/seqhash/seqhash.go
index 7c9fe0ee9..21daa9e73 100644
--- a/seqhash/seqhash.go
+++ b/seqhash/seqhash.go
@@ -332,8 +332,8 @@ func DecodeFlag(flag byte) (int, SequenceType, bool, bool) {
 	return version, sequenceType, circularity, doubleStranded
 }
 
-// Hash2 creates a version 2 seqhash.
-func Hash2(sequence string, sequenceType SequenceType, circular bool, doubleStranded bool) ([16]byte, error) {
+// HashV2 creates a version 2 seqhash.
+func HashV2(sequence string, sequenceType SequenceType, circular bool, doubleStranded bool) ([16]byte, error) {
 	var result [16]byte
 
 	// First, get the determistic sequence of the hash
@@ -353,7 +353,7 @@ func Hash2(sequence string, sequenceType SequenceType, circular bool, doubleStra
 	return result, nil
 }
 
-// Hash2Fragment creates a version 2 fragment seqhash. Fragment seqhashes are
+// HashV2Fragment creates a version 2 fragment seqhash. Fragment seqhashes are
 // a special kind of seqhash that are used to identify fragments, usually
 // released by restriction enzyme digestion, rather than complete DNA
 // sequences. This is very useful for tracking genetic parts in a database: as
@@ -386,7 +386,7 @@ func Hash2(sequence string, sequenceType SequenceType, circular bool, doubleStra
 // This means if the input sequence is not less than its reverse complement (for
 // example, GTT is greater than AAC), then the output hash will have the forward
 // and reverse overhang lengths of the reverse complement, not the input strand.
-func Hash2Fragment(sequence string, fwdOverhangLength int8, revOverhangLength int8) ([16]byte, error) {
+func HashV2Fragment(sequence string, fwdOverhangLength int8, revOverhangLength int8) ([16]byte, error) {
 	var result [16]byte
 
 	// First, run checks and get the determistic sequence of the hash
@@ -423,15 +423,15 @@ func Hash2Fragment(sequence string, fwdOverhangLength int8, revOverhangLength in
 	return result, nil
 }
 
-// Hash2MetadataKey is a key for a seqhash v2 single letter metadata tag.
-type Hash2MetadataKey struct {
+// HashV2MetadataKey is a key for a seqhash v2 single letter metadata tag.
+type HashV2MetadataKey struct {
 	SequenceType   SequenceType
 	Circular       bool
 	DoubleStranded bool
 }
 
-// Hash2Metadata contains the seqhash v2 single letter metadata tags.
-var Hash2Metadata = map[Hash2MetadataKey]rune{
+// HashV2Metadata contains the seqhash v2 single letter metadata tags.
+var HashV2Metadata = map[HashV2MetadataKey]rune{
 	{DNA, true, true}:        'A',
 	{DNA, true, false}:       'B',
 	{DNA, false, true}:       'C',
@@ -448,12 +448,12 @@ var Hash2Metadata = map[Hash2MetadataKey]rune{
 	{FRAGMENT, true, true}:   'N',
 }
 
-// EncodeHash2 encodes Hash2 as a base64 string. It also adds a single
+// EncodeHashV2 encodes HashV2 as a base64 string. It also adds a single
 // letter metadata tag that can be used as an easy heuristic for an LLM to
 // identify misbehaving code.
-func EncodeHash2(hash [16]byte, err error) (string, error) {
+func EncodeHashV2(hash [16]byte, err error) (string, error) {
 	_, sequenceType, circularity, doubleStranded := DecodeFlag(hash[0])
 	encoded := base64.StdEncoding.EncodeToString(hash[:])
 
-	return string(Hash2Metadata[Hash2MetadataKey{sequenceType, circularity, doubleStranded}]) + "_" + encoded, err
+	return string(HashV2Metadata[HashV2MetadataKey{sequenceType, circularity, doubleStranded}]) + "_" + encoded, err
 }
diff --git a/seqhash/seqhash_test.go b/seqhash/seqhash_test.go
index 352f30c9c..f4da6648d 100644
--- a/seqhash/seqhash_test.go
+++ b/seqhash/seqhash_test.go
@@ -102,22 +102,22 @@ func TestFlagEncoding(t *testing.T) {
 	}
 }
 
-func TestHash2(t *testing.T) {
+func TestHashV2(t *testing.T) {
 	// Test TNA as sequenceType
-	_, err := Hash2("ATGGGCTAA", "TNA", true, true)
+	_, err := HashV2("ATGGGCTAA", "TNA", true, true)
 	if err == nil {
-		t.Errorf("TestHash2() has failed. TNA is not a valid sequenceType.")
+		t.Errorf("TestHashV2() has failed. TNA is not a valid sequenceType.")
 	}
 }
 
-func TestHash2Fragment(t *testing.T) {
+func TestHashV2Fragment(t *testing.T) {
 	// Test X failure
-	_, err := Hash2Fragment("ATGGGCTAX", 4, 4)
+	_, err := HashV2Fragment("ATGGGCTAX", 4, 4)
 	if err == nil {
-		t.Errorf("TestHash2Fragment() has failed. X is not a valid sequenceType.")
+		t.Errorf("TestHashV2Fragment() has failed. X is not a valid sequenceType.")
 	}
 	// Test actual hash
-	sqHash, _ := EncodeHash2(Hash2Fragment("ATGGGCTAA", 4, 4))
+	sqHash, _ := EncodeHashV2(HashV2Fragment("ATGGGCTAA", 4, 4))
 	expectedHash := "K_IwQEwsn8RN9yA1CCoVLpSw=="
 	if sqHash != expectedHash {
 		t.Errorf("Expected %s, Got: %s", expectedHash, sqHash)

From 9ae38fab9b8a93af7d16d96901378c48e2505cd5 Mon Sep 17 00:00:00 2001
From: Timothy Stiles <tim@stiles.io>
Date: Wed, 29 Nov 2023 12:45:06 -0800
Subject: [PATCH 6/7] renamed fwd and rev

---
 seqhash/seqhash.go | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/seqhash/seqhash.go b/seqhash/seqhash.go
index 21daa9e73..f4ec1a82e 100644
--- a/seqhash/seqhash.go
+++ b/seqhash/seqhash.go
@@ -396,25 +396,25 @@ func HashV2Fragment(sequence string, fwdOverhangLength int8, revOverhangLength i
 		}
 	}
 	sequence = strings.ToUpper(sequence)
-	var rev, fwd int8
+	var forward, reverse int8
 	var deterministicSequence string
 	reverseComplement := transform.ReverseComplement(sequence)
 	if sequence > reverseComplement {
-		// If the reverse complement is smaller, reverse the overhangs fwd and rev
-		rev = fwdOverhangLength
-		fwd = revOverhangLength
+		// If the reverse complement is smaller, reverse the overhangs forward and reverse
+		forward = revOverhangLength
+		reverse = fwdOverhangLength
 		deterministicSequence = reverseComplement
 	} else {
-		fwd = fwdOverhangLength
-		rev = revOverhangLength
+		forward = fwdOverhangLength
+		reverse = revOverhangLength
 		deterministicSequence = sequence
 	}
 
 	// Build our byte flag and copy length flags
 	flag := EncodeFlag(2, FRAGMENT, false, false)
 	result[0] = flag
-	result[1] = byte(fwd)
-	result[2] = byte(rev)
+	result[1] = byte(forward)
+	result[2] = byte(reverse)
 
 	// Compute BLAKE3, then copy those to the remaining 13 bytes
 	newhash := blake3.Sum256([]byte(deterministicSequence))

From 44e9cced105e857ce8266558ebb01817f05da69a Mon Sep 17 00:00:00 2001
From: Keoni Gandall <koeng101@gmail.com>
Date: Wed, 6 Dec 2023 08:26:31 -0800
Subject: [PATCH 7/7] Add top level comment

---
 seqhash/seqhash.go | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/seqhash/seqhash.go b/seqhash/seqhash.go
index f4ec1a82e..5613ff0a5 100644
--- a/seqhash/seqhash.go
+++ b/seqhash/seqhash.go
@@ -3,6 +3,9 @@ Package seqhash contains the seqhash algorithm.
 
 This package contains the reference seqhash algorithm.
 
+If you are new to using seqhash, use V2. V1 should only be used in situations
+where full 256 rather than 120 bit hashing is needed.
+
 There is a big problem with current sequence databases - they all use different
 identifiers and accession numbers. This means cross-referencing databases is
 a complicated exercise, especially as the quantity of databases increases, or if