From 4b25a31739b35d74aff235d320839e3a481c25ae Mon Sep 17 00:00:00 2001 From: Giacomo Tartari Date: Sun, 23 Jul 2023 01:50:39 +0200 Subject: [PATCH] Seqfold (#295) * feat: add seqfold Add seqfold package, Go port of the python project at: https://github.com/Lattice-Automation/seqfold Add general useful types and maps of energies used to calculate the folding structures. Add to checks/checks.go a couple of function to determine if a sequence is DNA or RNA. This is need by `seqfold` to discriminate for example the energies used in the folding. * feat: add seqfold functions Add the functions to calculate the folding energies, the `Fold` function is the core function to calculate the folding structures and energies of a nucleotide sequence. The result can be use to print more informations, e.g. dot-bracket notation of the folded sequence. * feat: add seqfolg tests and example Add the unit tests ported from the python codebase and a small example. * renamed seqfold to fold. * renamed FoldEnergy to MinimumFreeEnergy. * renamed s to structure in most of fold.go. * renamed DNA and RNA energy constants. * renamed fc to foldContext * renamed H and S in energies to EnthalpyH and EntropyS. * changed NN to nearest neighbors. * renamed BP energy to Matching Basepair energy and s to structure. * more variable renaming. * renamed V and W. * renamed V and W and fixed errors... * fixed all underscored names in fold_test.go and moved to fold package. * fixed accidentally modified rebase test data. * made most function private. * more public to private functions and structs * random: add random.RNASequence * transform: add RNA specific transformations * fold: use the transform package for complement And some cleanup * added minimal biological context and package level comments. * fold: add fold.Result, un-export most of lib Add `fold.Resutl` to hold the result of fold.Fold and move there as methods `MinimumFreeEnergy()` and `DotBracket()`. In this way we can keep the API surface small without compromising the package functionality, more methods can be added to `fold.Result` in case we need them, e.g. `result.JSON()` to obtain a representation of the resulting structure in JSON or other formats. * fold: move 1600 magic number in a `const` Add some explanation as well. * fold: move the 30 magic number to a const Add a small explanation as well. * fold: rename variables Rename some variables in `unpairedMinimumFreeEnergyW()` * fold: move magic number 4 in a const It seem to be used as a minimum length for which stable nucleotide structures can be made. * fold: more variable renaming and cleanup * fold: more var and const rename and refactor * fold: more renaming * fold: more magic numbers lifted in constant * changed fold.Fold to fold.Zuker. * changed ExampleFold to ExampleZuker. --------- Co-authored-by: Timothy Stiles Co-authored-by: Tim --- checks/checks.go | 24 + checks/checks_test.go | 62 +++ fold/dna.go | 609 ++++++++++++++++++++++ fold/example_test.go | 14 + fold/fold.go | 890 ++++++++++++++++++++++++++++++++ fold/fold_test.go | 203 ++++++++ fold/rna.go | 899 +++++++++++++++++++++++++++++++++ fold/seqfold.go | 248 +++++++++ fold/utils.go | 19 + go.mod | 4 +- go.sum | 9 +- io/rebase/data/rebase_test.txt | 2 +- random/random.go | 19 +- transform/transform.go | 92 +++- transform/transform_test.go | 64 ++- 15 files changed, 3140 insertions(+), 18 deletions(-) create mode 100644 fold/dna.go create mode 100644 fold/example_test.go create mode 100644 fold/fold.go create mode 100644 fold/fold_test.go create mode 100644 fold/rna.go create mode 100644 fold/seqfold.go create mode 100644 fold/utils.go diff --git a/checks/checks.go b/checks/checks.go index 86d6d147..23a79740 100644 --- a/checks/checks.go +++ b/checks/checks.go @@ -23,3 +23,27 @@ func GcContent(sequence string) float64 { GuanineAndCytosinePercentage := float64(GuanineCount+CytosineCount) / float64(len(sequence)) return GuanineAndCytosinePercentage } + +func IsDNA(seq string) bool { + for _, base := range seq { + switch base { + case 'A', 'C', 'T', 'G': + continue + default: + return false + } + } + return true +} + +func IsRNA(seq string) bool { + for _, base := range seq { + switch base { + case 'A', 'C', 'U', 'G': + continue + default: + return false + } + } + return true +} diff --git a/checks/checks_test.go b/checks/checks_test.go index c73e2c06..d1862d97 100644 --- a/checks/checks_test.go +++ b/checks/checks_test.go @@ -24,3 +24,65 @@ func TestGcContent(t *testing.T) { t.Errorf("GcContent did not properly calculate GC content") } } + +func TestIsDNA(t *testing.T) { + tests := []struct { + name string + args string + want bool + }{ + { + name: "Success", + args: "GATTACA", + want: true, + }, + { + name: "FailRNA", + args: "GAUUACA", + want: false, + }, + { + name: "FailUnknown", + args: "RANDOM STRING", + want: false, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if got := checks.IsDNA(tt.args); got != tt.want { + t.Errorf("IsDNA() = %v, want %v", got, tt.want) + } + }) + } +} + +func TestIsRNA(t *testing.T) { + tests := []struct { + name string + args string + want bool + }{ + { + name: "Success", + args: "GAUUACA", + want: true, + }, + { + name: "FailDNA", + args: "GATTACA", + want: false, + }, + { + name: "FailUnknown", + args: "RANDOM STRING", + want: false, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if got := checks.IsRNA(tt.args); got != tt.want { + t.Errorf("IsRNA() = %v, want %v", got, tt.want) + } + }) + } +} diff --git a/fold/dna.go b/fold/dna.go new file mode 100644 index 00000000..4ed2c416 --- /dev/null +++ b/fold/dna.go @@ -0,0 +1,609 @@ +package fold + +import "github.com/TimothyStiles/poly/transform" + +var dnaComplement = transform.ComplementBase + +var dnaMultibranch = multibranchEnergies{helicesCount: 2.6, unpairedCount: 0.2, coaxialStackCount: 0.2, terminalMismatchCount: 2.0} + +// The Thermodynamics of DNA Structural Motifs +// SantaLucia and Hicks, 2004 +var dnaNearestNeighbors = matchingBasepairEnergy{"AA/TT": energy{enthalpyH: -7.6, entropyS: -21.3}, + "AC/TG": {enthalpyH: -8.4, entropyS: -22.4}, + "AG/TC": {enthalpyH: -7.8, entropyS: -21}, + "AT/TA": {enthalpyH: -7.2, entropyS: -20.4}, + "C/G_tini": {enthalpyH: 0, entropyS: 0}, + "CA/GT": {enthalpyH: -8.5, entropyS: -22.7}, + "CC/GG": {enthalpyH: -8, entropyS: -19.9}, + "CG/GC": {enthalpyH: -10.6, entropyS: -27.2}, + "CT/GA": {enthalpyH: -7.8, entropyS: -21}, + "GA/CT": {enthalpyH: -8.2, entropyS: -22.2}, + "GC/CG": {enthalpyH: -9.8, entropyS: -24.4}, + "GG/CC": {enthalpyH: -8, entropyS: -19.9}, + "GT/CA": {enthalpyH: -8.4, entropyS: -22.4}, + "T/A_tini": {enthalpyH: 2.2, entropyS: 6.9}, + "TA/AT": {enthalpyH: -7.2, entropyS: -21.3}, + "TC/AG": {enthalpyH: -8.2, entropyS: -22.2}, + "TG/AC": {enthalpyH: -8.5, entropyS: -22.7}, + "TT/AA": {enthalpyH: -7.6, entropyS: -21.3}, + "init": {enthalpyH: 0.2, entropyS: -5.7}, + "init_A/T": {enthalpyH: 2.2, entropyS: 6.9}, + "init_G/C": {enthalpyH: 0, entropyS: 0}, + "mys": {enthalpyH: 0, entropyS: -1.4}, + "sym": {enthalpyH: 0, entropyS: -1.4}, + "tini": {enthalpyH: 0.2, entropyS: -5.7}} + +// DNAInternalMismatches is an Internal mismatch table (DNA) +// Allawi & SantaLucia (1997), Biochemistry 36: 10581-10594 +// Allawi & SantaLucia (1998), Biochemistry 37: 9435-9444 +// Allawi & SantaLucia (1998), Biochemistry 37: 2170-2179 * +// Allawi & SantaLucia (1998), Nucl Acids Res 26: 2694-2701 * +// Peyret et al. (1999), Biochemistry 38: 3468-3477 * +var dnaInternalMismatches = matchingBasepairEnergy{"AA/AT": energy{enthalpyH: 4.7, entropyS: 12.9}, + "AA/CT": {enthalpyH: 7.6, entropyS: 20.2}, + "AA/GT": {enthalpyH: 3, entropyS: 7.4}, + "AA/TA": {enthalpyH: 1.2, entropyS: 1.7}, + "AA/TC": {enthalpyH: 2.3, entropyS: 4.6}, + "AA/TG": {enthalpyH: -0.6, entropyS: -2.3}, + "AC/AG": {enthalpyH: -2.9, entropyS: -9.8}, + "AC/CG": {enthalpyH: -0.7, entropyS: -3.8}, + "AC/GG": {enthalpyH: 0.5, entropyS: 3.2}, + "AC/TA": {enthalpyH: 5.3, entropyS: 14.6}, + "AC/TC": {enthalpyH: 0, entropyS: -4.4}, + "AC/TT": {enthalpyH: 0.7, entropyS: 0.2}, + "AG/AC": {enthalpyH: -0.9, entropyS: -4.2}, + "AG/CC": {enthalpyH: 0.6, entropyS: -0.6}, + "AG/GC": {enthalpyH: -4, entropyS: -13.2}, + "AG/TA": {enthalpyH: -0.7, entropyS: -2.3}, + "AG/TG": {enthalpyH: -3.1, entropyS: -9.5}, + "AG/TT": {enthalpyH: 1, entropyS: 0.9}, + "AT/AA": {enthalpyH: 1.2, entropyS: 1.7}, + "AT/CA": {enthalpyH: 5.3, entropyS: 14.6}, + "AT/GA": {enthalpyH: -0.7, entropyS: -2.3}, + "AT/TC": {enthalpyH: -1.2, entropyS: -6.2}, + "AT/TG": {enthalpyH: -2.5, entropyS: -8.3}, + "AT/TT": {enthalpyH: -2.7, entropyS: -10.8}, + "CA/AT": {enthalpyH: 3.4, entropyS: 8}, + "CA/CT": {enthalpyH: 6.1, entropyS: 16.4}, + "CA/GA": {enthalpyH: -0.9, entropyS: -4.2}, + "CA/GC": {enthalpyH: 1.9, entropyS: 3.7}, + "CA/GG": {enthalpyH: -0.7, entropyS: -2.3}, + "CA/TT": {enthalpyH: 1, entropyS: 0.7}, + "CC/AG": {enthalpyH: 5.2, entropyS: 14.2}, + "CC/CG": {enthalpyH: 3.6, entropyS: 8.9}, + "CC/GA": {enthalpyH: 0.6, entropyS: -0.6}, + "CC/GC": {enthalpyH: -1.5, entropyS: -7.2}, + "CC/GT": {enthalpyH: -0.8, entropyS: -4.5}, + "CC/TG": {enthalpyH: 5.2, entropyS: 13.5}, + "CG/AC": {enthalpyH: 1.9, entropyS: 3.7}, + "CG/CC": {enthalpyH: -1.5, entropyS: -7.2}, + "CG/GA": {enthalpyH: -4, entropyS: -13.2}, + "CG/GG": {enthalpyH: -4.9, entropyS: -15.3}, + "CG/GT": {enthalpyH: -4.1, entropyS: -11.7}, + "CG/TC": {enthalpyH: -1.5, entropyS: -6.1}, + "CT/AA": {enthalpyH: 2.3, entropyS: 4.6}, + "CT/CA": {enthalpyH: 0, entropyS: -4.4}, + "CT/GC": {enthalpyH: -1.5, entropyS: -6.1}, + "CT/GG": {enthalpyH: -2.8, entropyS: -8}, + "CT/GT": {enthalpyH: -5, entropyS: -15.8}, + "CT/TA": {enthalpyH: -1.2, entropyS: -6.2}, + "GA/AT": {enthalpyH: 0.7, entropyS: 0.7}, + "GA/CA": {enthalpyH: -2.9, entropyS: -9.8}, + "GA/CC": {enthalpyH: 5.2, entropyS: 14.2}, + "GA/CG": {enthalpyH: -0.6, entropyS: -1}, + "GA/GT": {enthalpyH: 1.6, entropyS: 3.6}, + "GA/TT": {enthalpyH: -1.3, entropyS: -5.3}, + "GC/AG": {enthalpyH: -0.6, entropyS: -1}, + "GC/CA": {enthalpyH: -0.7, entropyS: -3.8}, + "GC/CC": {enthalpyH: 3.6, entropyS: 8.9}, + "GC/CT": {enthalpyH: 2.3, entropyS: 5.4}, + "GC/GG": {enthalpyH: -6, entropyS: -15.8}, + "GC/TG": {enthalpyH: -4.4, entropyS: -12.3}, + "GG/AC": {enthalpyH: -0.7, entropyS: -2.3}, + "GG/CA": {enthalpyH: 0.5, entropyS: 3.2}, + "GG/CG": {enthalpyH: -6, entropyS: -15.8}, + "GG/CT": {enthalpyH: 3.3, entropyS: 10.4}, + "GG/GC": {enthalpyH: -4.9, entropyS: -15.3}, + "GG/TC": {enthalpyH: -2.8, entropyS: -8}, + "GG/TT": {enthalpyH: 5.8, entropyS: 16.3}, + "GT/AA": {enthalpyH: -0.6, entropyS: -2.3}, + "GT/CC": {enthalpyH: 5.2, entropyS: 13.5}, + "GT/CG": {enthalpyH: -4.4, entropyS: -12.3}, + "GT/CT": {enthalpyH: -2.2, entropyS: -8.4}, + "GT/GA": {enthalpyH: -3.1, entropyS: -9.5}, + "GT/TA": {enthalpyH: -2.5, entropyS: -8.3}, + "GT/TG": {enthalpyH: 4.1, entropyS: 9.5}, + "TA/AA": {enthalpyH: 4.7, entropyS: 12.9}, + "TA/AC": {enthalpyH: 3.4, entropyS: 8}, + "TA/AG": {enthalpyH: 0.7, entropyS: 0.7}, + "TA/CT": {enthalpyH: 1.2, entropyS: 0.7}, + "TA/GT": {enthalpyH: -0.1, entropyS: -1.7}, + "TA/TT": {enthalpyH: 0.2, entropyS: -1.5}, + "TC/AA": {enthalpyH: 7.6, entropyS: 20.2}, + "TC/AC": {enthalpyH: 6.1, entropyS: 16.4}, + "TC/AT": {enthalpyH: 1.2, entropyS: 0.7}, + "TC/CG": {enthalpyH: 2.3, entropyS: 5.4}, + "TC/GG": {enthalpyH: 3.3, entropyS: 10.4}, + "TC/TG": {enthalpyH: -2.2, entropyS: -8.4}, + "TG/AA": {enthalpyH: 3, entropyS: 7.4}, + "TG/AG": {enthalpyH: 1.6, entropyS: 3.6}, + "TG/AT": {enthalpyH: -0.1, entropyS: -1.7}, + "TG/CC": {enthalpyH: -0.8, entropyS: -4.5}, + "TG/GC": {enthalpyH: -4.1, entropyS: -11.7}, + "TG/GT": {enthalpyH: -1.4, entropyS: -6.2}, + "TG/TC": {enthalpyH: -5, entropyS: -15.8}, + "TT/AC": {enthalpyH: 1, entropyS: 0.7}, + "TT/AG": {enthalpyH: -1.3, entropyS: -5.3}, + "TT/AT": {enthalpyH: 0.2, entropyS: -1.5}, + "TT/CA": {enthalpyH: 0.7, entropyS: 0.2}, + "TT/GA": {enthalpyH: 1, entropyS: 0.9}, + "TT/GG": {enthalpyH: 5.8, entropyS: 16.3}, + "TT/TA": {enthalpyH: -2.7, entropyS: -10.8}, +} + +// DNATerminalMismatches is a terminal mismatch table for DNA found at terminating ends of a structure +// SantaLucia & Peyret (2001) Patent Application WO 01/94611 +var dnaTerminalMismatches = matchingBasepairEnergy{ + "AA/AT": energy{enthalpyH: -2.5, entropyS: -6.3}, + "AA/CT": {enthalpyH: -2.7, entropyS: -7}, + "AA/GT": {enthalpyH: -2.4, entropyS: -5.8}, + "AA/TA": {enthalpyH: -3.1, entropyS: -7.8}, + "AA/TC": {enthalpyH: -1.6, entropyS: -4}, + "AA/TG": {enthalpyH: -1.9, entropyS: -4.4}, + "AC/AG": {enthalpyH: -8, entropyS: -22.5}, + "AC/CG": {enthalpyH: -3.2, entropyS: -7.1}, + "AC/GG": {enthalpyH: -4.6, entropyS: -11.4}, + "AC/TA": {enthalpyH: -1.8, entropyS: -3.8}, + "AC/TC": {enthalpyH: -0.1, entropyS: 0.5}, + "AC/TT": {enthalpyH: -0.9, entropyS: -1.7}, + "AG/AC": {enthalpyH: -4.3, entropyS: -10.7}, + "AG/CC": {enthalpyH: -2.7, entropyS: -6}, + "AG/GC": {enthalpyH: -6, entropyS: -15.5}, + "AG/TA": {enthalpyH: -2.5, entropyS: -5.9}, + "AG/TG": {enthalpyH: -1.1, entropyS: -2.1}, + "AG/TT": {enthalpyH: -3.2, entropyS: -8.7}, + "AT/AA": {enthalpyH: -3.1, entropyS: -7.8}, + "AT/CA": {enthalpyH: -1.8, entropyS: -3.8}, + "AT/GA": {enthalpyH: -2.5, entropyS: -5.9}, + "AT/TC": {enthalpyH: -2.3, entropyS: -6.3}, + "AT/TG": {enthalpyH: -3.5, entropyS: -9.4}, + "AT/TT": {enthalpyH: -2.4, entropyS: -6.5}, + "CA/AT": {enthalpyH: -2.3, entropyS: -5.9}, + "CA/CT": {enthalpyH: -0.7, entropyS: -1.3}, + "CA/GA": {enthalpyH: -4.3, entropyS: -10.7}, + "CA/GC": {enthalpyH: -2.6, entropyS: -5.9}, + "CA/GG": {enthalpyH: -3.9, entropyS: -9.6}, + "CA/TT": {enthalpyH: -0.7, entropyS: -1.2}, + "CC/AG": {enthalpyH: -5, entropyS: -13.8}, + "CC/CG": {enthalpyH: -3.9, entropyS: -10.6}, + "CC/GA": {enthalpyH: -2.7, entropyS: -6}, + "CC/GC": {enthalpyH: -2.1, entropyS: -5.1}, + "CC/GT": {enthalpyH: -3.2, entropyS: -8}, + "CC/TG": {enthalpyH: -3, entropyS: -7.8}, + "CG/AC": {enthalpyH: -2.6, entropyS: -5.9}, + "CG/CC": {enthalpyH: -2.1, entropyS: -5.1}, + "CG/GA": {enthalpyH: -6, entropyS: -15.5}, + "CG/GG": {enthalpyH: -3.8, entropyS: -9.5}, + "CG/GT": {enthalpyH: -3.8, entropyS: -9}, + "CG/TC": {enthalpyH: -3.9, entropyS: -10.6}, + "CT/AA": {enthalpyH: -1.6, entropyS: -4}, + "CT/CA": {enthalpyH: -0.1, entropyS: 0.5}, + "CT/GC": {enthalpyH: -3.9, entropyS: -10.6}, + "CT/GG": {enthalpyH: -6.6, entropyS: -18.7}, + "CT/GT": {enthalpyH: -6.1, entropyS: -16.9}, + "CT/TA": {enthalpyH: -2.3, entropyS: -6.3}, + "GA/AT": {enthalpyH: -2, entropyS: -4.7}, + "GA/CA": {enthalpyH: -8, entropyS: -22.5}, + "GA/CC": {enthalpyH: -5, entropyS: -13.8}, + "GA/CG": {enthalpyH: -4.3, entropyS: -11.1}, + "GA/GT": {enthalpyH: -1.1, entropyS: -2.7}, + "GA/TT": {enthalpyH: -3.6, entropyS: -9.8}, + "GC/AG": {enthalpyH: -4.3, entropyS: -11.1}, + "GC/CA": {enthalpyH: -3.2, entropyS: -7.1}, + "GC/CC": {enthalpyH: -3.9, entropyS: -10.6}, + "GC/CT": {enthalpyH: -4.9, entropyS: -13.5}, + "GC/GG": {enthalpyH: -0.7, entropyS: -19.2}, + "GC/TG": {enthalpyH: -5.9, entropyS: -16.1}, + "GG/AC": {enthalpyH: -3.9, entropyS: -9.6}, + "GG/CA": {enthalpyH: -4.6, entropyS: -11.4}, + "GG/CG": {enthalpyH: -0.7, entropyS: -19.2}, + "GG/CT": {enthalpyH: -5.7, entropyS: -15.9}, + "GG/GC": {enthalpyH: -3.8, entropyS: -9.5}, + "GG/TC": {enthalpyH: -6.6, entropyS: -18.7}, + "GT/AA": {enthalpyH: -1.9, entropyS: -4.4}, + "GT/CC": {enthalpyH: -3, entropyS: -7.8}, + "GT/CG": {enthalpyH: -5.9, entropyS: -16.1}, + "GT/CT": {enthalpyH: -7.4, entropyS: -21.2}, + "GT/GA": {enthalpyH: -1.1, entropyS: -2.1}, + "GT/TA": {enthalpyH: -3.5, entropyS: -9.4}, + "TA/AA": {enthalpyH: -2.5, entropyS: -6.3}, + "TA/AC": {enthalpyH: -2.3, entropyS: -5.9}, + "TA/AG": {enthalpyH: -2, entropyS: -4.7}, + "TA/CT": {enthalpyH: -2.5, entropyS: -6.3}, + "TA/GT": {enthalpyH: -3.9, entropyS: -10.5}, + "TA/TT": {enthalpyH: -3.2, entropyS: -8.9}, + "TC/AA": {enthalpyH: -2.7, entropyS: -7}, + "TC/AC": {enthalpyH: -0.7, entropyS: -1.3}, + "TC/AT": {enthalpyH: -2.5, entropyS: -6.3}, + "TC/CG": {enthalpyH: -4.9, entropyS: -13.5}, + "TC/GG": {enthalpyH: -5.7, entropyS: -15.9}, + "TC/TG": {enthalpyH: -7.4, entropyS: -21.2}, + "TG/AA": {enthalpyH: -2.4, entropyS: -5.8}, + "TG/AG": {enthalpyH: -1.1, entropyS: -2.7}, + "TG/AT": {enthalpyH: -3.9, entropyS: -10.5}, + "TG/CC": {enthalpyH: -3.2, entropyS: -8}, + "TG/GC": {enthalpyH: -3.8, entropyS: -9}, + "TG/TC": {enthalpyH: -6.1, entropyS: -16.9}, + "TT/AC": {enthalpyH: -0.7, entropyS: -1.2}, + "TT/AG": {enthalpyH: -3.6, entropyS: -9.8}, + "TT/AT": {enthalpyH: -3.2, entropyS: -8.9}, + "TT/CA": {enthalpyH: -0.9, entropyS: -1.7}, + "TT/GA": {enthalpyH: -3.2, entropyS: -8.7}, + "TT/TA": {enthalpyH: -2.4, entropyS: -6.5}, +} + +// DNA dangling ends +// +// Bommarito et al. (2000), Nucl Acids Res 28: 1929-1934 +var dnaDanglingEnds = matchingBasepairEnergy{ + "AA/.T": {enthalpyH: 0.2, entropyS: 2.3}, + "AA/T.": {enthalpyH: -0.5, entropyS: -1.1}, + ".A/AT": {enthalpyH: -0.7, entropyS: -0.8}, + "AC/.G": {enthalpyH: -6.3, entropyS: -17.1}, + ".A/CT": {enthalpyH: 4.4, entropyS: 14.9}, + "AC/T.": {enthalpyH: 4.7, entropyS: 14.2}, + "AG/.C": {enthalpyH: -3.7, entropyS: -10}, + ".A/GT": {enthalpyH: -1.6, entropyS: -3.6}, + "AG/T.": {enthalpyH: -4.1, entropyS: -13.1}, + "A./TA": {enthalpyH: -2.9, entropyS: -7.6}, + "AT/.A": {enthalpyH: -2.9, entropyS: -7.6}, + "A./TC": {enthalpyH: -4.1, entropyS: -13}, + "A./TG": {enthalpyH: -4.2, entropyS: -15}, + "A./TT": {enthalpyH: -0.2, entropyS: -0.5}, + ".A/TT": {enthalpyH: 2.9, entropyS: 10.4}, + "AT/T.": {enthalpyH: -3.8, entropyS: -12.6}, + ".C/AG": {enthalpyH: -2.1, entropyS: -3.9}, + "CA/G.": {enthalpyH: -5.9, entropyS: -16.5}, + "CA/.T": {enthalpyH: 0.6, entropyS: 3.3}, + ".C/CG": {enthalpyH: -0.2, entropyS: -0.1}, + "CC/G.": {enthalpyH: -2.6, entropyS: -7.4}, + "CC/.G": {enthalpyH: -4.4, entropyS: -12.6}, + "C./GA": {enthalpyH: -3.7, entropyS: -10}, + "C./GC": {enthalpyH: -4, entropyS: -11.9}, + "CG/.C": {enthalpyH: -4, entropyS: -11.9}, + "CG/G.": {enthalpyH: -3.2, entropyS: -10.4}, + "C./GG": {enthalpyH: -3.9, entropyS: -10.9}, + ".C/GG": {enthalpyH: -3.9, entropyS: -11.2}, + "C./GT": {enthalpyH: -4.9, entropyS: -13.8}, + "CT/.A": {enthalpyH: -4.1, entropyS: -13}, + ".C/TG": {enthalpyH: -4.4, entropyS: -13.1}, + "CT/G.": {enthalpyH: -5.2, entropyS: -15}, + "GA/C.": {enthalpyH: -2.1, entropyS: -3.9}, + ".G/AC": {enthalpyH: -5.9, entropyS: -16.5}, + "GA/.T": {enthalpyH: -1.1, entropyS: -1.6}, + "G./CA": {enthalpyH: -6.3, entropyS: -17.1}, + "GC/C.": {enthalpyH: -0.2, entropyS: -0.1}, + ".G/CC": {enthalpyH: -2.6, entropyS: -7.4}, + "G./CC": {enthalpyH: -4.4, entropyS: -12.6}, + "G./CG": {enthalpyH: -5.1, entropyS: -14}, + "GC/.G": {enthalpyH: -5.1, entropyS: -14}, + "G./CT": {enthalpyH: -4, entropyS: -10.9}, + ".G/GC": {enthalpyH: -3.2, entropyS: -10.4}, + "GG/.C": {enthalpyH: -3.9, entropyS: -10.9}, + "GG/C.": {enthalpyH: -3.9, entropyS: -11.2}, + "GT/.A": {enthalpyH: -4.2, entropyS: -15}, + "GT/C.": {enthalpyH: -4.4, entropyS: -13.1}, + ".G/TC": {enthalpyH: -5.2, entropyS: -15}, + "T./AA": {enthalpyH: 0.2, entropyS: 2.3}, + ".T/AA": {enthalpyH: -0.5, entropyS: -1.1}, + "TA/A.": {enthalpyH: -0.7, entropyS: -0.8}, + "T./AC": {enthalpyH: 0.6, entropyS: 3.3}, + "T./AG": {enthalpyH: -1.1, entropyS: -1.6}, + "T./AT": {enthalpyH: -6.9, entropyS: -20}, + "TA/.T": {enthalpyH: -6.9, entropyS: -20}, + "TC/A.": {enthalpyH: 4.4, entropyS: 14.9}, + ".T/CA": {enthalpyH: 4.7, entropyS: 14.2}, + "TC/.G": {enthalpyH: -4, entropyS: -10.9}, + "TG/A.": {enthalpyH: -1.6, entropyS: -3.6}, + ".T/GA": {enthalpyH: -4.1, entropyS: -13.1}, + "TG/.C": {enthalpyH: -4.9, entropyS: -13.8}, + "TT/.A": {enthalpyH: -0.2, entropyS: -0.5}, + "TT/A.": {enthalpyH: 2.9, entropyS: 10.4}, + ".T/TA": {enthalpyH: -3.8, entropyS: -12.6}, +} + +// Experimental delta EnthalpyH and delta EntropyS for tri/tetra loops +// +// Supplemental Material: Annu.Rev.Biophs.Biomol.Struct.33:415-40 +// doi: 10.1146/annurev.biophys.32.110601.141800 +// The Thermodynamics of DNA Structural Motifs +// SantaLucia and Hicks, 2004 +// +// delta EntropyS was computed using delta G and delta EnthalpyH and is in cal / (K x mol) +// (versus delta EnthalpyH in kcal / mol) +var dnaTriTetraLoops = matchingBasepairEnergy{ + "AGAAT": {-1.5, 0.0}, + "AGCAT": {-1.5, 0.0}, + "AGGAT": {-1.5, 0.0}, + "AGTAT": {-1.5, 0.0}, + "CGAAG": {-2.0, 0.0}, + "CGCAG": {-2.0, 0.0}, + "CGGAG": {-2.0, 0.0}, + "CGTAG": {-2.0, 0.0}, + "GGAAC": {-2.0, 0.0}, + "GGCAC": {-2.0, 0.0}, + "GGGAC": {-2.0, 0.0}, + "GGTAC": {-2.0, 0.0}, + "TGAAA": {-1.5, 0.0}, + "TGCAA": {-1.5, 0.0}, + "TGGAA": {-1.5, 0.0}, + "TGTAA": {-1.5, 0.0}, + "AAAAAT": {0.5, 0.6}, + "AAAACT": {0.7, -1.6}, + "AAACAT": {1.0, -1.6}, + "ACTTGT": {0.0, -4.2}, + "AGAAAT": {-1.1, -1.6}, + "AGAGAT": {-1.1, -1.6}, + "AGATAT": {-1.5, -1.6}, + "AGCAAT": {-1.6, -1.6}, + "AGCGAT": {-1.1, -1.6}, + "AGCTTT": {0.2, -1.6}, + "AGGAAT": {-1.1, -1.6}, + "AGGGAT": {-1.1, -1.6}, + "AGGGGT": {0.5, -0.6}, + "AGTAAT": {-1.6, -1.6}, + "AGTGAT": {-1.1, -1.6}, + "AGTTCT": {0.8, -1.6}, + "ATTCGT": {-0.2, -1.6}, + "ATTTGT": {0.0, -1.6}, + "ATTTTT": {-0.5, -1.6}, + "CAAAAG": {0.5, 1.3}, + "CAAACG": {0.7, 0.0}, + "CAACAG": {1.0, 0.0}, + "CAACCG": {0.0, 0.0}, + "CCTTGG": {0.0, -2.6}, + "CGAAAG": {-1.1, 0.0}, + "CGAGAG": {-1.1, 0.0}, + "CGATAG": {-1.5, 0.0}, + "CGCAAG": {-1.6, 0.0}, + "CGCGAG": {-1.1, 0.0}, + "CGCTTG": {0.2, 0.0}, + "CGGAAG": {-1.1, 0.0}, + "CGGGAG": {-1.0, 0.0}, + "CGGGGG": {0.5, 1.0}, + "CGTAAG": {-1.6, 0.0}, + "CGTGAG": {-1.1, 0.0}, + "CGTTCG": {0.8, 0.0}, + "CTTCGG": {-0.2, 0.0}, + "CTTTGG": {0.0, 0.0}, + "CTTTTG": {-0.5, 0.0}, + "GAAAAC": {0.5, 3.2}, + "GAAACC": {0.7, 0.0}, + "GAACAC": {1.0, 0.0}, + "GCTTGC": {0.0, -2.6}, + "GGAAAC": {-1.1, 0.0}, + "GGAGAC": {-1.1, 0.0}, + "GGATAC": {-1.6, 0.0}, + "GGCAAC": {-1.6, 0.0}, + "GGCGAC": {-1.1, 0.0}, + "GGCTTC": {0.2, 0.0}, + "GGGAAC": {-1.1, 0.0}, + "GGGGAC": {-1.1, 0.0}, + "GGGGGC": {0.5, 1.0}, + "GGTAAC": {-1.6, 0.0}, + "GGTGAC": {-1.1, 0.0}, + "GGTTCC": {0.8, 0.0}, + "GTTCGC": {-0.2, 0.0}, + "GTTTGC": {0.0, 0.0}, + "GTTTTC": {-0.5, 0.0}, + "GAAAAT": {0.5, 3.2}, + "GAAACT": {1.0, 0.0}, + "GAACAT": {1.0, 0.0}, + "GCTTGT": {0.0, -1.6}, + "GGAAAT": {-1.1, 0.0}, + "GGAGAT": {-1.1, 0.0}, + "GGATAT": {-1.6, 0.0}, + "GGCAAT": {-1.6, 0.0}, + "GGCGAT": {-1.1, 0.0}, + "GGCTTT": {-0.1, 0.0}, + "GGGAAT": {-1.1, 0.0}, + "GGGGAT": {-1.1, 0.0}, + "GGGGGT": {0.5, 1.0}, + "GGTAAT": {-1.6, 0.0}, + "GGTGAT": {-1.1, 0.0}, + "GTATAT": {-0.5, 0.0}, + "GTTCGT": {-0.4, 0.0}, + "GTTTGT": {-0.4, 0.0}, + "GTTTTT": {-0.5, 0.0}, + "TAAAAA": {0.5, -0.3}, + "TAAACA": {0.7, -1.6}, + "TAACAA": {1.0, -1.6}, + "TCTTGA": {0.0, -4.2}, + "TGAAAA": {-1.1, -1.6}, + "TGAGAA": {-1.1, -1.6}, + "TGATAA": {-1.6, -1.6}, + "TGCAAA": {-1.6, -1.6}, + "TGCGAA": {-1.1, -1.6}, + "TGCTTA": {0.2, -1.6}, + "TGGAAA": {-1.1, -1.6}, + "TGGGAA": {-1.1, -1.6}, + "TGGGGA": {0.5, -0.6}, + "TGTAAA": {-1.6, -1.6}, + "TGTGAA": {-1.1, -1.6}, + "TGTTCA": {0.8, -1.6}, + "TTTCGA": {-0.2, -1.6}, + "TTTTGA": {0.0, -1.6}, + "TTTTTA": {-0.5, -1.6}, + "TAAAAG": {0.5, 1.6}, + "TAAACG": {1.0, -1.6}, + "TAACAG": {1.0, -1.6}, + "TCTTGG": {0.0, -3.2}, + "TGAAAG": {-1.0, -1.6}, + "TGAGAG": {-1.0, -1.6}, + "TGATAG": {-1.5, -1.6}, + "TGCAAG": {-1.5, -1.6}, + "TGCGAG": {-1.0, -1.6}, + "TGCTTG": {-0.1, -1.6}, + "TGGAAG": {-1.0, -1.6}, + "TGGGAG": {-1.0, -1.6}, + "TGGGGG": {0.5, -0.6}, + "TGTAAG": {-1.5, -1.6}, + "TGTGAG": {-1.0, -1.6}, + "TTTCGG": {-0.4, -1.6}, + "TTTTAG": {-1.0, -1.6}, + "TTTTGG": {-0.4, -1.6}, + "TTTTTG": {-0.5, -1.6}, +} + +// Enthalpy and entropy increments for length dependence of internal loops +// +// Were calculated from delta G Table 4 of SantaLucia, 2004: +// +// Annu.Rev.Biophs.Biomol.Struct.33:415-40 +// doi: 10.1146/annurev.biophys.32.110601.141800 +// The Thermodynamics of DNA Structural Motifs +// SantaLucia and Hicks, 2004 +// +// Additional loop sizes are accounted for with the Jacobson-Stockmayer +// entry extrapolation formula in paper: +// delta G (loop-n) = delta G (loop-x) + 2.44 x R x 310.15 x ln(n / x) +// +// Additional correction is applied for asymmetric loops in paper: +// delta G (asymmetry) = |length A - length B| x 0.3 (kcal / mol) +// where A and B are lengths of both sides of loop +var dnaInternalLoops = loopEnergy{ + 30: {0, -21.3}, + 29: {0, -21.0}, + 28: {0, -21.0}, + 27: {0, -20.6}, + 26: {0, -20.3}, + 25: {0, -20.3}, + 24: {0, -20.0}, + 23: {0, -19.7}, + 22: {0, -19.3}, + 21: {0, -19.0}, + 20: {0, -19.0}, + 19: {0, -18.7}, + 18: {0, -18.7}, + 17: {0, -18.4}, + 16: {0, -18.1}, + 15: {0, -17.7}, + 14: {0, -17.4}, + 13: {0, -16.4}, + 12: {0, -16.8}, + 11: {0, -16.1}, + 10: {0, -15.8}, + 9: {0, -15.8}, + 8: {0, -15.5}, + 7: {0, -14.8}, + 6: {0, -14.2}, + 5: {0, -12.9}, + 4: {0, -11.6}, + 3: {0, -10.3}, + 2: {0, 0}, + 1: {0, 0}, +} + +// Enthalpy and entropy increments for length depedence of bulge loops +// +// Were calculated from delta G Table 4 of SantaLucia, 2004: +// +// Annu.Rev.Biophs.Biomol.Struct.33:415-40 +// doi: 10.1146/annurev.biophys.32.110601.141800 +// The Thermodynamics of DNA Structural Motifs +// SantaLucia and Hicks, 2004 +// +// For bulge loops of size 1, the intervening nearestNeighbors energy is used. +// Closing AT penalty is applied on both sides +var dnaBulgeLoops = loopEnergy{ + 1: {0, -12.9}, + 2: {0, -9.4}, + 3: {0, -10.0}, + 4: {0, -10.3}, + 5: {0, -10.6}, + 6: {0, -11.3}, + 7: {0, -11.9}, + 8: {0, -12.6}, + 9: {0, -13.2}, + 10: {0, -13.9}, + 11: {0, -14.2}, + 12: {0, -14.5}, + 13: {0, -14.8}, + 14: {0, -15.5}, + 15: {0, -15.8}, + 16: {0, -16.1}, + 17: {0, -16.4}, + 18: {0, -16.8}, + 19: {0, -16.8}, + 20: {0, -17.1}, + 21: {0, -17.4}, + 22: {0, -17.4}, + 23: {0, -17.7}, + 24: {0, -17.7}, + 25: {0, -18.1}, + 26: {0, -18.1}, + 27: {0, -18.4}, + 28: {0, -18.7}, + 29: {0, -18.7}, + 30: {0, -19.0}, +} + +// Enthalpy and entropy increments for length depedence of hairpin loops +// +// Were calculated from delta G Table 4 of SantaLucia, 2004: +// +// Annu.Rev.Biophs.Biomol.Struct.33:415-40 +// doi: 10.1146/annurev.biophys.32.110601.141800 +// The Thermodynamics of DNA Structural Motifs +// SantaLucia and Hicks, 2004 +// +// For hairpins of length 3 and 4, the entropy values are looked up +// in the DNATriTetraLoops Dict +// +// From formula 8-9 of the paper: +// An additional 1.6 delta entropy penalty if the hairpin is closed by AT +var dnaHairpinLoops = loopEnergy{ + 1: {0, 0.0}, + 2: {0, 0.0}, + 3: {0, -11.3}, + 4: {0, -11.3}, + 5: {0, -10.6}, + 6: {0, -12.9}, + 7: {0, -13.5}, + 8: {0, -13.9}, + 9: {0, -14.5}, + 10: {0, -14.8}, + 11: {0, -15.5}, + 12: {0, -16.1}, + 13: {0, -16.1}, + 14: {0, -16.4}, + 15: {0, -16.8}, + 16: {0, -17.1}, + 17: {0, -17.4}, + 18: {0, -17.7}, + 19: {0, -18.1}, + 20: {0, -18.4}, + 21: {0, -18.7}, + 22: {0, -18.7}, + 23: {0, -19.0}, + 24: {0, -19.3}, + 25: {0, -19.7}, + 26: {0, -19.7}, + 27: {0, -19.7}, + 28: {0, -20.0}, + 29: {0, -20.0}, + 30: {0, -20.3}, +} + +var dnaEnergies = energies{ + bulgeLoops: dnaBulgeLoops, + complement: dnaComplement, + danglingEnds: dnaDanglingEnds, + hairpinLoops: dnaHairpinLoops, + multibranch: dnaMultibranch, + internalLoops: dnaInternalLoops, + internalMismatches: dnaInternalMismatches, + nearestNeighbors: dnaNearestNeighbors, + terminalMismatches: dnaTerminalMismatches, + triTetraLoops: dnaTriTetraLoops, +} diff --git a/fold/example_test.go b/fold/example_test.go new file mode 100644 index 00000000..7ed6c917 --- /dev/null +++ b/fold/example_test.go @@ -0,0 +1,14 @@ +package fold_test + +import ( + "fmt" + + "github.com/TimothyStiles/poly/fold" +) + +func ExampleZuker() { + result, _ := fold.Zuker("ACCCCCUCCUUCCUUGGAUCAAGGGGCUCAA", 37.0) + brackets := result.DotBracket() + fmt.Println(brackets) + // Output: .((((.(((......)))....)))) +} diff --git a/fold/fold.go b/fold/fold.go new file mode 100644 index 00000000..17f3a379 --- /dev/null +++ b/fold/fold.go @@ -0,0 +1,890 @@ +/* +Package fold is a package for folding DNA and RNA sequences into secondary structures. + +This package provides everything you need to fold a DNA or RNA sequence into a secondary structure +and get the minimum free energy of the structure. Most of the code was ported from the +python SeqFold package by Lattice Automation and Joshua Timmons but we hope to have a +linear fold algorithm in the near future. + +Biological context: + +DNA, RNA, and proteins all fold. Protein is a particularly tricky thing to predict +partially because there are so many more amino acids than there are nucleotides. + +ACG(T/U) vs. ACDEFGHIKLMNPQRSTVWY (20 amino acids) + +These folding predictions help us understand how to design primers, guide RNAs, and +other nucleic acid sequences that fold into a particular structure. + +Fortunately for us, DNA and RNA are much easier to predict because there are only 4 nucleotides +and the rules for folding are much more well defined. + +Each function has citations to the original papers that describe the algorithms used. +Most of the algorithms used in this package are based on the work of Zuker and Stiegler, 1981 +but we're hoping to add more algorithms in the near future such as linear fold. + +TTFN, +Tim +*/ +package fold + +import ( + "fmt" + "math" + "strings" + + "github.com/TimothyStiles/poly/transform" +) + +// Zuker folds the DNA sequence and return the lowest free energy score. +// +// Based on the approach described in: +// Zuker and Stiegler, 1981 +// https://www.ncbi.nlm.nih.gov/pmc/articles/PMC326673/pdf/nar00394-0137.pdf +// +// If the sequence is 50 or more bp long, "isolated" matching bp +// are ignored in pairedMinimumFreeEnergyV(start,end). This is based on an approach described in: +// Mathews, Sabina, Zuker and Turner, 1999 +// https://www.ncbi.nlm.nih.gov/pubmed/10329189 +// Args: +// +// seq: The sequence to Fold +// temp: The temperature the Fold takes place in, in Celsius +// +// Returns a slice of NucleicAcidStructure with the energy and description, +// i.e. stacks, bulges, hairpins, etc. +func Zuker(seq string, temp float64) (Result, error) { + foldContext, err := newFoldingContext(seq, temp) + if err != nil { + return Result{}, fmt.Errorf("error creating folding context: %w", err) + } + + // get the minimum free energy structure out of the cache + + return Result{ + structs: traceback(0, len(seq)-1, foldContext), + }, nil +} + +// unpairedMinimumFreeEnergyW returns the minimum free energy of a subsequence +// at start and terminating at end. +// +// From Zuker and Stiegler, 1981: let W(i,j) be the minimum free energy of all +// possible admissible structures formed from the subsequence Sij. +// +// Figure 2B in Zuker and Stiegler, 1981 +// Args: +// +// seq: The sequence being folded +// start: The start index +// end: The end index (inclusive) +// foldContext: The context for this sequence +// +// Returns the free energy for the subsequence from start to end +func unpairedMinimumFreeEnergyW(start, end int, foldContext context) (nucleicAcidStructure, error) { + if !foldContext.unpairedMinimumFreeEnergyW[start][end].Equal(defaultStructure) { + return foldContext.unpairedMinimumFreeEnergyW[start][end], nil + } + + if end-start < minLenForStruct { + foldContext.unpairedMinimumFreeEnergyW[start][end] = invalidStructure + return foldContext.unpairedMinimumFreeEnergyW[start][end], nil + } + + endDanglingLeft, err := unpairedMinimumFreeEnergyW(start+1, end, foldContext) + if err != nil { + return defaultStructure, fmt.Errorf("w: subsequence (%d, %d): %w", start, end, err) + } + endDanglingRight, err := unpairedMinimumFreeEnergyW(start, end-1, foldContext) + if err != nil { + return defaultStructure, fmt.Errorf("w: subsequence (%d, %d): %w", start, end, err) + } + endsPaired, err := pairedMinimumFreeEnergyV(start, end, foldContext) + if err != nil { + return defaultStructure, fmt.Errorf("w: subsequence (%d, %d): %w", start, end, err) + } + + endBifurcation := invalidStructure + for k := start + 1; k < end-1; k++ { + testBranch, err := multibranch(start, k, end, foldContext, false) + if err != nil { + return defaultStructure, fmt.Errorf("w: subsequence (%d, %d): %w", start, end, err) + } + + if testBranch.Valid() && testBranch.energy < endBifurcation.energy { + endBifurcation = testBranch + } + } + + minStuctEnergy := minimumStructure(endDanglingLeft, endDanglingRight, endsPaired, endBifurcation) + foldContext.unpairedMinimumFreeEnergyW[start][end] = minStuctEnergy + return minStuctEnergy, nil +} + +// pairedMinimumFreeEnergyV returns the minimum free energy of a subsequence of +// paired bases and end. +// From Figure 2B of Zuker, 1981: let V(i,j) be the minimum free energy of all +// possible admissible structures formed from Sij in which Si and Sj base pair +// with each other. If Si and Sj cannot base pair, then V(i,j) = infinity +// +// If start and end don't bp, store and return INF. +// See: Figure 2B of Zuker, 1981 +// Args: +// +// start: The start index +// end: The end index (inclusive) +// foldContext: The context for this sequence +// +// Returns the minimum energy folding structure possible between start and end on seq +func pairedMinimumFreeEnergyV(start, end int, foldContext context) (nucleicAcidStructure, error) { + if !foldContext.pairedMinimumFreeEnergyV[start][end].Equal(defaultStructure) { + return foldContext.pairedMinimumFreeEnergyV[start][end], nil + } + + // the ends must basepair for pairedMinimumFreeEnergyV(start,end) + if foldContext.energies.complement(rune(foldContext.seq[start])) != rune(foldContext.seq[end]) { + foldContext.pairedMinimumFreeEnergyV[start][end] = invalidStructure + return foldContext.pairedMinimumFreeEnergyV[start][end], nil + } + // if the basepair is isolated, and the seq large, penalize at 1,600 kcal/mol + // heuristic for speeding this up + // from https://www.ncbi.nlm.nih.gov/pubmed/10329189 + isolatedOuter := true + if start > 0 && end < len(foldContext.seq)-1 { + isolatedOuter = foldContext.energies.complement(rune(foldContext.seq[start-1])) != rune(foldContext.seq[end+1]) + } + isolatedInner := foldContext.energies.complement(rune(foldContext.seq[start+1])) != rune(foldContext.seq[end-1]) + + if isolatedOuter && isolatedInner { + foldContext.pairedMinimumFreeEnergyV[start][end] = nucleicAcidStructure{energy: isolatedBasePairPenalty} + return foldContext.pairedMinimumFreeEnergyV[start][end], nil + } + + paired := pair(foldContext.seq, start, start+1, end, end-1) + hairpin, err := hairpin(start, end, foldContext) + if err != nil { + return defaultStructure, fmt.Errorf("v: subsequence (%d, %d): %w", start, end, err) + } + e1 := nucleicAcidStructure{energy: hairpin, description: "HAIRPIN:" + paired} + if end-start == minLenForStruct { // small hairpin; 4bp + foldContext.pairedMinimumFreeEnergyV[start][end] = e1 + foldContext.unpairedMinimumFreeEnergyW[start][end] = e1 + return foldContext.pairedMinimumFreeEnergyV[start][end], nil + } + + n := len(foldContext.seq) + e2 := nucleicAcidStructure{energy: math.Inf(1)} + for rightOfStart := start + 1; rightOfStart < end-minLenForStruct; rightOfStart++ { + for leftOfEnd := rightOfStart + minLenForStruct; leftOfEnd < end; leftOfEnd++ { + // rightOfStart and leftOfEnd must match + if foldContext.energies.complement(rune(foldContext.seq[rightOfStart])) != rune(foldContext.seq[leftOfEnd]) { + continue + } + + paired := pair(foldContext.seq, start, rightOfStart, end, leftOfEnd) + pairLeft := pair(foldContext.seq, start, start+1, end, end-1) + pairRight := pair(foldContext.seq, rightOfStart-1, rightOfStart, leftOfEnd+1, leftOfEnd) + _, pairLeftInner := foldContext.energies.nearestNeighbors[pairLeft] + _, pairRightInner := foldContext.energies.nearestNeighbors[pairRight] + pairInner := pairLeftInner || pairRightInner + + isStack := rightOfStart == start+1 && leftOfEnd == end-1 + bulgeLeft := rightOfStart > start+1 + bulgeRight := leftOfEnd < end-1 + + var ( + e2Test float64 + e2TestType string + err error + ) + switch { + case isStack: + // it's a neighboring/stacking pair in a helix + e2Test = stack(start, rightOfStart, end, leftOfEnd, foldContext) + e2TestType = fmt.Sprintf("STACK:%s", paired) + + if start > 0 && end == n-1 || start == 0 && end < n-1 { + // there's a dangling end + e2TestType = fmt.Sprintf("STACKDanglingEnds:%s", paired) + } + case bulgeLeft && bulgeRight && !pairInner: + // it's an interior loop + il, err := internalLoop(start, rightOfStart, end, leftOfEnd, foldContext) + if err != nil { + return defaultStructure, fmt.Errorf("v: subsequence (%d, %d): %w", start, end, err) + } + e2Test = il + e2TestType = fmt.Sprintf("INTERIOR_LOOP:%d/%d", rightOfStart-start, end-leftOfEnd) + + if rightOfStart-start == 2 && end-leftOfEnd == 2 { + loopLeftIndex := foldContext.seq[start : rightOfStart+1] + loopRightIndex := foldContext.seq[leftOfEnd : end+1] + // technically an interior loop of 1. really 1bp mismatch + e2TestType = fmt.Sprintf("STACK:%s/%s", loopLeftIndex, transform.Reverse(loopRightIndex)) + } + case bulgeLeft && !bulgeRight: + // it's a bulge on the left side + e2Test, err = Bulge(start, rightOfStart, end, leftOfEnd, foldContext) + if err != nil { + return defaultStructure, fmt.Errorf("v: subsequence (%d, %d): %w", start, end, err) + } + e2TestType = fmt.Sprintf("BULGE:%d", rightOfStart-start) + case !bulgeLeft && bulgeRight: + // it's a bulge on the right side + e2Test, err = Bulge(start, rightOfStart, end, leftOfEnd, foldContext) + if err != nil { + return defaultStructure, fmt.Errorf("v: subsequence (%d, %d): %w", start, end, err) + } + e2TestType = fmt.Sprintf("BULGE:%d", end-leftOfEnd) + default: + // it's basically a hairpin, only outside bp match + continue + } + + // add pairedMinimumFreeEnergyV(start', end') + tv, err := pairedMinimumFreeEnergyV(rightOfStart, leftOfEnd, foldContext) + if err != nil { + return defaultStructure, fmt.Errorf("v: subsequence (%d, %d): %w", start, end, err) + } + e2Test += tv.energy + if e2Test != math.Inf(-1) && e2Test < e2.energy { + e2 = nucleicAcidStructure{energy: e2Test, description: e2TestType, inner: []subsequence{{rightOfStart, leftOfEnd}}} + } + } + } + + e3 := invalidStructure + if !isolatedOuter || start == 0 || end == len(foldContext.seq)-1 { + for k := start + 1; k < end-1; k++ { + e3Test, err := multibranch(start, k, end, foldContext, true) + if err != nil { + return defaultStructure, fmt.Errorf("v: subsequence (%d, %d): %w", start, end, err) + } + + if e3Test.Valid() && e3Test.energy < e3.energy { + e3 = e3Test + } + } + } + e := minimumStructure(e1, e2, e3) + foldContext.pairedMinimumFreeEnergyV[start][end] = e + return e, nil +} + +// Bulge calculates the free energy associated with a bulge. +// +// Args: +// +// start: The start index of the bulge +// rightOfStart: The index to the right of start +// end: The end index of the bulge +// leftOfEnd: The index to the left of end +// foldContext: The FoldingContext for this sequence +// +// Returns the increment in free energy from the bulge +func Bulge(start, rightOfStart, end, leftOfEnd int, foldContext context) (float64, error) { + loopLength := max(rightOfStart-start-1, end-leftOfEnd-1) + if loopLength <= 0 { + return 0, fmt.Errorf("bulge: the length of the bulge at (%d, %d) is %d", start, end, loopLength) + } + + var dG float64 + + // add penalty based on size + if foldEnergy, ok := foldContext.energies.bulgeLoops[loopLength]; ok { + enthalpyHDifference, entropySDifference := foldEnergy.enthalpyH, foldEnergy.entropyS + dG = deltaG(enthalpyHDifference, entropySDifference, foldContext.temp) + } else { + // it's too large for pre-calculated list, extrapolate + foldEnergy := foldContext.energies.bulgeLoops[maxLenPreCalulated] + enthalpyHDifference, entropySDifference := foldEnergy.enthalpyH, foldEnergy.entropyS + dG = deltaG(enthalpyHDifference, entropySDifference, foldContext.temp) + dG = jacobsonStockmayer(loopLength, maxLenPreCalulated, dG, foldContext.temp) + } + + if loopLength == 1 { + // if len 1, include the delta G of intervening nearestNeighbors (SantaLucia 2004) + paired := pair(foldContext.seq, start, rightOfStart, end, leftOfEnd) + if _, ok := foldContext.energies.nearestNeighbors[paired]; !ok { + return 0, fmt.Errorf("bulge: paired %q not in the nearestNeighbors energies", paired) + } + dG += stack(start, rightOfStart, end, leftOfEnd, foldContext) + } + + // penalize AT terminal bonds + for _, k := range []int{start, rightOfStart, end, leftOfEnd} { + if foldContext.seq[k] == 'A' { + dG += closingATPenalty + } + } + + return dG, nil +} + +func addBranch(structure nucleicAcidStructure, branches *[]subsequence, foldContext context) error { + if !structure.Valid() || len(structure.inner) == 0 { + return nil + } + if len(structure.inner) == 1 { + *branches = append(*branches, structure.inner[0]) + return nil + } + for _, inner := range structure.inner { + structure, err := unpairedMinimumFreeEnergyW(inner.start, inner.end, foldContext) + if err != nil { + return err + } + err = addBranch(structure, branches, foldContext) + if err != nil { + return err + } + } + return nil +} + +// multibranch calculates a multi-branch foldEnergy penalty using a linear formula. +// +// From Jaeger, Turner, and Zuker, 1989. +// Found to be better than logarithmic in Ward, et al. 2017 +// Args: +// +// start: The left starting index +// mid: The mid-point in the search +// end: The right ending index +// foldContext: The FoldingContext for this sequence +// helix: Whether this multibranch is enclosed by a helix +// helix: Whether pairedMinimumFreeEnergyV(start, end) bond with one another in a helix +// +// Returns a multi-branch structure +func multibranch(start, mid, end int, foldContext context, helix bool) (nucleicAcidStructure, error) { + var ( + left, right nucleicAcidStructure + err error + ) + if helix { + left, err = unpairedMinimumFreeEnergyW(start+1, mid, foldContext) + if err != nil { + return defaultStructure, fmt.Errorf("multibranch: subsequence (%d, %d, %d): %w", start, end, mid, err) + } + right, err = unpairedMinimumFreeEnergyW(mid+1, end-1, foldContext) + if err != nil { + return defaultStructure, fmt.Errorf("multibranch: subsequence (%d, %d, %d): %w", start, end, mid, err) + } + } else { + left, err = unpairedMinimumFreeEnergyW(start, mid, foldContext) + if err != nil { + return defaultStructure, fmt.Errorf("multibranch: subsequence (%d, %d, %d): %w", start, end, mid, err) + } + right, err = unpairedMinimumFreeEnergyW(mid+1, end, foldContext) + if err != nil { + return defaultStructure, fmt.Errorf("multibranch: subsequence (%d, %d, %d): %w", start, end, mid, err) + } + } + + if !left.Valid() || !right.Valid() { + return invalidStructure, nil + } + + // gather all branches of this multi-branch structure + var branches []subsequence + + // in python this was a recursive closure, in Go this is not possible so + // we pull it out and pass all the parameters + err = addBranch(left, &branches, foldContext) + if err != nil { + return defaultStructure, fmt.Errorf("multibranch: subsequence (%d, %d, %d): %w", start, mid, end, err) + } + err = addBranch(right, &branches, foldContext) + if err != nil { + return defaultStructure, fmt.Errorf("multibranch: subsequence (%d, %d, %d): %w", start, mid, end, err) + } + + // this isn't multi-branched + if len(branches) < 2 { + return invalidStructure, nil + } + + // if there's a helix, start,end counts as well + if helix { + branches = append(branches, subsequence{start, end}) + } + + // count up unpaired bp and asymmetry + branchCount := len(branches) + unpaired := 0 + summedEnergy := 0.0 + curSequence := subsequence{start, end} + for index, currentBranch := range branches { + leftStart, leftEnd := currentBranch.start, currentBranch.end + curBranchLeft := branches[abs((index-1)%len(branches))] + leftOfEnd := curBranchLeft.end + curBranchRight := branches[abs((index+1)%len(branches))] + rightStart, rightEnd := curBranchRight.start, curBranchRight.end + + // add foldEnergy from unpaired bp to the right + // of the helix as though it was a dangling end + // if there's only one bp, it goes to whichever + // helix (this or the next) has the more favorable energy + unpairedLeft := 0 + unpairedRight := 0 + danglingEnergy := 0.0 + if index == len(branches)-1 && !helix { + // pass + } else if curBranchRight == curSequence { + unpairedLeft = leftStart - leftOfEnd - 1 + unpairedRight = rightEnd - leftEnd - 1 + + if unpairedLeft != 0 && unpairedRight != 0 { + danglingEnergy = stack(leftStart-1, leftStart, leftEnd+1, leftEnd, foldContext) + } else if unpairedRight != 0 { + danglingEnergy = stack(-1, leftStart, leftEnd+1, leftEnd, foldContext) + if unpairedRight == 1 { + danglingEnergy = math.Min(stack(rightStart, -1, rightEnd, rightEnd-1, foldContext), danglingEnergy) + } + } + } else if currentBranch == curSequence { + unpairedLeft = leftEnd - leftOfEnd - 1 + unpairedRight = rightStart - leftStart - 1 + + if unpairedLeft != 0 && unpairedRight != 0 { + danglingEnergy = stack(leftStart-1, leftStart, leftEnd+1, leftEnd, foldContext) + } else if unpairedRight != 0 { + danglingEnergy = stack(leftStart, leftStart+1, leftEnd, -1, foldContext) + if unpairedRight == 1 { + danglingEnergy = math.Min(stack(rightStart-1, rightStart, -1, rightEnd, foldContext), danglingEnergy) + } + } + } else { + unpairedLeft = leftStart - leftOfEnd - 1 + unpairedRight = rightStart - leftEnd - 1 + + if unpairedLeft != 0 && unpairedRight != 0 { + danglingEnergy = stack(leftStart-1, leftStart, leftEnd+1, leftEnd, foldContext) + } else if unpairedRight != 0 { + danglingEnergy = stack(-1, leftStart, leftEnd+1, leftEnd, foldContext) + if unpairedRight == 1 { + danglingEnergy = math.Min(stack(leftStart-1, leftStart, leftEnd+1, leftEnd, foldContext), danglingEnergy) + } + } + } + summedEnergy += danglingEnergy + unpaired += unpairedRight + if unpairedRight < 0 { + return defaultStructure, fmt.Errorf("multibranch: subsequence (%d, %d, %d): unpairedRight < 0", start, end, mid) + } + + if currentBranch != curSequence { // add energy + w, err := unpairedMinimumFreeEnergyW(leftStart, leftEnd, foldContext) + if err != nil { + return defaultStructure, fmt.Errorf("multibranch: subsequence (%d, %d, %d): %w", start, end, mid, err) + } + summedEnergy += w.energy + } + + } + + if unpaired < 0 { + return defaultStructure, fmt.Errorf("multibranch: subsequence (%d, %d, %d): unpaired < 0", start, end, mid) + } + + // this is just for readability of the formulas below + var ( + helicesCount = foldContext.energies.multibranch.helicesCount + unpairedCount = foldContext.energies.multibranch.unpairedCount + coaxialStackCount = foldContext.energies.multibranch.coaxialStackCount + terminalMismatchCount = foldContext.energies.multibranch.terminalMismatchCount + ) + + // penalty for unmatched bp and multi-branch + multibranchEnergy := helicesCount + unpairedCount*float64(len(branches)) + coaxialStackCount*float64(unpaired) + + if unpaired == 0 { + multibranchEnergy = helicesCount + terminalMismatchCount + } + + // energy of min-energy neighbors + e := multibranchEnergy + summedEnergy + + // pointer to next structures + if helix { + // branches.pop() + branches = branches[:len(branches)-1] + } + + return nucleicAcidStructure{energy: e, description: fmt.Sprintf("BIFURCATION:%dn/%dh", unpaired, branchCount), inner: branches}, nil +} + +// internalLoop calculates the free energy of an internal loop. +// +// The first and last bp of both left and right sequences +// are not themselves parts of the loop, but are the terminal +// bp on either side of it. They are needed for when there's +// a single internal looping bp (where just the mismatching +// free energies are used) +// Note that both left and right sequences are in 5' to 3' direction +// This is adapted from the "Internal Loops" section of SantaLucia/Hicks, 2004 +// Args: +// +// start: The index of the start of structure on left side +// rightOfStart: The index to the right of start +// end: The index of the end of structure on right side +// leftOfEnd: The index to the left of end +// foldContext: The FoldingContext for this sequence +// +// Returns the free energy associated with the internal loop +func internalLoop(start, rightOfStart, end, leftOfEnd int, foldContext context) (float64, error) { + loopLeftIndex := rightOfStart - start - 1 + loopRightIndex := end - leftOfEnd - 1 + loopLength := loopLeftIndex + loopRightIndex + + if loopLeftIndex < 1 { + return 0, fmt.Errorf("internal loop: subsequence (%d, %d, %d, %d): missing left part of the loop", start, rightOfStart, end, leftOfEnd) + + } + if loopRightIndex < 1 { + return 0, fmt.Errorf("internal loop: subsequence (%d, %d, %d, %d): missing right part of the loop", start, rightOfStart, end, leftOfEnd) + } + + // single bp mismatch, sum up the two single mismatch pairs + if loopLeftIndex == 1 && loopRightIndex == 1 { + mismatchLeftEnergy := stack(start, rightOfStart, end, leftOfEnd, foldContext) + mismatchRightEnergy := stack(rightOfStart-1, rightOfStart, leftOfEnd+1, leftOfEnd, foldContext) + return mismatchLeftEnergy + mismatchRightEnergy, nil + } + var enthalpyHDifference, entropySDifference, dG float64 + // apply a penalty based on loop size + if foldEnergy, ok := foldContext.energies.internalLoops[loopLength]; ok { + enthalpyHDifference, entropySDifference = foldEnergy.enthalpyH, foldEnergy.entropyS + dG = deltaG(enthalpyHDifference, entropySDifference, foldContext.temp) + } else { + // it's too large an internal loop, extrapolate + foldEnergy := foldContext.energies.internalLoops[maxLenPreCalulated] + enthalpyHDifference, entropySDifference = foldEnergy.enthalpyH, foldEnergy.entropyS + dG = deltaG(enthalpyHDifference, entropySDifference, foldContext.temp) + dG = jacobsonStockmayer(loopLength, maxLenPreCalulated, dG, foldContext.temp) + } + + // apply an asymmetry penalty + loopAsymmetry := math.Abs(float64(loopLeftIndex - loopRightIndex)) + dG += loopsAsymmetryPenalty * loopAsymmetry + + // apply penalty based on the mismatching pairs on either side of the loop + pairedMismatchLeftEnergy := pair(foldContext.seq, start, start+1, end, end-1) + foldEnergy := foldContext.energies.terminalMismatches[pairedMismatchLeftEnergy] + enthalpyHDifference, entropySDifference = foldEnergy.enthalpyH, foldEnergy.entropyS + dG += deltaG(enthalpyHDifference, entropySDifference, foldContext.temp) + + pairedMismatchRightEnergy := pair(foldContext.seq, rightOfStart-1, rightOfStart, leftOfEnd+1, leftOfEnd) + foldEnergy = foldContext.energies.terminalMismatches[pairedMismatchRightEnergy] + enthalpyHDifference, entropySDifference = foldEnergy.enthalpyH, foldEnergy.entropyS + dG += deltaG(enthalpyHDifference, entropySDifference, foldContext.temp) + + return dG, nil +} + +// stack returns the free energy of a stack. +// +// Using the indexes start and end, check whether it's at the end of +// the sequence or internal. Then check whether it's a match +// or mismatch, and return. +// Two edge-cases are terminal mismatches and dangling ends. +// The energy of a dangling end is added to the energy of a pair +// where start XOR end is at the sequence's end. +// Args: +// +// start: The start index on left side of the pair/stack +// rightOfStart: The index to the right of start +// end: The end index on right side of the pair/stack +// leftOfEnd: The index to the left of end +// foldContext: The FoldingContext for this sequence +// +// Returns the free energy of the nearestNeighbors pairing +func stack(start, rightOfStart, end, leftOfEnd int, foldContext context) float64 { + // if any(x >= len(seq) for x in [start,rightOfStart, end, leftOfEnd]): + // return 0.0 + for _, indices := range []int{start, rightOfStart, end, leftOfEnd} { + if indices >= len(foldContext.seq) { + return 0 + } + } + + paired := pair(foldContext.seq, start, rightOfStart, end, leftOfEnd) + // if any(x == -1 for x in [start,rightOfStart, end, leftOfEnd]): + for _, indices := range []int{start, rightOfStart, end, leftOfEnd} { + if indices == -1 { + // it's a dangling end + foldEnergy := foldContext.energies.danglingEnds[paired] + enthalpyHDifference, entropySDifference := foldEnergy.enthalpyH, foldEnergy.entropyS + return deltaG(enthalpyHDifference, entropySDifference, foldContext.temp) + } + } + + if start > 0 && end < len(foldContext.seq)-1 { + // it's internal + foldEnergy, ok := foldContext.energies.nearestNeighbors[paired] + if !ok { + foldEnergy = foldContext.energies.internalMismatches[paired] + } + enthalpyHDifference, entropySDifference := foldEnergy.enthalpyH, foldEnergy.entropyS + return deltaG(enthalpyHDifference, entropySDifference, foldContext.temp) + } + if start == 0 && end == len(foldContext.seq)-1 { + // it's terminal + foldEnergy, ok := foldContext.energies.nearestNeighbors[paired] + if !ok { + foldEnergy = foldContext.energies.internalMismatches[paired] + } + enthalpyHDifference, entropySDifference := foldEnergy.enthalpyH, foldEnergy.entropyS + return deltaG(enthalpyHDifference, entropySDifference, foldContext.temp) + } + + if start > 0 && end == len(foldContext.seq)-1 { + // it's dangling on left + foldEnergy, ok := foldContext.energies.nearestNeighbors[paired] + if !ok { + foldEnergy = foldContext.energies.internalMismatches[paired] + } + enthalpyHDifference, entropySDifference := foldEnergy.enthalpyH, foldEnergy.entropyS + dG := deltaG(enthalpyHDifference, entropySDifference, foldContext.temp) + + pairDanglingEnds := fmt.Sprintf("%c%c/.%c", foldContext.seq[start-1], foldContext.seq[start], foldContext.seq[end]) + if foldEnergy, ok := foldContext.energies.danglingEnds[pairDanglingEnds]; ok { + enthalpyHDifference, entropySDifference := foldEnergy.enthalpyH, foldEnergy.entropyS + dG += deltaG(enthalpyHDifference, entropySDifference, foldContext.temp) + } + return dG + } + + if start == 0 && end < len(foldContext.seq)-1 { + // it's dangling on right + foldEnergy, ok := foldContext.energies.nearestNeighbors[paired] + if !ok { + foldEnergy = foldContext.energies.internalMismatches[paired] + } + enthalpyHDifference, entropySDifference := foldEnergy.enthalpyH, foldEnergy.entropyS + dG := deltaG(enthalpyHDifference, entropySDifference, foldContext.temp) + + pairDanglingEnds := fmt.Sprintf(".%c/%c%c", +foldContext.seq[start], foldContext.seq[end+1], foldContext.seq[end]) + if foldEnergy, ok := foldContext.energies.danglingEnds[pairDanglingEnds]; ok { + enthalpyHDifference, entropySDifference := foldEnergy.enthalpyH, foldEnergy.entropyS + dG += deltaG(enthalpyHDifference, entropySDifference, foldContext.temp) + return dG + } + } + return 0 +} + +// hairpin calculates the free energy of a hairpin. +// Args: +// +// start: The index of start of hairpin +// end: The index of end of hairpin +// foldContext: The FoldingContext for this sequence +// +// Returns the free energy increment from the hairpin structure +func hairpin(start, end int, foldContext context) (float64, error) { + if end-start < minLenForStruct { + return math.Inf(1), nil + } + + hairpinSeq := foldContext.seq[start : end+1] + hairpinLength := len(hairpinSeq) - 2 + paired := pair(foldContext.seq, start, start+1, end, end-1) + + if foldContext.energies.complement(rune(hairpinSeq[0])) != rune(hairpinSeq[len(hairpinSeq)-1]) { + // not known terminal pair, nothing to close "hairpin" + return 0, fmt.Errorf("hairpin: subsequence (%d, %d): unknown hairpin terminal pairing %c - %c", start, end, hairpinSeq[0], hairpinSeq[len(hairpinSeq)-1]) + } + + dG := 0.0 + if foldContext.energies.triTetraLoops != nil { + if energy, ok := foldContext.energies.triTetraLoops[hairpinSeq]; ok { + // it's a pre-known hairpin with known value + enthalpyHDifference, entropySDifference := energy.enthalpyH, energy.entropyS + dG = deltaG(enthalpyHDifference, entropySDifference, foldContext.temp) + } + } + + // add penalty based on size + if energy, ok := foldContext.energies.hairpinLoops[hairpinLength]; ok { + enthalpyHDifference, entropySDifference := energy.enthalpyH, energy.entropyS + dG += deltaG(enthalpyHDifference, entropySDifference, foldContext.temp) + } else { + // it's too large, extrapolate + energy := foldContext.energies.hairpinLoops[maxLenPreCalulated] + enthalpyHDifference, entropySDifference := energy.enthalpyH, energy.entropyS + dGinc := deltaG(enthalpyHDifference, entropySDifference, foldContext.temp) + dG += jacobsonStockmayer(hairpinLength, maxLenPreCalulated, dGinc, foldContext.temp) + } + + // add penalty for a terminal mismatch + energy, ok := foldContext.energies.terminalMismatches[paired] + if hairpinLength > 3 && ok { + enthalpyHDifference, entropySDifference := energy.enthalpyH, energy.entropyS + dG += deltaG(enthalpyHDifference, entropySDifference, foldContext.temp) + } + + // add penalty if length 3 and AT closing, formula 8 from SantaLucia, 2004 + if hairpinLength == 3 && (hairpinSeq[0] == 'A' || hairpinSeq[len(hairpinSeq)-1] == 'A') { + dG += closingATPenalty // convert to entropy + } + + return dG, nil +} + +// Find the free energy given delta h, s and temp +// Args: +// +// enthalpyHDifference: The enthalpy increment in kcal / mol +// entropySDifference: The entropy increment in cal / mol +// temp: The temperature in Kelvin +// +// Returns the free energy increment in kcal / (mol x K) +func deltaG(enthalpyHDifference, entropySDifference, temp float64) float64 { + return enthalpyHDifference - temp*(entropySDifference/1000.0) +} + +// jacobsonStockmayer entropy extrapolation formula is used for bulges, +// hairpins, etc that fall outside the maxLenPreCalulated upper limit for +// pre-calculated free-energies. See SantaLucia and Hicks (2004) +// +// Args: +// +// queryLen: Length of element without known free energy value +// knownLen: Length of element with known free energy value (dGx) +// dGx: The free energy of the element knownLen +// temp: Temperature in Kelvin +// +// Returns the free energy for a structure of length queryLen +// See SantaLucia and Hicks (2004). +// NOTE: the coefficient 2.44 is based on recent kinetics +// measurements in DNA (Goddard NL, Bonnet G, Krichevsky O, Libchaber A. 2000), +// and thus it is used in preference to the older theoretically derived value +// of 1.75. +func jacobsonStockmayer(queryLen, knownLen int, dGx, temp float64) float64 { + gasConstant := 1.9872e-3 + return dGx + 2.44*gasConstant*temp*math.Log(float64(queryLen)/float64(knownLen)) +} + +// pair Returns a stack representation, a key for the nearestNeighbors maps +// Args: +// +// s: Sequence being folded +// start: leftmost index +// rightOfStart: index to right of start +// end: rightmost index +// leftOfEnd: index to left of end +// +// Returns string representation of the pair +func pair(s string, start, rightOfStart, end, leftOfEnd int) string { + seqRunes := []rune(s) + ret := []rune{'.', '.', '/', '.', '.'} + if start >= 0 { + ret[0] = seqRunes[start] + } + if rightOfStart >= 0 { + ret[1] = seqRunes[rightOfStart] + } + if end >= 0 { + ret[3] = seqRunes[end] + } + if leftOfEnd >= 0 { + ret[4] = seqRunes[leftOfEnd] + } + return string(ret) +} + +// Traceback thru the pairedMinimumFreeEnergyV(start,end) and unpairedMinimumFreeEnergyW(start,end) caches to find the structure +// For each step, get to the lowest energy unpairedMinimumFreeEnergyW(start,end) within that block +// Store the structure in unpairedMinimumFreeEnergyW(start,end) +// Inc start and end +// If the next structure is viable according to pairedMinimumFreeEnergyV(start,end), store as well +// Repeat +// Args: +// +// start: The leftmost index to start searching in +// end: The rightmost index to start searching in +// foldContext: The FoldingContext for this sequence +// +// Returns a list of NucleicAcidStructure in the final secondary structure +func traceback(start, end int, foldContext context) []nucleicAcidStructure { + // move start,end down-left to start coordinates + structure := foldContext.unpairedMinimumFreeEnergyW[start][end] + if !strings.Contains(structure.description, "HAIRPIN") { + for foldContext.unpairedMinimumFreeEnergyW[start+1][end].Equal(structure) { + start += 1 + } + for foldContext.unpairedMinimumFreeEnergyW[start][end-1].Equal(structure) { + end -= 1 + } + } + + NucleicAcidStructures := []nucleicAcidStructure{} + for { + structure = foldContext.pairedMinimumFreeEnergyV[start][end] + + NucleicAcidStructures = append(NucleicAcidStructures, nucleicAcidStructure{energy: structure.energy, description: structure.description, inner: []subsequence{{start: start, end: end}}}) + + // it's a hairpin, end of structure + if len(structure.inner) == 0 { + // set the energy of everything relative to the hairpin + return trackbackEnergy(NucleicAcidStructures) + } + + // it's a stack, bulge, etc + // there's another single structure beyond this + if len(structure.inner) == 1 { + start, end = structure.inner[0].start, structure.inner[0].end + // subsequence = structure.IJs[0] + continue + } + + // it's a multibranch + summedEnergy := 0.0 + NucleicAcidStructures = trackbackEnergy(NucleicAcidStructures) + branches := []nucleicAcidStructure{} + for _, subseq := range structure.inner { + rightOfStart, leftOfEnd := subseq.start, subseq.end + tb := traceback(rightOfStart, leftOfEnd, foldContext) + if len(tb) > 0 && len(tb[0].inner) > 0 { + subseq := tb[0].inner[0] + subStart, subEnd := subseq.start, subseq.end + summedEnergy += foldContext.unpairedMinimumFreeEnergyW[subStart][subEnd].energy + branches = append(branches, tb...) + } + } + + NucleicAcidStructures[len(NucleicAcidStructures)-1].energy -= summedEnergy + return append(NucleicAcidStructures, branches...) + } +} + +// Return the struct with the lowest free energy that isn't -inf +// Args: +// +// structures: NucleicAcidStructure being compared +// +// Returns the min free energy structure +func minimumStructure(structures ...nucleicAcidStructure) nucleicAcidStructure { + minimumStructure := invalidStructure + for _, structure := range structures { + if structure.energy != math.Inf(-1) && structure.energy < minimumStructure.energy { + minimumStructure = structure + } + } + return minimumStructure +} + +// trackbackEnergy add energy to each structure, based on how it's +// unpairedMinimumFreeEnergyW(start,end) differs from the one after +// Args: +// +// structures: The NucleicAcidStructure for whom energy is being calculated +// +// Returns a slice of NucleicAcidStructure in the folded DNA with energy +func trackbackEnergy(structures []nucleicAcidStructure) []nucleicAcidStructure { + for start := 0; start < len(structures)-1; start++ { + structures[start].energy -= structures[start+1].energy + } + return structures +} diff --git a/fold/fold_test.go b/fold/fold_test.go new file mode 100644 index 00000000..8be9275f --- /dev/null +++ b/fold/fold_test.go @@ -0,0 +1,203 @@ +package fold + +import ( + "math" + "strings" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestFold(t *testing.T) { + t.Run("FoldCache", func(t *testing.T) { + seq := "ATGGATTTAGATAGAT" + foldContext, err := newFoldingContext(seq, 37.0) + require.NoError(t, err) + res, err := Zuker(seq, 37.0) + require.NoError(t, err) + seqDg := res.MinimumFreeEnergy() + require.NoError(t, err) + + assert.InDelta(t, seqDg, foldContext.unpairedMinimumFreeEnergyW[0][len(seq)-1].energy, 1) + }) + t.Run("FoldDNA", func(t *testing.T) { + // unafold's estimates for free energy estimates of DNA oligos + unafoldDgs := map[string]float64{ + "GGGAGGTCGTTACATCTGGGTAACACCGGTACTGATCCGGTGACCTCCC": -10.94, // three branched structure + "GGGAGGTCGCTCCAGCTGGGAGGAGCGTTGGGGGTATATACCCCCAACACCGGTACTGATCCGGTGACCTCCC": -23.4, // four branched structure + "CGCAGGGAUACCCGCG": -3.8, + "TAGCTCAGCTGGGAGAGCGCCTGCTTTGCACGCAGGAGGT": -6.85, + "GGGGGCATAGCTCAGCTGGGAGAGCGCCTGCTTTGCACGCAGGAGGTCTGCGGTTCGATCCCGCGCGCTCCCACCA": -15.50, + "TGAGACGGAAGGGGATGATTGTCCCCTTCCGTCTCA": -18.10, + "ACCCCCTCCTTCCTTGGATCAAGGGGCTCAA": -3.65, + } + + for seq, ufold := range unafoldDgs { + res, err := Zuker(seq, 37.0) + require.NoError(t, err) + d := res.MinimumFreeEnergy() + // accepting a 60% difference + delta := math.Abs(0.6 * math.Min(d, ufold)) + assert.InDeltaf(t, d, ufold, delta, seq) + } + }) + t.Run("FoldRNA", func(t *testing.T) { + // unafold's estimates for free energy estimates of RNA oligos + // most tests available at https://github.com/jaswindersingh2/SPOT-RNA/blob/master/sample_inputs/batch_seq.fasta + unafoldDgs := map[string]float64{ + "ACCCCCUCCUUCCUUGGAUCAAGGGGCUCAA": -9.5, + "AAGGGGUUGGUCGCCUCGACUAAGCGGCUUGGAAUUCC": -10.1, + "UUGGAGUACACAACCUGUACACUCUUUC": -4.3, + "AGGGAAAAUCCC": -3.3, + "GCUUACGAGCAAGUUAAGCAAC": -4.6, + "UGGGAGGUCGUCUAACGGUAGGACGGCGGACUCUGGAUCCGCUGGUGGAGGUUCGAGUCCUCCCCUCCCAGCCA": -32.8, + "GGGCGAUGAGGCCCGCCCAAACUGCCCUGAAAAGGGCUGAUGGCCUCUACUG": -20.7, + "GGGGGCAUAGCUCAGCUGGGAGAGCGCCUGCUUUGCACGCAGGAGGUCUGCGGUUCGAUCCCGCGCGCUCCCACCA": -31.4, + } + + for seq, ufold := range unafoldDgs { + res, err := Zuker(seq, 37.0) + require.NoError(t, err) + d := res.MinimumFreeEnergy() + + // accepting a 50% difference + delta := math.Abs(0.5 * math.Min(d, ufold)) + assert.InDeltaf(t, d, ufold, delta, seq) + } + }) + t.Run("DotBracket", func(t *testing.T) { + seq := "GGGAGGTCGTTACATCTGGGTAACACCGGTACTGATCCGGTGACCTCCC" + res, err := Zuker(seq, 37.0) + require.NoError(t, err) + + assert.Equal(t, "((((((((.((((......))))..((((.......)))).))))))))", res.DotBracket()) + }) + t.Run("multibranch", func(t *testing.T) { + seq := "GGGAGGTCGTTACATCTGGGTAACACCGGTACTGATCCGGTGACCTCCC" // three branch + + res, err := Zuker(seq, 37.0) + require.NoError(t, err) + + found := false + foundIJ := subsequence{7, 41} + for _, s := range res.structs { + if strings.Contains(s.description, "BIFURCATION") { + for _, ij := range s.inner { + if ij == foundIJ { + found = true + } + } + } + } + assert.True(t, found, "not found a BIFURCATION with (7, 41) in ij") + }) + t.Run("pair", func(t *testing.T) { + seq := "ATGGAATAGTG" + assert.Equal(t, pair(seq, 0, 1, 9, 10), "AT/TG") + }) + t.Run("stack", func(t *testing.T) { + seq := "GCUCAGCUGGGAGAGC" + temp := 37.0 + foldContext, err := newFoldingContext(seq, temp) + require.NoError(t, err) + + e := stack(1, 2, 14, 13, foldContext) + assert.InDelta(t, e, -2.1, 0.1) + }) + t.Run("Bulge", func(t *testing.T) { + // mock bulge of CAT on one side and AG on other + // from pg 429 of SantaLucia, 2004 + seq := "ACCCCCATCCTTCCTTGAGTCAAGGGGCTCAA" + foldContext, err := newFoldingContext(seq, 37) + require.NoError(t, err) + + pairDg, err := Bulge(5, 7, 18, 17, foldContext) + require.NoError(t, err) + assert.InDelta(t, pairDg, 3.22, 0.4) + }) + t.Run("hairpin", func(t *testing.T) { + // hairpin = "CCTTGG" + seq := "ACCCCCTCCTTCCTTGGATCAAGGGGCTCAA" + i := 11 + j := 16 + + foldContext, err := newFoldingContext(seq, 37) + require.NoError(t, err) + + hairpinDg, err := hairpin(i, j, foldContext) + require.NoError(t, err) + // this differs from Unafold + assert.InDelta(t, hairpinDg, 4.3, 1.0) + + // from page 428 of SantaLucia, 2004 + // hairpin = "CGCAAG" + seq = "ACCCGCAAGCCCTCCTTCCTTGGATCAAGGGGCTCAA" + i = 3 + j = 8 + + foldContext, err = newFoldingContext(seq, 37) + require.NoError(t, err) + + hairpinDg, err = hairpin(i, j, foldContext) + require.NoError(t, err) + assert.InDelta(t, hairpinDg, 0.67, 0.1) + + seq = "CUUUGCACG" + i = 0 + j = 8 + + foldContext, err = newFoldingContext(seq, 37) + require.NoError(t, err) + + hairpinDg, err = hairpin(i, j, foldContext) + require.NoError(t, err) + assert.InDelta(t, hairpinDg, 4.5, 0.2) + }) + t.Run("internalLoop", func(t *testing.T) { + seq := "ACCCCCTCCTTCCTTGGATCAAGGGGCTCAA" + i := 6 + j := 21 + + foldContext, err := newFoldingContext(seq, 37) + require.NoError(t, err) + + dg, err := internalLoop(i, i+4, j, j-4, foldContext) + require.NoError(t, err) + assert.InDelta(t, dg, 3.5, 0.1) + }) + t.Run("W", func(t *testing.T) { + seq := "GCUCAGCUGGGAGAGC" + i := 0 + j := 15 + + foldContext, err := newFoldingContext(seq, 37) + require.NoError(t, err) + + struc, err := unpairedMinimumFreeEnergyW(i, j, foldContext) + require.NoError(t, err) + assert.InDelta(t, struc.energy, -3.8, 0.2) + + seq = "CCUGCUUUGCACGCAGG" + i = 0 + j = 16 + + foldContext, err = newFoldingContext(seq, 37) + require.NoError(t, err) + + struc, err = unpairedMinimumFreeEnergyW(i, j, foldContext) + require.NoError(t, err) + assert.InDelta(t, struc.energy, -6.4, 0.2) + + seq = "GCGGUUCGAUCCCGC" + i = 0 + j = 14 + + foldContext, err = newFoldingContext(seq, 37) + require.NoError(t, err) + + struc, err = unpairedMinimumFreeEnergyW(i, j, foldContext) + require.NoError(t, err) + assert.InDelta(t, struc.energy, -4.2, 0.2) + }) +} diff --git a/fold/rna.go b/fold/rna.go new file mode 100644 index 00000000..a718b50a --- /dev/null +++ b/fold/rna.go @@ -0,0 +1,899 @@ +package fold + +import "github.com/TimothyStiles/poly/transform" + +var rnaComplement = transform.ComplementBaseRNA + +var rnaMultibranch = multibranchEnergies{2.5, 0.1, 0.4, 2.0} + +var rnaNearestNeighbors = matchingBasepairEnergy{ + "AA/UU": {enthalpyH: -6.8, entropyS: -19}, + "AC/UG": {enthalpyH: -11.4, entropyS: -29.7}, + "AG/UC": {enthalpyH: -10.5, entropyS: -27.1}, + "AU/UA": {enthalpyH: -9.4, entropyS: -26.8}, + "CA/GU": {enthalpyH: -10.4, entropyS: -26.8}, + "CC/GG": {enthalpyH: -13.4, entropyS: -32.6}, + "CG/GC": {enthalpyH: -10.6, entropyS: -26.4}, + "CU/GA": {enthalpyH: -10.5, entropyS: -27.1}, + "GA/CU": {enthalpyH: -12.4, entropyS: -32.2}, + "GC/CG": {enthalpyH: -14.9, entropyS: -37.1}, + "GG/CC": {enthalpyH: -13.4, entropyS: -32.6}, + "GU/CA": {enthalpyH: -11.4, entropyS: -29.7}, + "UA/AU": {enthalpyH: -7.7, entropyS: -20.6}, + "UC/AG": {enthalpyH: -12.4, entropyS: -32.2}, + "UG/AC": {enthalpyH: -10.4, entropyS: -26.8}, + "UU/AA": {enthalpyH: -6.8, entropyS: -19}, +} + +var rnaInternalMismatches = matchingBasepairEnergy{ + "AA/AA": {enthalpyH: 0, entropyS: 0}, + "AA/AC": {enthalpyH: 0, entropyS: 0}, + "AA/AG": {enthalpyH: 0, entropyS: 0}, + "AA/AU": {enthalpyH: 0, entropyS: 0}, + "AA/CA": {enthalpyH: 0, entropyS: 0}, + "AA/CC": {enthalpyH: 0, entropyS: 0}, + "AA/CG": {enthalpyH: 0, entropyS: 0}, + "AA/CU": {enthalpyH: 0, entropyS: 0}, + "AA/GA": {enthalpyH: 0, entropyS: 0}, + "AA/GC": {enthalpyH: 0, entropyS: 0}, + "AA/GG": {enthalpyH: 0, entropyS: 0}, + "AA/GU": {enthalpyH: 0, entropyS: 0}, + "AA/UA": {enthalpyH: 0, entropyS: 0}, + "AA/UC": {enthalpyH: 0, entropyS: 0}, + "AA/UG": {enthalpyH: 0, entropyS: 0}, + "AC/AA": {enthalpyH: 0, entropyS: 0}, + "AC/AC": {enthalpyH: 0, entropyS: 0}, + "AC/AG": {enthalpyH: 0, entropyS: 0}, + "AC/AU": {enthalpyH: 0, entropyS: 0}, + "AC/CA": {enthalpyH: 0, entropyS: 0}, + "AC/CC": {enthalpyH: 0, entropyS: 0}, + "AC/CG": {enthalpyH: 0, entropyS: 0}, + "AC/CU": {enthalpyH: 0, entropyS: 0}, + "AC/GA": {enthalpyH: 0, entropyS: 0}, + "AC/GC": {enthalpyH: 0, entropyS: 0}, + "AC/GG": {enthalpyH: 0, entropyS: 0}, + "AC/GU": {enthalpyH: 0, entropyS: 0}, + "AC/UA": {enthalpyH: 0, entropyS: 0}, + "AC/UC": {enthalpyH: 0, entropyS: 0}, + "AC/UU": {enthalpyH: 0, entropyS: 0}, + "AG/AA": {enthalpyH: 0, entropyS: 0}, + "AG/AC": {enthalpyH: 0, entropyS: 0}, + "AG/AG": {enthalpyH: 0, entropyS: 0}, + "AG/AU": {enthalpyH: 0, entropyS: 0}, + "AG/CA": {enthalpyH: 0, entropyS: 0}, + "AG/CC": {enthalpyH: 0, entropyS: 0}, + "AG/CG": {enthalpyH: 0, entropyS: 0}, + "AG/CU": {enthalpyH: 0, entropyS: 0}, + "AG/GA": {enthalpyH: 0, entropyS: 0}, + "AG/GC": {enthalpyH: 0, entropyS: 0}, + "AG/GG": {enthalpyH: 0, entropyS: 0}, + "AG/GU": {enthalpyH: 0, entropyS: 0}, + "AG/UA": {enthalpyH: 0, entropyS: 0}, + "AG/UG": {enthalpyH: 0, entropyS: 0}, + "AG/UU": {enthalpyH: -3.2, entropyS: -8.4}, + "AU/AA": {enthalpyH: 0, entropyS: 0}, + "AU/AC": {enthalpyH: 0, entropyS: 0}, + "AU/AG": {enthalpyH: 0, entropyS: 0}, + "AU/AU": {enthalpyH: 0, entropyS: 0}, + "AU/CA": {enthalpyH: 0, entropyS: 0}, + "AU/CC": {enthalpyH: 0, entropyS: 0}, + "AU/CG": {enthalpyH: 0, entropyS: 0}, + "AU/CU": {enthalpyH: 0, entropyS: 0}, + "AU/GA": {enthalpyH: 0, entropyS: 0}, + "AU/GC": {enthalpyH: 0, entropyS: 0}, + "AU/GG": {enthalpyH: 0, entropyS: 0}, + "AU/GU": {enthalpyH: 0, entropyS: 0}, + "AU/UC": {enthalpyH: 0, entropyS: 0}, + "AU/UG": {enthalpyH: -8.8, entropyS: -23.9}, + "AU/UU": {enthalpyH: 0, entropyS: 0}, + "CA/AA": {enthalpyH: 0, entropyS: 0}, + "CA/AC": {enthalpyH: 0, entropyS: 0}, + "CA/AG": {enthalpyH: 0, entropyS: 0}, + "CA/AU": {enthalpyH: 0, entropyS: 0}, + "CA/CA": {enthalpyH: 0, entropyS: 0}, + "CA/CC": {enthalpyH: 0, entropyS: 0}, + "CA/CG": {enthalpyH: 0, entropyS: 0}, + "CA/CU": {enthalpyH: 0, entropyS: 0}, + "CA/GA": {enthalpyH: 0, entropyS: 0}, + "CA/GC": {enthalpyH: 0, entropyS: 0}, + "CA/GG": {enthalpyH: 0, entropyS: 0}, + "CA/UA": {enthalpyH: 0, entropyS: 0}, + "CA/UC": {enthalpyH: 0, entropyS: 0}, + "CA/UG": {enthalpyH: 0, entropyS: 0}, + "CA/UU": {enthalpyH: 0, entropyS: 0}, + "CC/AA": {enthalpyH: 0, entropyS: 0}, + "CC/AC": {enthalpyH: 0, entropyS: 0}, + "CC/AG": {enthalpyH: 0, entropyS: 0}, + "CC/AU": {enthalpyH: 0, entropyS: 0}, + "CC/CA": {enthalpyH: 0, entropyS: 0}, + "CC/CC": {enthalpyH: 0, entropyS: 0}, + "CC/CG": {enthalpyH: 0, entropyS: 0}, + "CC/CU": {enthalpyH: 0, entropyS: 0}, + "CC/GA": {enthalpyH: 0, entropyS: 0}, + "CC/GC": {enthalpyH: 0, entropyS: 0}, + "CC/GU": {enthalpyH: 0, entropyS: 0}, + "CC/UA": {enthalpyH: 0, entropyS: 0}, + "CC/UC": {enthalpyH: 0, entropyS: 0}, + "CC/UG": {enthalpyH: 0, entropyS: 0}, + "CC/UU": {enthalpyH: 0, entropyS: 0}, + "CG/AA": {enthalpyH: 0, entropyS: 0}, + "CG/AC": {enthalpyH: 0, entropyS: 0}, + "CG/AG": {enthalpyH: 0, entropyS: 0}, + "CG/AU": {enthalpyH: 0, entropyS: 0}, + "CG/CA": {enthalpyH: 0, entropyS: 0}, + "CG/CC": {enthalpyH: 0, entropyS: 0}, + "CG/CG": {enthalpyH: 0, entropyS: 0}, + "CG/CU": {enthalpyH: 0, entropyS: 0}, + "CG/GA": {enthalpyH: 0, entropyS: 0}, + "CG/GG": {enthalpyH: 0, entropyS: 0}, + "CG/GU": {enthalpyH: -5.6, entropyS: -13.5}, + "CG/UA": {enthalpyH: 0, entropyS: 0}, + "CG/UC": {enthalpyH: 0, entropyS: 0}, + "CG/UG": {enthalpyH: 0, entropyS: 0}, + "CG/UU": {enthalpyH: 0, entropyS: 0}, + "CU/AA": {enthalpyH: 0, entropyS: 0}, + "CU/AC": {enthalpyH: 0, entropyS: 0}, + "CU/AG": {enthalpyH: 0, entropyS: 0}, + "CU/AU": {enthalpyH: 0, entropyS: 0}, + "CU/CA": {enthalpyH: 0, entropyS: 0}, + "CU/CC": {enthalpyH: 0, entropyS: 0}, + "CU/CG": {enthalpyH: 0, entropyS: 0}, + "CU/CU": {enthalpyH: 0, entropyS: 0}, + "CU/GC": {enthalpyH: 0, entropyS: 0}, + "CU/GG": {enthalpyH: -12.1, entropyS: -32.2}, + "CU/GU": {enthalpyH: 0, entropyS: 0}, + "CU/UA": {enthalpyH: 0, entropyS: 0}, + "CU/UC": {enthalpyH: 0, entropyS: 0}, + "CU/UG": {enthalpyH: 0, entropyS: 0}, + "CU/UU": {enthalpyH: 0, entropyS: 0}, + "GA/AA": {enthalpyH: 0, entropyS: 0}, + "GA/AC": {enthalpyH: 0, entropyS: 0}, + "GA/AG": {enthalpyH: 0, entropyS: 0}, + "GA/AU": {enthalpyH: 0, entropyS: 0}, + "GA/CA": {enthalpyH: 0, entropyS: 0}, + "GA/CC": {enthalpyH: 0, entropyS: 0}, + "GA/CG": {enthalpyH: 0, entropyS: 0}, + "GA/GA": {enthalpyH: 0, entropyS: 0}, + "GA/GC": {enthalpyH: 0, entropyS: 0}, + "GA/GG": {enthalpyH: 0, entropyS: 0}, + "GA/GU": {enthalpyH: 0, entropyS: 0}, + "GA/UA": {enthalpyH: 0, entropyS: 0}, + "GA/UC": {enthalpyH: 0, entropyS: 0}, + "GA/UG": {enthalpyH: 0, entropyS: 0}, + "GA/UU": {enthalpyH: -12.8, entropyS: -37.1}, + "GC/AA": {enthalpyH: 0, entropyS: 0}, + "GC/AC": {enthalpyH: 0, entropyS: 0}, + "GC/AG": {enthalpyH: 0, entropyS: 0}, + "GC/AU": {enthalpyH: 0, entropyS: 0}, + "GC/CA": {enthalpyH: 0, entropyS: 0}, + "GC/CC": {enthalpyH: 0, entropyS: 0}, + "GC/CU": {enthalpyH: 0, entropyS: 0}, + "GC/GA": {enthalpyH: 0, entropyS: 0}, + "GC/GC": {enthalpyH: 0, entropyS: 0}, + "GC/GG": {enthalpyH: 0, entropyS: 0}, + "GC/GU": {enthalpyH: 0, entropyS: 0}, + "GC/UA": {enthalpyH: 0, entropyS: 0}, + "GC/UC": {enthalpyH: 0, entropyS: 0}, + "GC/UG": {enthalpyH: -12.6, entropyS: -32.6}, + "GC/UU": {enthalpyH: 0, entropyS: 0}, + "GG/AA": {enthalpyH: 0, entropyS: 0}, + "GG/AC": {enthalpyH: 0, entropyS: 0}, + "GG/AG": {enthalpyH: 0, entropyS: 0}, + "GG/AU": {enthalpyH: 0, entropyS: 0}, + "GG/CA": {enthalpyH: 0, entropyS: 0}, + "GG/CG": {enthalpyH: 0, entropyS: 0}, + "GG/CU": {enthalpyH: -8.3, entropyS: -21.9}, + "GG/GA": {enthalpyH: 0, entropyS: 0}, + "GG/GC": {enthalpyH: 0, entropyS: 0}, + "GG/GG": {enthalpyH: 0, entropyS: 0}, + "GG/GU": {enthalpyH: 0, entropyS: 0}, + "GG/UA": {enthalpyH: 0, entropyS: 0}, + "GG/UC": {enthalpyH: -12.1, entropyS: -32.2}, + "GG/UG": {enthalpyH: 0, entropyS: 0}, + "GG/UU": {enthalpyH: -13.5, entropyS: -41.9}, + "GU/AA": {enthalpyH: 0, entropyS: 0}, + "GU/AC": {enthalpyH: 0, entropyS: 0}, + "GU/AG": {enthalpyH: 0, entropyS: 0}, + "GU/AU": {enthalpyH: 0, entropyS: 0}, + "GU/CC": {enthalpyH: 0, entropyS: 0}, + "GU/CG": {enthalpyH: -12.6, entropyS: -32.6}, + "GU/CU": {enthalpyH: 0, entropyS: 0}, + "GU/GA": {enthalpyH: 0, entropyS: 0}, + "GU/GC": {enthalpyH: 0, entropyS: 0}, + "GU/GG": {enthalpyH: 0, entropyS: 0}, + "GU/GU": {enthalpyH: 0, entropyS: 0}, + "GU/UA": {enthalpyH: -8.8, entropyS: -23.9}, + "GU/UC": {enthalpyH: 0, entropyS: 0}, + "GU/UG": {enthalpyH: -14.6, entropyS: -51.3}, + "GU/UU": {enthalpyH: 0, entropyS: 0}, + "UA/AA": {enthalpyH: 0, entropyS: 0}, + "UA/AC": {enthalpyH: 0, entropyS: 0}, + "UA/AG": {enthalpyH: 0, entropyS: 0}, + "UA/CA": {enthalpyH: 0, entropyS: 0}, + "UA/CC": {enthalpyH: 0, entropyS: 0}, + "UA/CG": {enthalpyH: 0, entropyS: 0}, + "UA/CU": {enthalpyH: 0, entropyS: 0}, + "UA/GA": {enthalpyH: 0, entropyS: 0}, + "UA/GC": {enthalpyH: 0, entropyS: 0}, + "UA/GG": {enthalpyH: 0, entropyS: 0}, + "UA/GU": {enthalpyH: -7, entropyS: -19.3}, + "UA/UA": {enthalpyH: 0, entropyS: 0}, + "UA/UC": {enthalpyH: 0, entropyS: 0}, + "UA/UG": {enthalpyH: 0, entropyS: 0}, + "UA/UU": {enthalpyH: 0, entropyS: 0}, + "UC/AA": {enthalpyH: 0, entropyS: 0}, + "UC/AC": {enthalpyH: 0, entropyS: 0}, + "UC/AU": {enthalpyH: 0, entropyS: 0}, + "UC/CA": {enthalpyH: 0, entropyS: 0}, + "UC/CC": {enthalpyH: 0, entropyS: 0}, + "UC/CG": {enthalpyH: 0, entropyS: 0}, + "UC/CU": {enthalpyH: 0, entropyS: 0}, + "UC/GA": {enthalpyH: 0, entropyS: 0}, + "UC/GC": {enthalpyH: 0, entropyS: 0}, + "UC/GG": {enthalpyH: -8.3, entropyS: -21.9}, + "UC/GU": {enthalpyH: 0, entropyS: 0}, + "UC/UA": {enthalpyH: 0, entropyS: 0}, + "UC/UC": {enthalpyH: 0, entropyS: 0}, + "UC/UG": {enthalpyH: 0, entropyS: 0}, + "UC/UU": {enthalpyH: 0, entropyS: 0}, + "UG/AA": {enthalpyH: 0, entropyS: 0}, + "UG/AG": {enthalpyH: 0, entropyS: 0}, + "UG/AU": {enthalpyH: -7, entropyS: -19.3}, + "UG/CA": {enthalpyH: 0, entropyS: 0}, + "UG/CC": {enthalpyH: 0, entropyS: 0}, + "UG/CG": {enthalpyH: 0, entropyS: 0}, + "UG/CU": {enthalpyH: 0, entropyS: 0}, + "UG/GA": {enthalpyH: 0, entropyS: 0}, + "UG/GC": {enthalpyH: -5.6, entropyS: -13.5}, + "UG/GG": {enthalpyH: 0, entropyS: 0}, + "UG/GU": {enthalpyH: -9.3, entropyS: -31}, + "UG/UA": {enthalpyH: 0, entropyS: 0}, + "UG/UC": {enthalpyH: 0, entropyS: 0}, + "UG/UG": {enthalpyH: 0, entropyS: 0}, + "UG/UU": {enthalpyH: 0, entropyS: 0}, + "UU/AC": {enthalpyH: 0, entropyS: 0}, + "UU/AG": {enthalpyH: -12.8, entropyS: -37.1}, + "UU/AU": {enthalpyH: 0, entropyS: 0}, + "UU/CA": {enthalpyH: 0, entropyS: 0}, + "UU/CC": {enthalpyH: 0, entropyS: 0}, + "UU/CG": {enthalpyH: 0, entropyS: 0}, + "UU/CU": {enthalpyH: 0, entropyS: 0}, + "UU/GA": {enthalpyH: -3.2, entropyS: -8.4}, + "UU/GC": {enthalpyH: 0, entropyS: 0}, + "UU/GG": {enthalpyH: -13.5, entropyS: -41.9}, + "UU/GU": {enthalpyH: 0, entropyS: 0}, + "UU/UA": {enthalpyH: 0, entropyS: 0}, + "UU/UC": {enthalpyH: 0, entropyS: 0}, + "UU/UG": {enthalpyH: 0, entropyS: 0}, + "UU/UU": {enthalpyH: 0, entropyS: 0}, +} + +var rnaTerminalMismatches = matchingBasepairEnergy{ + "AA/AA": {enthalpyH: 0, entropyS: 0}, + "AA/AC": {enthalpyH: 0, entropyS: 0}, + "AA/AG": {enthalpyH: 0, entropyS: 0}, + "AA/AU": {enthalpyH: 0, entropyS: 0}, + "AA/CA": {enthalpyH: 0, entropyS: 0}, + "AA/CC": {enthalpyH: 0, entropyS: 0}, + "AA/CG": {enthalpyH: 0, entropyS: 0}, + "AA/CU": {enthalpyH: 0, entropyS: 0}, + "AA/GA": {enthalpyH: 0, entropyS: 0}, + "AA/GC": {enthalpyH: 0, entropyS: 0}, + "AA/GG": {enthalpyH: 0, entropyS: 0}, + "AA/GU": {enthalpyH: 0, entropyS: 0}, + "AA/UA": {enthalpyH: -3.9, entropyS: -10}, + "AA/UC": {enthalpyH: 2, entropyS: 9.7}, + "AA/UG": {enthalpyH: -3.5, entropyS: -8.7}, + "AA/UU": {enthalpyH: 2, entropyS: 9.7}, + "AC/AA": {enthalpyH: 0, entropyS: 0}, + "AC/AC": {enthalpyH: 0, entropyS: 0}, + "AC/AG": {enthalpyH: 0, entropyS: 0}, + "AC/AU": {enthalpyH: 0, entropyS: 0}, + "AC/CA": {enthalpyH: 0, entropyS: 0}, + "AC/CC": {enthalpyH: 0, entropyS: 0}, + "AC/CG": {enthalpyH: 0, entropyS: 0}, + "AC/CU": {enthalpyH: 0, entropyS: 0}, + "AC/GA": {enthalpyH: 0, entropyS: 0}, + "AC/GC": {enthalpyH: 0, entropyS: 0}, + "AC/GG": {enthalpyH: 0, entropyS: 0}, + "AC/GU": {enthalpyH: 0, entropyS: 0}, + "AC/UA": {enthalpyH: -2.3, entropyS: -5.5}, + "AC/UC": {enthalpyH: 6, entropyS: 21.6}, + "AC/UG": {enthalpyH: -2.3, entropyS: -5.5}, + "AC/UU": {enthalpyH: -0.3, entropyS: 1.3}, + "AG/AA": {enthalpyH: 0, entropyS: 0}, + "AG/AC": {enthalpyH: 0, entropyS: 0}, + "AG/AG": {enthalpyH: 0, entropyS: 0}, + "AG/AU": {enthalpyH: 0, entropyS: 0}, + "AG/CA": {enthalpyH: 0, entropyS: 0}, + "AG/CC": {enthalpyH: 0, entropyS: 0}, + "AG/CG": {enthalpyH: 0, entropyS: 0}, + "AG/CU": {enthalpyH: 0, entropyS: 0}, + "AG/GA": {enthalpyH: 0, entropyS: 0}, + "AG/GC": {enthalpyH: 0, entropyS: 0}, + "AG/GG": {enthalpyH: 0, entropyS: 0}, + "AG/GU": {enthalpyH: 0, entropyS: 0}, + "AG/UA": {enthalpyH: -3.1, entropyS: -7.4}, + "AG/UC": {enthalpyH: 2, entropyS: 9.7}, + "AG/UG": {enthalpyH: -3.5, entropyS: -8.7}, + "AG/UU": {enthalpyH: 2, entropyS: 9.7}, + "AU/AA": {enthalpyH: 0, entropyS: 0}, + "AU/AC": {enthalpyH: 0, entropyS: 0}, + "AU/AG": {enthalpyH: 0, entropyS: 0}, + "AU/AU": {enthalpyH: 0, entropyS: 0}, + "AU/CA": {enthalpyH: 0, entropyS: 0}, + "AU/CC": {enthalpyH: 0, entropyS: 0}, + "AU/CG": {enthalpyH: 0, entropyS: 0}, + "AU/CU": {enthalpyH: 0, entropyS: 0}, + "AU/GA": {enthalpyH: 0, entropyS: 0}, + "AU/GC": {enthalpyH: 0, entropyS: 0}, + "AU/GG": {enthalpyH: 0, entropyS: 0}, + "AU/GU": {enthalpyH: 0, entropyS: 0}, + "AU/UA": {enthalpyH: -2.3, entropyS: -5.5}, + "AU/UC": {enthalpyH: 4.6, entropyS: 17.4}, + "AU/UG": {enthalpyH: -2.3, entropyS: -5.5}, + "AU/UU": {enthalpyH: -1.7, entropyS: -2.9}, + "CA/AA": {enthalpyH: 0, entropyS: 0}, + "CA/AC": {enthalpyH: 0, entropyS: 0}, + "CA/AG": {enthalpyH: 0, entropyS: 0}, + "CA/AU": {enthalpyH: 0, entropyS: 0}, + "CA/CA": {enthalpyH: 0, entropyS: 0}, + "CA/CC": {enthalpyH: 0, entropyS: 0}, + "CA/CG": {enthalpyH: 0, entropyS: 0}, + "CA/CU": {enthalpyH: 0, entropyS: 0}, + "CA/GA": {enthalpyH: -9.1, entropyS: -24.5}, + "CA/GC": {enthalpyH: -5.6, entropyS: -13.2}, + "CA/GG": {enthalpyH: -5.6, entropyS: -13.5}, + "CA/GU": {enthalpyH: -5.6, entropyS: -13.2}, + "CA/UA": {enthalpyH: 0, entropyS: 0}, + "CA/UC": {enthalpyH: 0, entropyS: 0}, + "CA/UG": {enthalpyH: 0, entropyS: 0}, + "CA/UU": {enthalpyH: 0, entropyS: 0}, + "CC/AA": {enthalpyH: 0, entropyS: 0}, + "CC/AC": {enthalpyH: 0, entropyS: 0}, + "CC/AG": {enthalpyH: 0, entropyS: 0}, + "CC/AU": {enthalpyH: 0, entropyS: 0}, + "CC/CA": {enthalpyH: 0, entropyS: 0}, + "CC/CC": {enthalpyH: 0, entropyS: 0}, + "CC/CG": {enthalpyH: 0, entropyS: 0}, + "CC/CU": {enthalpyH: 0, entropyS: 0}, + "CC/GA": {enthalpyH: -5.7, entropyS: -15.2}, + "CC/GC": {enthalpyH: -3.4, entropyS: -7.4}, + "CC/GG": {enthalpyH: -5.7, entropyS: -15.2}, + "CC/GU": {enthalpyH: -2.7, entropyS: -6.1}, + "CC/UA": {enthalpyH: 0, entropyS: 0}, + "CC/UC": {enthalpyH: 0, entropyS: 0}, + "CC/UG": {enthalpyH: 0, entropyS: 0}, + "CC/UU": {enthalpyH: 0, entropyS: 0}, + "CG/AA": {enthalpyH: 0, entropyS: 0}, + "CG/AC": {enthalpyH: 0, entropyS: 0}, + "CG/AG": {enthalpyH: 0, entropyS: 0}, + "CG/AU": {enthalpyH: 0, entropyS: 0}, + "CG/CA": {enthalpyH: 0, entropyS: 0}, + "CG/CC": {enthalpyH: 0, entropyS: 0}, + "CG/CG": {enthalpyH: 0, entropyS: 0}, + "CG/CU": {enthalpyH: 0, entropyS: 0}, + "CG/GA": {enthalpyH: -8.2, entropyS: -21.9}, + "CG/GC": {enthalpyH: -5.6, entropyS: -13.2}, + "CG/GG": {enthalpyH: -9.2, entropyS: -24.5}, + "CG/GU": {enthalpyH: -5.6, entropyS: -13.2}, + "CG/UA": {enthalpyH: 0, entropyS: 0}, + "CG/UC": {enthalpyH: 0, entropyS: 0}, + "CG/UG": {enthalpyH: 0, entropyS: 0}, + "CG/UU": {enthalpyH: 0, entropyS: 0}, + "CU/AA": {enthalpyH: 0, entropyS: 0}, + "CU/AC": {enthalpyH: 0, entropyS: 0}, + "CU/AG": {enthalpyH: 0, entropyS: 0}, + "CU/AU": {enthalpyH: 0, entropyS: 0}, + "CU/CA": {enthalpyH: 0, entropyS: 0}, + "CU/CC": {enthalpyH: 0, entropyS: 0}, + "CU/CG": {enthalpyH: 0, entropyS: 0}, + "CU/CU": {enthalpyH: 0, entropyS: 0}, + "CU/GA": {enthalpyH: -5.7, entropyS: -15.2}, + "CU/GC": {enthalpyH: -5.3, entropyS: -12.6}, + "CU/GG": {enthalpyH: -5.7, entropyS: -15.2}, + "CU/GU": {enthalpyH: -8.6, entropyS: -23.9}, + "CU/UA": {enthalpyH: 0, entropyS: 0}, + "CU/UC": {enthalpyH: 0, entropyS: 0}, + "CU/UG": {enthalpyH: 0, entropyS: 0}, + "CU/UU": {enthalpyH: 0, entropyS: 0}, + "GA/AA": {enthalpyH: 0, entropyS: 0}, + "GA/AC": {enthalpyH: 0, entropyS: 0}, + "GA/AG": {enthalpyH: 0, entropyS: 0}, + "GA/AU": {enthalpyH: 0, entropyS: 0}, + "GA/CA": {enthalpyH: -5.2, entropyS: -13.2}, + "GA/CC": {enthalpyH: -4, entropyS: -8.1}, + "GA/CG": {enthalpyH: -5.6, entropyS: -13.9}, + "GA/CU": {enthalpyH: -4, entropyS: -8.1}, + "GA/GA": {enthalpyH: 0, entropyS: 0}, + "GA/GC": {enthalpyH: 0, entropyS: 0}, + "GA/GG": {enthalpyH: 0, entropyS: 0}, + "GA/GU": {enthalpyH: 0, entropyS: 0}, + "GA/UA": {enthalpyH: -3.4, entropyS: -10}, + "GA/UC": {enthalpyH: 2, entropyS: 9.7}, + "GA/UG": {enthalpyH: -3.5, entropyS: -8.7}, + "GA/UU": {enthalpyH: 2, entropyS: 9.7}, + "GC/AA": {enthalpyH: 0, entropyS: 0}, + "GC/AC": {enthalpyH: 0, entropyS: 0}, + "GC/AG": {enthalpyH: 0, entropyS: 0}, + "GC/AU": {enthalpyH: 0, entropyS: 0}, + "GC/CA": {enthalpyH: -7.2, entropyS: -19.7}, + "GC/CC": {enthalpyH: 0.5, entropyS: 3.9}, + "GC/CG": {enthalpyH: -7.2, entropyS: -19.7}, + "GC/CU": {enthalpyH: -4.2, entropyS: -11.9}, + "GC/GA": {enthalpyH: 0, entropyS: 0}, + "GC/GC": {enthalpyH: 0, entropyS: 0}, + "GC/GG": {enthalpyH: 0, entropyS: 0}, + "GC/GU": {enthalpyH: 0, entropyS: 0}, + "GC/UA": {enthalpyH: -2.3, entropyS: -5.5}, + "GC/UC": {enthalpyH: 6, entropyS: 21.6}, + "GC/UG": {enthalpyH: -2.3, entropyS: -5.5}, + "GC/UU": {enthalpyH: -0.3, entropyS: 1.3}, + "GG/AA": {enthalpyH: 0, entropyS: 0}, + "GG/AC": {enthalpyH: 0, entropyS: 0}, + "GG/AG": {enthalpyH: 0, entropyS: 0}, + "GG/AU": {enthalpyH: 0, entropyS: 0}, + "GG/CA": {enthalpyH: -7.1, entropyS: -17.7}, + "GG/CC": {enthalpyH: -4, entropyS: -8.1}, + "GG/CG": {enthalpyH: -6.2, entropyS: -15.5}, + "GG/CU": {enthalpyH: -4, entropyS: -8.1}, + "GG/GA": {enthalpyH: 0, entropyS: 0}, + "GG/GC": {enthalpyH: 0, entropyS: 0}, + "GG/GG": {enthalpyH: 0, entropyS: 0}, + "GG/GU": {enthalpyH: 0, entropyS: 0}, + "GG/UA": {enthalpyH: -0.6, entropyS: 0}, + "GG/UC": {enthalpyH: 2, entropyS: 9.7}, + "GG/UG": {enthalpyH: -3.5, entropyS: -8.7}, + "GG/UU": {enthalpyH: 2, entropyS: 9.7}, + "GU/AA": {enthalpyH: 0, entropyS: 0}, + "GU/AC": {enthalpyH: 0, entropyS: 0}, + "GU/AG": {enthalpyH: 0, entropyS: 0}, + "GU/AU": {enthalpyH: 0, entropyS: 0}, + "GU/CA": {enthalpyH: -7.2, entropyS: -19.7}, + "GU/CC": {enthalpyH: -0.3, entropyS: 2.3}, + "GU/CG": {enthalpyH: -7.2, entropyS: -19.7}, + "GU/CU": {enthalpyH: -5, entropyS: -13.9}, + "GU/GA": {enthalpyH: 0, entropyS: 0}, + "GU/GC": {enthalpyH: 0, entropyS: 0}, + "GU/GG": {enthalpyH: 0, entropyS: 0}, + "GU/GU": {enthalpyH: 0, entropyS: 0}, + "GU/UA": {enthalpyH: -2.3, entropyS: -5.5}, + "GU/UC": {enthalpyH: 4.6, entropyS: 17.4}, + "GU/UG": {enthalpyH: -2.3, entropyS: -5.5}, + "GU/UU": {enthalpyH: 1.6, entropyS: 7.1}, + "UA/AA": {enthalpyH: -4, entropyS: -9.7}, + "UA/AC": {enthalpyH: -6.3, entropyS: -17.7}, + "UA/AG": {enthalpyH: -8.9, entropyS: -25.1}, + "UA/AU": {enthalpyH: -6.3, entropyS: -17.7}, + "UA/CA": {enthalpyH: 0, entropyS: 0}, + "UA/CC": {enthalpyH: 0, entropyS: 0}, + "UA/CG": {enthalpyH: 0, entropyS: 0}, + "UA/CU": {enthalpyH: 0, entropyS: 0}, + "UA/GA": {enthalpyH: -4.8, entropyS: -12.3}, + "UA/GC": {enthalpyH: -6.3, entropyS: -17.7}, + "UA/GG": {enthalpyH: -8.9, entropyS: -25.1}, + "UA/GU": {enthalpyH: -6.3, entropyS: -17.7}, + "UA/UA": {enthalpyH: 0, entropyS: 0}, + "UA/UC": {enthalpyH: 0, entropyS: 0}, + "UA/UG": {enthalpyH: 0, entropyS: 0}, + "UA/UU": {enthalpyH: 0, entropyS: 0}, + "UC/AA": {enthalpyH: -4.3, entropyS: -11.6}, + "UC/AC": {enthalpyH: -5.1, entropyS: -14.5}, + "UC/AG": {enthalpyH: -4.3, entropyS: -11.6}, + "UC/AU": {enthalpyH: -1.8, entropyS: -4.2}, + "UC/CA": {enthalpyH: 0, entropyS: 0}, + "UC/CC": {enthalpyH: 0, entropyS: 0}, + "UC/CG": {enthalpyH: 0, entropyS: 0}, + "UC/CU": {enthalpyH: 0, entropyS: 0}, + "UC/GA": {enthalpyH: -4.3, entropyS: -11.6}, + "UC/GC": {enthalpyH: -5.1, entropyS: -14.5}, + "UC/GG": {enthalpyH: -4.3, entropyS: -11.6}, + "UC/GU": {enthalpyH: -1.8, entropyS: -4.2}, + "UC/UA": {enthalpyH: 0, entropyS: 0}, + "UC/UC": {enthalpyH: 0, entropyS: 0}, + "UC/UG": {enthalpyH: 0, entropyS: 0}, + "UC/UU": {enthalpyH: 0, entropyS: 0}, + "UG/AA": {enthalpyH: -3.8, entropyS: -8.7}, + "UG/AC": {enthalpyH: -6.3, entropyS: -17.7}, + "UG/AG": {enthalpyH: -8.9, entropyS: -24.8}, + "UG/AU": {enthalpyH: -6.3, entropyS: -17.7}, + "UG/CA": {enthalpyH: 0, entropyS: 0}, + "UG/CC": {enthalpyH: 0, entropyS: 0}, + "UG/CG": {enthalpyH: 0, entropyS: 0}, + "UG/CU": {enthalpyH: 0, entropyS: 0}, + "UG/GA": {enthalpyH: 3.1, entropyS: 11.6}, + "UG/GC": {enthalpyH: -6.3, entropyS: -17.7}, + "UG/GG": {enthalpyH: -1.5, entropyS: -2.3}, + "UG/GU": {enthalpyH: -6.3, entropyS: -17.7}, + "UG/UA": {enthalpyH: 0, entropyS: 0}, + "UG/UC": {enthalpyH: 0, entropyS: 0}, + "UG/UG": {enthalpyH: 0, entropyS: 0}, + "UG/UU": {enthalpyH: 0, entropyS: 0}, + "UU/AA": {enthalpyH: -4.3, entropyS: -11.6}, + "UU/AC": {enthalpyH: -1.4, entropyS: -2.6}, + "UU/AG": {enthalpyH: -4.3, entropyS: -11.6}, + "UU/AU": {enthalpyH: 1.4, entropyS: 6.1}, + "UU/CA": {enthalpyH: 0, entropyS: 0}, + "UU/CC": {enthalpyH: 0, entropyS: 0}, + "UU/CG": {enthalpyH: 0, entropyS: 0}, + "UU/CU": {enthalpyH: 0, entropyS: 0}, + "UU/GA": {enthalpyH: -4.3, entropyS: -11.6}, + "UU/GC": {enthalpyH: -1.4, entropyS: -2.6}, + "UU/GG": {enthalpyH: -4.3, entropyS: -11.6}, + "UU/GU": {enthalpyH: 1.4, entropyS: 6.1}, + "UU/UA": {enthalpyH: 0, entropyS: 0}, + "UU/UC": {enthalpyH: 0, entropyS: 0}, + "UU/UG": {enthalpyH: 0, entropyS: 0}, + "UU/UU": {enthalpyH: 0, entropyS: 0}, +} + +var rnaDanglingEnds = matchingBasepairEnergy{ + ".A/AA": {enthalpyH: 0, entropyS: 0}, + ".A/AC": {enthalpyH: 0, entropyS: 0}, + ".A/AG": {enthalpyH: 0, entropyS: 0}, + ".A/AU": {enthalpyH: -5.7, entropyS: -16.1}, + ".A/CA": {enthalpyH: 0, entropyS: 0}, + ".A/CC": {enthalpyH: 0, entropyS: 0}, + ".A/CG": {enthalpyH: 0, entropyS: 0}, + ".A/CU": {enthalpyH: -0.7, entropyS: -1.9}, + ".A/GA": {enthalpyH: 0, entropyS: 0}, + ".A/GC": {enthalpyH: 0, entropyS: 0}, + ".A/GG": {enthalpyH: 0, entropyS: 0}, + ".A/GU": {enthalpyH: -5.8, entropyS: -16.4}, + ".A/UA": {enthalpyH: 0, entropyS: 0}, + ".A/UC": {enthalpyH: 0, entropyS: 0}, + ".A/UG": {enthalpyH: 0, entropyS: 0}, + ".A/UU": {enthalpyH: -2.2, entropyS: -6.8}, + ".C/AA": {enthalpyH: 0, entropyS: 0}, + ".C/AC": {enthalpyH: 0, entropyS: 0}, + ".C/AG": {enthalpyH: -7.4, entropyS: -20.3}, + ".C/AU": {enthalpyH: 0, entropyS: 0}, + ".C/CA": {enthalpyH: 0, entropyS: 0}, + ".C/CC": {enthalpyH: 0, entropyS: 0}, + ".C/CG": {enthalpyH: -2.8, entropyS: -7.7}, + ".C/CU": {enthalpyH: 0, entropyS: 0}, + ".C/GA": {enthalpyH: 0, entropyS: 0}, + ".C/GC": {enthalpyH: 0, entropyS: 0}, + ".C/GG": {enthalpyH: -6.4, entropyS: -16.4}, + ".C/GU": {enthalpyH: 0, entropyS: 0}, + ".C/UA": {enthalpyH: 0, entropyS: 0}, + ".C/UC": {enthalpyH: 0, entropyS: 0}, + ".C/UG": {enthalpyH: -3.6, entropyS: -9.7}, + ".C/UU": {enthalpyH: 0, entropyS: 0}, + ".G/AA": {enthalpyH: 0, entropyS: 0}, + ".G/AC": {enthalpyH: -9, entropyS: -23.5}, + ".G/AG": {enthalpyH: 0, entropyS: 0}, + ".G/AU": {enthalpyH: -5.7, entropyS: -16.1}, + ".G/CA": {enthalpyH: 0, entropyS: 0}, + ".G/CC": {enthalpyH: -4.1, entropyS: -10.6}, + ".G/CG": {enthalpyH: 0, entropyS: 0}, + ".G/CU": {enthalpyH: -0.7, entropyS: -1.9}, + ".G/GA": {enthalpyH: 0, entropyS: 0}, + ".G/GC": {enthalpyH: -8.6, entropyS: -22.2}, + ".G/GG": {enthalpyH: 0, entropyS: 0}, + ".G/GU": {enthalpyH: -5.8, entropyS: -16.4}, + ".G/UA": {enthalpyH: 0, entropyS: 0}, + ".G/UC": {enthalpyH: -7.5, entropyS: -20.3}, + ".G/UG": {enthalpyH: 0, entropyS: 0}, + ".G/UU": {enthalpyH: -2.2, entropyS: -6.8}, + ".U/AA": {enthalpyH: -4.9, entropyS: -13.2}, + ".U/AC": {enthalpyH: 0, entropyS: 0}, + ".U/AG": {enthalpyH: -4.9, entropyS: -13.2}, + ".U/AU": {enthalpyH: 0, entropyS: 0}, + ".U/CA": {enthalpyH: -0.9, entropyS: -1.3}, + ".U/CC": {enthalpyH: 0, entropyS: 0}, + ".U/CG": {enthalpyH: -0.9, entropyS: -1.3}, + ".U/CU": {enthalpyH: 0, entropyS: 0}, + ".U/GA": {enthalpyH: -5.5, entropyS: -15.2}, + ".U/GC": {enthalpyH: 0, entropyS: 0}, + ".U/GG": {enthalpyH: -5.5, entropyS: -15.2}, + ".U/GU": {enthalpyH: 0, entropyS: 0}, + ".U/UA": {enthalpyH: -2.3, entropyS: -5.5}, + ".U/UC": {enthalpyH: 0, entropyS: 0}, + ".U/UG": {enthalpyH: -2.3, entropyS: -5.5}, + ".U/UU": {enthalpyH: 0, entropyS: 0}, + "A./AA": {enthalpyH: 0, entropyS: 0}, + "A./AC": {enthalpyH: 0, entropyS: 0}, + "A./AG": {enthalpyH: 0, entropyS: 0}, + "A./AU": {enthalpyH: 0, entropyS: 0}, + "A./CA": {enthalpyH: 0, entropyS: 0}, + "A./CC": {enthalpyH: 0, entropyS: 0}, + "A./CG": {enthalpyH: 0, entropyS: 0}, + "A./CU": {enthalpyH: 0, entropyS: 0}, + "A./GA": {enthalpyH: 0, entropyS: 0}, + "A./GC": {enthalpyH: 0, entropyS: 0}, + "A./GG": {enthalpyH: 0, entropyS: 0}, + "A./GU": {enthalpyH: 0, entropyS: 0}, + "A./UA": {enthalpyH: -0.5, entropyS: -0.6}, + "A./UC": {enthalpyH: 6.9, entropyS: 22.6}, + "A./UG": {enthalpyH: 0.6, entropyS: 2.6}, + "A./UU": {enthalpyH: 0.6, entropyS: 2.6}, + "AA/.A": {enthalpyH: 0, entropyS: 0}, + "AA/.C": {enthalpyH: 0, entropyS: 0}, + "AA/.G": {enthalpyH: 0, entropyS: 0}, + "AA/.U": {enthalpyH: 1.6, entropyS: 6.1}, + "AA/A.": {enthalpyH: 0, entropyS: 0}, + "AA/C.": {enthalpyH: 0, entropyS: 0}, + "AA/G.": {enthalpyH: 0, entropyS: 0}, + "AA/U.": {enthalpyH: -4.9, entropyS: -13.2}, + "AC/.A": {enthalpyH: 0, entropyS: 0}, + "AC/.C": {enthalpyH: 0, entropyS: 0}, + "AC/.G": {enthalpyH: -2.4, entropyS: -6.1}, + "AC/.U": {enthalpyH: 0, entropyS: 0}, + "AC/A.": {enthalpyH: 0, entropyS: 0}, + "AC/C.": {enthalpyH: 0, entropyS: 0}, + "AC/G.": {enthalpyH: 0, entropyS: 0}, + "AC/U.": {enthalpyH: -0.9, entropyS: -1.3}, + "AG/.A": {enthalpyH: 0, entropyS: 0}, + "AG/.C": {enthalpyH: -1.6, entropyS: -4.5}, + "AG/.G": {enthalpyH: 0, entropyS: 0}, + "AG/.U": {enthalpyH: 1.6, entropyS: 6.1}, + "AG/A.": {enthalpyH: 0, entropyS: 0}, + "AG/C.": {enthalpyH: 0, entropyS: 0}, + "AG/G.": {enthalpyH: 0, entropyS: 0}, + "AG/U.": {enthalpyH: -5.5, entropyS: -15.2}, + "AU/.A": {enthalpyH: -0.5, entropyS: -0.6}, + "AU/.C": {enthalpyH: 0, entropyS: 0}, + "AU/.G": {enthalpyH: -0.5, entropyS: -0.6}, + "AU/.U": {enthalpyH: 0, entropyS: 0}, + "AU/A.": {enthalpyH: 0, entropyS: 0}, + "AU/C.": {enthalpyH: 0, entropyS: 0}, + "AU/G.": {enthalpyH: 0, entropyS: 0}, + "AU/U.": {enthalpyH: -2.3, entropyS: -5.5}, + "C./AA": {enthalpyH: 0, entropyS: 0}, + "C./AC": {enthalpyH: 0, entropyS: 0}, + "C./AG": {enthalpyH: 0, entropyS: 0}, + "C./AU": {enthalpyH: 0, entropyS: 0}, + "C./CA": {enthalpyH: 0, entropyS: 0}, + "C./CC": {enthalpyH: 0, entropyS: 0}, + "C./CG": {enthalpyH: 0, entropyS: 0}, + "C./CU": {enthalpyH: 0, entropyS: 0}, + "C./GA": {enthalpyH: -1.6, entropyS: -4.5}, + "C./GC": {enthalpyH: 0.7, entropyS: 3.2}, + "C./GG": {enthalpyH: -4.6, entropyS: -14.8}, + "C./GU": {enthalpyH: -0.4, entropyS: -1.3}, + "C./UA": {enthalpyH: 0, entropyS: 0}, + "C./UC": {enthalpyH: 0, entropyS: 0}, + "C./UG": {enthalpyH: 0, entropyS: 0}, + "C./UU": {enthalpyH: 0, entropyS: 0}, + "CA/.A": {enthalpyH: 0, entropyS: 0}, + "CA/.C": {enthalpyH: 0, entropyS: 0}, + "CA/.G": {enthalpyH: 0, entropyS: 0}, + "CA/.U": {enthalpyH: 2.2, entropyS: 8.1}, + "CA/A.": {enthalpyH: 0, entropyS: 0}, + "CA/C.": {enthalpyH: 0, entropyS: 0}, + "CA/G.": {enthalpyH: -9, entropyS: -23.5}, + "CA/U.": {enthalpyH: 0, entropyS: 0}, + "CC/.A": {enthalpyH: 0, entropyS: 0}, + "CC/.C": {enthalpyH: 0, entropyS: 0}, + "CC/.G": {enthalpyH: 3.3, entropyS: 11.6}, + "CC/.U": {enthalpyH: 0, entropyS: 0}, + "CC/A.": {enthalpyH: 0, entropyS: 0}, + "CC/C.": {enthalpyH: 0, entropyS: 0}, + "CC/G.": {enthalpyH: -4.1, entropyS: -10.6}, + "CC/U.": {enthalpyH: 0, entropyS: 0}, + "CG/.A": {enthalpyH: 0, entropyS: 0}, + "CG/.C": {enthalpyH: 0.7, entropyS: 3.2}, + "CG/.G": {enthalpyH: 0, entropyS: 0}, + "CG/.U": {enthalpyH: 2.2, entropyS: 8.1}, + "CG/A.": {enthalpyH: 0, entropyS: 0}, + "CG/C.": {enthalpyH: 0, entropyS: 0}, + "CG/G.": {enthalpyH: -8.6, entropyS: -22.2}, + "CG/U.": {enthalpyH: 0, entropyS: 0}, + "CU/.A": {enthalpyH: 6.9, entropyS: 22.6}, + "CU/.C": {enthalpyH: 0, entropyS: 0}, + "CU/.G": {enthalpyH: 6.9, entropyS: 22.6}, + "CU/.U": {enthalpyH: 0, entropyS: 0}, + "CU/A.": {enthalpyH: 0, entropyS: 0}, + "CU/C.": {enthalpyH: 0, entropyS: 0}, + "CU/G.": {enthalpyH: -7.5, entropyS: -20.3}, + "CU/U.": {enthalpyH: 0, entropyS: 0}, + "G./AA": {enthalpyH: 0, entropyS: 0}, + "G./AC": {enthalpyH: 0, entropyS: 0}, + "G./AG": {enthalpyH: 0, entropyS: 0}, + "G./AU": {enthalpyH: 0, entropyS: 0}, + "G./CA": {enthalpyH: -2.4, entropyS: -6.1}, + "G./CC": {enthalpyH: 3.3, entropyS: 11.6}, + "G./CG": {enthalpyH: 0.8, entropyS: 3.2}, + "G./CU": {enthalpyH: -1.4, entropyS: -4.2}, + "G./GA": {enthalpyH: 0, entropyS: 0}, + "G./GC": {enthalpyH: 0, entropyS: 0}, + "G./GG": {enthalpyH: 0, entropyS: 0}, + "G./GU": {enthalpyH: 0, entropyS: 0}, + "G./UA": {enthalpyH: -0.5, entropyS: -0.6}, + "G./UC": {enthalpyH: 6.9, entropyS: 22.6}, + "G./UG": {enthalpyH: 0.6, entropyS: 2.6}, + "G./UU": {enthalpyH: 0.6, entropyS: 2.6}, + "GA/.A": {enthalpyH: 0, entropyS: 0}, + "GA/.C": {enthalpyH: 0, entropyS: 0}, + "GA/.G": {enthalpyH: 0, entropyS: 0}, + "GA/.U": {enthalpyH: 0.7, entropyS: 3.5}, + "GA/A.": {enthalpyH: 0, entropyS: 0}, + "GA/C.": {enthalpyH: -7.4, entropyS: -20.3}, + "GA/G.": {enthalpyH: 0, entropyS: 0}, + "GA/U.": {enthalpyH: -4.9, entropyS: -13.2}, + "GC/.A": {enthalpyH: 0, entropyS: 0}, + "GC/.C": {enthalpyH: 0, entropyS: 0}, + "GC/.G": {enthalpyH: 0.8, entropyS: 3.2}, + "GC/.U": {enthalpyH: 0, entropyS: 0}, + "GC/A.": {enthalpyH: 0, entropyS: 0}, + "GC/C.": {enthalpyH: -2.8, entropyS: -7.7}, + "GC/G.": {enthalpyH: 0, entropyS: 0}, + "GC/U.": {enthalpyH: -0.9, entropyS: -1.3}, + "GG/.A": {enthalpyH: 0, entropyS: 0}, + "GG/.C": {enthalpyH: -4.6, entropyS: -14.8}, + "GG/.G": {enthalpyH: 0, entropyS: 0}, + "GG/.U": {enthalpyH: 0.7, entropyS: 3.5}, + "GG/A.": {enthalpyH: 0, entropyS: 0}, + "GG/C.": {enthalpyH: -6.4, entropyS: -16.4}, + "GG/G.": {enthalpyH: 0, entropyS: 0}, + "GG/U.": {enthalpyH: -5.5, entropyS: -15.2}, + "GU/.A": {enthalpyH: 0.6, entropyS: 2.6}, + "GU/.C": {enthalpyH: 0, entropyS: 0}, + "GU/.G": {enthalpyH: 0.6, entropyS: 2.6}, + "GU/.U": {enthalpyH: 0, entropyS: 0}, + "GU/A.": {enthalpyH: 0, entropyS: 0}, + "GU/C.": {enthalpyH: -3.6, entropyS: -9.7}, + "GU/G.": {enthalpyH: 0, entropyS: 0}, + "GU/U.": {enthalpyH: -2.3, entropyS: -5.5}, + "U./AA": {enthalpyH: 1.6, entropyS: 6.1}, + "U./AC": {enthalpyH: 2.2, entropyS: 8.1}, + "U./AG": {enthalpyH: 0.7, entropyS: 3.5}, + "U./AU": {enthalpyH: 3.1, entropyS: 10.6}, + "U./CA": {enthalpyH: 0, entropyS: 0}, + "U./CC": {enthalpyH: 0, entropyS: 0}, + "U./CG": {enthalpyH: 0, entropyS: 0}, + "U./CU": {enthalpyH: 0, entropyS: 0}, + "U./GA": {enthalpyH: 1.6, entropyS: 6.1}, + "U./GC": {enthalpyH: 2.2, entropyS: 8.1}, + "U./GG": {enthalpyH: 0.7, entropyS: 3.5}, + "U./GU": {enthalpyH: 3.1, entropyS: 10.6}, + "U./UA": {enthalpyH: 0, entropyS: 0}, + "U./UC": {enthalpyH: 0, entropyS: 0}, + "U./UG": {enthalpyH: 0, entropyS: 0}, + "U./UU": {enthalpyH: 0, entropyS: 0}, + "UA/.A": {enthalpyH: 0, entropyS: 0}, + "UA/.C": {enthalpyH: 0, entropyS: 0}, + "UA/.G": {enthalpyH: 0, entropyS: 0}, + "UA/.U": {enthalpyH: 3.1, entropyS: 10.6}, + "UA/A.": {enthalpyH: -5.7, entropyS: -16.1}, + "UA/C.": {enthalpyH: 0, entropyS: 0}, + "UA/G.": {enthalpyH: -5.7, entropyS: -16.1}, + "UA/U.": {enthalpyH: 0, entropyS: 0}, + "UC/.A": {enthalpyH: 0, entropyS: 0}, + "UC/.C": {enthalpyH: 0, entropyS: 0}, + "UC/.G": {enthalpyH: -1.4, entropyS: -4.2}, + "UC/.U": {enthalpyH: 0, entropyS: 0}, + "UC/A.": {enthalpyH: -0.7, entropyS: -1.9}, + "UC/C.": {enthalpyH: 0, entropyS: 0}, + "UC/G.": {enthalpyH: -0.7, entropyS: -1.9}, + "UC/U.": {enthalpyH: 0, entropyS: 0}, + "UG/.A": {enthalpyH: 0, entropyS: 0}, + "UG/.C": {enthalpyH: -0.4, entropyS: -1.3}, + "UG/.G": {enthalpyH: 0, entropyS: 0}, + "UG/.U": {enthalpyH: 3.1, entropyS: 10.6}, + "UG/A.": {enthalpyH: -5.8, entropyS: -16.4}, + "UG/C.": {enthalpyH: 0, entropyS: 0}, + "UG/G.": {enthalpyH: -5.8, entropyS: -16.4}, + "UG/U.": {enthalpyH: 0, entropyS: 0}, + "UU/.A": {enthalpyH: 0.6, entropyS: 2.6}, + "UU/.C": {enthalpyH: 0, entropyS: 0}, + "UU/.G": {enthalpyH: 0.6, entropyS: 2.6}, + "UU/.U": {enthalpyH: 0, entropyS: 0}, + "UU/A.": {enthalpyH: -2.2, entropyS: -6.8}, + "UU/C.": {enthalpyH: 0, entropyS: 0}, + "UU/G.": {enthalpyH: -2.2, entropyS: -6.8}, + "UU/U.": {enthalpyH: 0, entropyS: 0}, +} + +var rnaInternalLoops = loopEnergy{ + 1: {0.0, 0.0}, + 2: {0.0, 0.0}, + 3: {0.0, 0.0}, + 4: {-7.2, -26.8}, + 5: {-6.8, -28.4}, + 6: {-1.3, -10.6}, + 7: {-1.3, -11.0}, + 8: {-1.3, -11.6}, + 9: {-1.3, -11.9}, + 10: {-1.3, -12.3}, + 11: {-1.3, -12.6}, + 12: {-1.3, -12.9}, + 13: {-1.3, -13.2}, + 14: {-1.3, -13.5}, + 15: {-1.3, -13.5}, + 16: {-1.3, -13.9}, + 17: {-1.3, -14.2}, + 18: {-1.3, -14.2}, + 19: {-1.3, -14.5}, + 20: {-1.3, -14.8}, + 21: {-1.3, -14.8}, + 22: {-1.3, -15.2}, + 23: {-1.3, -15.2}, + 24: {-1.3, -15.5}, + 25: {-1.3, -15.5}, + 26: {-1.3, -15.5}, + 27: {-1.3, -15.8}, + 28: {-1.3, -15.8}, + 29: {-1.3, -16.1}, + 30: {-1.3, -16.1}, +} + +var rnaBulgeLoops = loopEnergy{ + 1: {10.6, 21.9}, + 2: {7.1, 13.9}, + 3: {7.1, 12.6}, + 4: {7.1, 11.3}, + 5: {7.1, 10.0}, + 6: {7.1, 8.7}, + 7: {7.1, 8.1}, + 8: {7.1, 7.7}, + 9: {7.1, 7.4}, + 10: {7.1, 7.1}, + 11: {7.1, 6.8}, + 12: {7.1, 6.4}, + 13: {7.1, 6.1}, + 14: {7.1, 5.8}, + 15: {7.1, 5.5}, + 16: {7.1, 5.5}, + 17: {7.1, 5.2}, + 18: {7.1, 5.2}, + 19: {7.1, 4.8}, + 20: {7.1, 4.5}, + 21: {7.1, 4.5}, + 22: {7.1, 4.2}, + 23: {7.1, 4.2}, + 24: {7.1, 4.2}, + 25: {7.1, 3.9}, + 26: {7.1, 3.9}, + 27: {7.1, 3.5}, + 28: {7.1, 3.5}, + 29: {7.1, 3.5}, + 30: {7.1, 3.2}, +} + +var rnaHairpinLoops = loopEnergy{ + 1: {0.0, 0.0}, + 2: {0.0, 0.0}, + 3: {1.3, -13.2}, + 4: {4.8, -2.6}, + 5: {3.6, -6.8}, + 6: {-2.9, -26.8}, + 7: {1.3, -15.2}, + 8: {-2.9, -27.1}, + 9: {5.0, -4.5}, + 10: {5.0, -4.8}, + 11: {5.0, -5.2}, + 12: {5.0, -5.5}, + 13: {5.0, -5.8}, + 14: {5.0, -6.1}, + 15: {5.0, -6.1}, + 16: {5.0, -6.4}, + 17: {5.0, -6.8}, + 18: {5.0, -6.8}, + 19: {5.0, -7.1}, + 20: {5.0, -7.1}, + 21: {5.0, -7.4}, + 22: {5.0, -7.4}, + 23: {5.0, -7.7}, + 24: {5.0, -7.7}, + 25: {5.0, -8.1}, + 26: {5.0, -8.1}, + 27: {5.0, -8.1}, + 28: {5.0, -8.4}, + 29: {5.0, -8.4}, + 30: {5.0, -8.7}, +} + +var rnaEnergies = energies{ + bulgeLoops: rnaBulgeLoops, + complement: rnaComplement, + danglingEnds: rnaDanglingEnds, + hairpinLoops: rnaHairpinLoops, + multibranch: rnaMultibranch, + internalLoops: rnaInternalLoops, + internalMismatches: rnaInternalMismatches, + nearestNeighbors: rnaNearestNeighbors, + terminalMismatches: rnaTerminalMismatches, + triTetraLoops: nil, +} diff --git a/fold/seqfold.go b/fold/seqfold.go new file mode 100644 index 00000000..35cb6724 --- /dev/null +++ b/fold/seqfold.go @@ -0,0 +1,248 @@ +package fold + +import ( + "fmt" + "math" + "strings" + + "github.com/TimothyStiles/poly/checks" +) + +const ( + // When calculating the paired minimum free energy if the basepair is isolated, + // and the seq large, penalize at 1,600 kcal/mol heuristic for speeding this up + // from https://www.ncbi.nlm.nih.gov/pubmed/10329189 + isolatedBasePairPenalty = 1600 + // maxLenPreCalulated is the length limit of the pre-calculated energies + // for structures, outside of that range the Jacobson-Stockmayer formula + // is used see the jacobsonStockmayer() function. + maxLenPreCalulated = 30 + + // minLenForStruct is the minimum length of a nucletide sequence that can + // create a structure, pentanucleotide sequences form no stable structure + minLenForStruct = 4 + + // loopsAsymmetryPenalty is an energy penalty added for interior loops if + // the left loop size differs from the right loop size + // Formula 12 from SantaLucia, 2004 + loopsAsymmetryPenalty = 0.3 + // closingATPenalty is the energy penalty applied to strucfures closed by + // an AT basepair. + // Formula 8 from SantaLucia, 2004 + closingATPenalty = 0.5 +) + +/* +multibranchEnergies holds the a, b, c, d in a linear multi-branch energy + +change function. + +A - number of helices in the loop +B - number of unpaired nucleotides in the loop +C - coxial stacking in the loop +D - terminal mismatch contributions +E - base composition of the unpaired nucleotides (probably neglible?) +Inferred from: +Supplemental Material: Annu.Rev.Biophs.Biomol.Struct.33:415-40 +doi: 10.1146/annurev.biophys.32.110601.141800 +The Thermodynamics of DNA Structural Motifs, SantaLucia and Hicks, 2004 +*/ +type multibranchEnergies struct { + helicesCount, unpairedCount, coaxialStackCount, terminalMismatchCount float64 +} + +// energy holds two energies, enthaply and entropy +// SantaLucia & Hicks (2004), Annu. Rev. Biophys. Biomol. Struct 33: 415-440 +type energy struct { + // enthalpy + enthalpyH float64 + // entropy + entropyS float64 +} + +// matchingBasepairEnergy is the energy of matching base pairs +type matchingBasepairEnergy map[string]energy + +// loopEnergy is a map[int]Energy where the int is the length of the loop +type loopEnergy map[int]energy + +// complementFunc is function to translate a base in its complement +type complementFunc func(rune) rune + +// energies holds the needed energy maps, BpEnergy and loopEnergy, to compute +// the folding, it also holds the complement map for the kind of sequence, rna +// or DNA +type energies struct { + bulgeLoops loopEnergy + complement complementFunc + danglingEnds matchingBasepairEnergy + hairpinLoops loopEnergy + multibranch multibranchEnergies + internalLoops loopEnergy + internalMismatches matchingBasepairEnergy + nearestNeighbors matchingBasepairEnergy + terminalMismatches matchingBasepairEnergy + triTetraLoops matchingBasepairEnergy +} + +// subsequence represent an interval of bases in the sequence that can contain +// a inward structure. +type subsequence struct { + start, end int +} + +// nucleicAcidStructure is single structure with a free energy, description, and inward children +type nucleicAcidStructure struct { + // description is the description of the nucleicAcidStructure + description string + // inner is list of inner structure represented as intervals in the + // sequence + inner []subsequence + // energy is the energy of the nucleicAcidStructure + energy float64 +} + +// Equal returns true if two nucleicAcidStructures are equal +func (structure nucleicAcidStructure) Equal(other nucleicAcidStructure) bool { + if len(structure.inner) != len(other.inner) { + return false + } + for i, val := range structure.inner { + if val != other.inner[i] { + return false + } + } + return structure.energy == other.energy +} + +// Valid returns true if the NucleicAcidStructure is valid +func (structure nucleicAcidStructure) Valid() bool { + return structure.energy != math.Inf(1) && structure.energy != math.Inf(-1) +} + +// defaultStructure is the default (zero value) nucleic acid structure, it used +// mostly to initialize the caches, see Context +var defaultStructure = nucleicAcidStructure{ + description: "", + energy: math.Inf(-1), +} + +// invalidStructure represent an invalid nucleic acid structure +var invalidStructure = nucleicAcidStructure{ + description: "", + energy: math.Inf(1), +} + +// context holds the energy caches, energy maps, sequence, and temperature +// needed in order to compute the folding energy and structures. +type context struct { + energies energies + seq string + pairedMinimumFreeEnergyV [][]nucleicAcidStructure + unpairedMinimumFreeEnergyW [][]nucleicAcidStructure + temp float64 +} + +// newFoldingContext returns a context ready to use, in case of error +// the returned FoldingContext is empty. +func newFoldingContext(seq string, temp float64) (context, error) { + seq = strings.ToUpper(seq) + + // figure out whether it's DNA or rna, choose energy map + var energyMap energies + switch { + case checks.IsDNA(seq): + energyMap = dnaEnergies + case checks.IsRNA(seq): + energyMap = rnaEnergies + default: + return context{}, fmt.Errorf("the sequence %s is not RNA or DNA", seq) + } + + var ( + sequenceLength = len(seq) + vCache = make([][]nucleicAcidStructure, sequenceLength) + wCache = make([][]nucleicAcidStructure, sequenceLength) + row = make([]nucleicAcidStructure, sequenceLength) + ) + for nucleicAcidIndex := 0; nucleicAcidIndex < sequenceLength; nucleicAcidIndex++ { + row[nucleicAcidIndex] = defaultStructure + } + for j := 0; j < sequenceLength; j++ { + vCache[j] = make([]nucleicAcidStructure, sequenceLength) + copy(vCache[j], row) + + wCache[j] = make([]nucleicAcidStructure, sequenceLength) + copy(wCache[j], row) + } + ret := context{ + energies: energyMap, + seq: seq, + pairedMinimumFreeEnergyV: vCache, + unpairedMinimumFreeEnergyW: wCache, + temp: temp + 273.15, // kelvin + } + + // fill the cache + _, err := unpairedMinimumFreeEnergyW(0, sequenceLength-1, ret) + if err != nil { + return context{}, fmt.Errorf("error filling the caches for the FoldingContext: %w", err) + } + return ret, nil + +} + +// Result holds the resulting structures of the folded s +type Result struct { + structs []nucleicAcidStructure +} + +// DotBracket returns the dot-bracket notation of the secondary nucleic acid +// structure resulting from folding a sequence. +// +// Dot-bracket notation, consisting in a balanced parentheses string composed +// by a three-character alphabet {.,(,)}, that can be unambiguously converted +// in the RNA secondary structure. See example_test.go for a small example. +func (r Result) DotBracket() string { + if len(r.structs) == 0 { + return "" + } + lastStructEnd := 0 + for _, structure := range r.structs { + for _, innerSubsequence := range structure.inner { + if innerSubsequence.end > lastStructEnd { + lastStructEnd = innerSubsequence.end + } + } + } + lastStructEnd += 1 + result := make([]byte, lastStructEnd) + for i := range result { + result[i] = '.' + } + for _, structure := range r.structs { + if len(structure.inner) == 1 { + innerSubsequence := structure.inner[0] + result[innerSubsequence.start] = '(' + result[innerSubsequence.end] = ')' + } + } + return string(result) +} + +// MinimumFreeEnergy return just the delta G of the structures resulting from +// folding a sequence. +// +// Returns the minimum free energy of the folded sequence +func (r Result) MinimumFreeEnergy() float64 { + if len(r.structs) == 0 { + // invalid + return math.Inf(1) + } + + summedEnergy := 0.0 + for _, structure := range r.structs { + summedEnergy += structure.energy + } + return summedEnergy +} diff --git a/fold/utils.go b/fold/utils.go new file mode 100644 index 00000000..aef6d92c --- /dev/null +++ b/fold/utils.go @@ -0,0 +1,19 @@ +package fold + +import ( + "golang.org/x/exp/constraints" +) + +func max[T constraints.Ordered](a, b T) T { + if a > b { + return a + } + return b +} + +func abs(x int) int { + if x < 0 { + return -x + } + return x +} diff --git a/go.mod b/go.mod index f0e7feae..0d403922 100644 --- a/go.mod +++ b/go.mod @@ -3,12 +3,13 @@ module github.com/TimothyStiles/poly go 1.18 require ( - github.com/google/go-cmp v0.5.6 + github.com/google/go-cmp v0.5.8 github.com/lunny/log v0.0.0-20160921050905-7887c61bf0de github.com/mitchellh/go-wordwrap v1.0.1 github.com/mroth/weightedrand v0.4.1 github.com/pmezard/go-difflib v1.0.0 github.com/sergi/go-diff v1.2.0 + golang.org/x/exp v0.0.0-20230310171629-522b1b587ee0 lukechampine.com/blake3 v1.1.5 ) @@ -20,6 +21,5 @@ require ( require ( github.com/klauspost/cpuid v1.3.1 // indirect github.com/stretchr/testify v1.4.0 - golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1 // indirect gopkg.in/yaml.v2 v2.4.0 // indirect ) diff --git a/go.sum b/go.sum index b62d294b..16976009 100644 --- a/go.sum +++ b/go.sum @@ -1,8 +1,8 @@ github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= -github.com/google/go-cmp v0.5.6 h1:BKbKCqvP6I+rmFHt06ZmyQtvB8xAkWdhFyr0ZUNZcxQ= -github.com/google/go-cmp v0.5.6/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= +github.com/google/go-cmp v0.5.8 h1:e6P7q2lk1O+qJJb4BtCQXlK8vWEO8V1ZeuEdJNOqZyg= +github.com/google/go-cmp v0.5.8/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= github.com/klauspost/cpuid v1.3.1 h1:5JNjFYYQrZeKRJ0734q51WCEEn2huer72Dc7K+R/b6s= github.com/klauspost/cpuid v1.3.1/go.mod h1:bYW4mA6ZgKPob1/Dlai2LviZJO7KGI3uoWLd42rAQw4= github.com/kr/pretty v0.1.0 h1:L/CwN0zerZDmRFUapSPitk6f+Q3+0za1rQkzVuMiMFI= @@ -25,9 +25,8 @@ github.com/sergi/go-diff v1.2.0/go.mod h1:STckp+ISIX8hZLjrqAeVduY0gWCT9IjLuqbuNX github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/testify v1.4.0 h1:2E4SXV/wtOkTonXsotYi4li6zVWxYlZuYNCXe9XRJyk= github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4= -golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= -golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1 h1:go1bK/D/BFZV2I8cIQd1NKEZ+0owSTG1fDTci4IqFcE= -golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +golang.org/x/exp v0.0.0-20230310171629-522b1b587ee0 h1:LGJsf5LRplCck6jUCH3dBL2dmycNruWNF5xugkSlfXw= +golang.org/x/exp v0.0.0-20230310171629-522b1b587ee0/go.mod h1:CxIveKay+FTh1D0yPZemJVgC/95VzuuOLq5Qi4xnoYc= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15 h1:YR8cESwS4TdDjEe65xsg0ogRM/Nc3DYOhEAlW+xobZo= gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= diff --git a/io/rebase/data/rebase_test.txt b/io/rebase/data/rebase_test.txt index b12cf799..2d311c09 100644 --- a/io/rebase/data/rebase_test.txt +++ b/io/rebase/data/rebase_test.txt @@ -998,4 +998,4 @@ Chertkov, O. et al., (2010) Standards in Genomic Sciences, vol. 2, pp. 280-289. <5>Acidobacterium capsulatum 1421 <6>K. Inagaki <7> -<8>Inagaki, K., Hamafuku, H., Kotani, H., Kishimoto, N., Tano, T., Unpublished observations. +<8>Inagaki, K., Hamafuku, H., Kotani, H., Kishimoto, N., Tano, T., Unpublished observations. \ No newline at end of file diff --git a/random/random.go b/random/random.go index f450a8cc..b0509b6c 100644 --- a/random/random.go +++ b/random/random.go @@ -1,5 +1,5 @@ /* -Package random provides functions to generate random DNA and protein sequences. +Package random provides functions to generate random DNA, RNA, and protein sequences. */ package random @@ -42,15 +42,24 @@ func ProteinSequence(length int, seed int64) (string, error) { // DNASequence returns a random DNA sequence string of a given length and seed. func DNASequence(length int, seed int64) (string, error) { - var nucleicAcidsAlphabet = []rune("ACTG") - alphabetLength := len(nucleicAcidsAlphabet) + return randomNucelotideSequence(length, seed, []rune("ACTG")), nil +} + +// RNASequence returns a random DNA sequence string of a given length and seed. +func RNASequence(length int, seed int64) (string, error) { + + return randomNucelotideSequence(length, seed, []rune("ACUG")), nil +} + +func randomNucelotideSequence(length int, seed int64, alphabet []rune) string { + alphabetLength := len(alphabet) rand.Seed(seed) randomSequence := make([]rune, length) for basepair := range randomSequence { randomIndex := rand.Intn(alphabetLength) - randomSequence[basepair] = nucleicAcidsAlphabet[randomIndex] + randomSequence[basepair] = alphabet[randomIndex] } - return string(randomSequence), nil + return string(randomSequence) } diff --git a/transform/transform.go b/transform/transform.go index 3a5f4ba9..6b5f35f3 100644 --- a/transform/transform.go +++ b/transform/transform.go @@ -88,7 +88,6 @@ var complementTable = [256]byte{ 'R': 'Y', 'S': 'S', 'T': 'A', - 'U': 'A', 'V': 'B', 'W': 'W', 'Y': 'R', @@ -104,8 +103,99 @@ var complementTable = [256]byte{ 'r': 'y', 's': 's', 't': 'a', + 'v': 'b', + 'w': 'w', + 'y': 'r', +} + +// ReverseComplementRNA returns the reversed complement of sequence. +// It is the equivalent of calling +// +// revComplement := Reverse(ComplementRNA(sequence)) +// +// This function expects byte characters in the range a-z and A-Z and +// will not check for non-byte characters, i.e. utf-8 encoding. +func ReverseComplementRNA(sequence string) string { + sequenceLength := len(sequence) + newSequence := make([]byte, sequenceLength) + for index := 0; index < sequenceLength; index++ { + newSequence[index] = complementTableRNA[sequence[sequenceLength-index-1]] + } + // This is how strings.Builder works with the String() method. If Mr. Go says it's safe... + return *(*string)(unsafe.Pointer(&newSequence)) +} + +// ComplementRNA returns the complement of sequence. In [RNA] each nucleotide +// (A, U, C or G) is has a deterministic pair. A is paired with U and +// C is paired with G. The complement of a sequence is formed by exchanging +// these letters for their pair: +// +// 'A' becomes 'U' +// 'U' becomes 'A' +// 'C' becomes 'G' +// 'G' becomes 'C' +// +// This function expects byte characters in the range a-z and A-Z and +// will not check for non-byte characters, i.e. utf-8 encoding. +// +// [RNA]: https://en.wikipedia.org/wiki/RNA +func ComplementRNA(sequence string) string { + sequenceLength := len(sequence) + newSequence := make([]byte, sequenceLength) + for index := 0; index < sequenceLength; index++ { + newSequence[index] = complementTableRNA[sequence[index]] + } + // This is how strings.Builder works with the String() method. If Mr. Go says it's safe... + return *(*string)(unsafe.Pointer(&newSequence)) +} + +// ComplementBaseRNA accepts a RNA base pair and returns its complement base +// pair. See Complement. +// +// This function expects byte characters in the range a-z and A-Z and +// will return a space ' ' (U+0020) for characters that are not matched +// to any known base. This is subject to change. +func ComplementBaseRNA(basePair rune) rune { + got := rune(complementTableRNA[basePair]) + if got == 0 { + return ' ' // invalid sequence returns empty space. + } + return got +} + +// complementTable provides 1:1 mapping between bases and their complements +// see https://www.dnabaser.com/articles/IUPAC%20ambiguity%20codes.html +var complementTableRNA = [256]byte{ + 'A': 'U', + 'B': 'V', + 'C': 'G', + 'D': 'H', + 'G': 'C', + 'H': 'D', + 'K': 'M', + 'M': 'K', + 'N': 'N', + 'R': 'Y', + 'S': 'S', + 'U': 'A', + 'V': 'B', + 'W': 'W', + 'Y': 'R', + 'X': 'X', + 'a': 'u', + 'b': 'v', + 'c': 'g', + 'd': 'h', + 'g': 'c', + 'h': 'd', + 'k': 'm', + 'm': 'k', + 'n': 'n', + 'r': 'y', + 's': 's', 'u': 'a', 'v': 'b', 'w': 'w', 'y': 'r', + 'x': 'x', } diff --git a/transform/transform_test.go b/transform/transform_test.go index 3180765b..9f877f69 100644 --- a/transform/transform_test.go +++ b/transform/transform_test.go @@ -61,14 +61,11 @@ func TestReverseComplement(t *testing.T) { } func TestComplementBase(t *testing.T) { - var letters = [...]byte{'A', 'B', 'C', 'D', 'G', 'H', 'K', 'M', 'N', 'R', 'S', 'T', 'U', 'V', 'W', 'Y', 'a', 'b', 'c', 'd', 'g', 'h', 'k', 'm', 'n', 'r', 's', 't', 'u', 'v', 'w', 'y'} + var letters = [...]byte{'A', 'B', 'C', 'D', 'G', 'H', 'K', 'M', 'N', 'R', 'S', 'T', 'V', 'W', 'Y', 'a', 'b', 'c', 'd', 'g', 'h', 'k', 'm', 'n', 'r', 's', 't', 'v', 'w', 'y'} for _, c := range letters { got := ComplementBase(rune(c)) gotI := ComplementBase(got) gotII := ComplementBase(gotI) - if (c == 'U' && got == 'A') || (c == 'u' && got == 'a') { - continue // Edge case: RNA Uracil base-pairs with Adenine. - } if rune(c) != gotI || gotII != got { t.Errorf("complement transform mismatch: %q->%q->%q->%q", c, got, gotI, gotII) } @@ -108,3 +105,62 @@ func BenchmarkReverse(b *testing.B) { } _ = sequence } + +func TestComplementBaseRNA(t *testing.T) { + bases := [...]rune{'A', 'B', 'C', 'D', 'G', 'H', 'K', 'M', 'N', 'R', 'S', 'U', 'V', 'W', 'Y', 'a', 'b', 'c', 'd', 'g', 'h', 'k', 'm', 'n', 'r', 's', 'u', 'v', 'w', 'y'} + for _, c := range bases { + got := ComplementBaseRNA(c) + gotI := ComplementBaseRNA(got) + gotII := ComplementBaseRNA(gotI) + if rune(c) != gotI || gotII != got { + t.Errorf("complement transform mismatch: %q->%q->%q->%q", c, got, gotI, gotII) + } + } +} + +func TestComplementRNANoRandom(t *testing.T) { + t.Skip() + // keys of complementTableRNA plus '!' + sequence := string([]rune{'A', 'B', 'C', 'D', 'G', 'H', 'K', 'M', 'N', 'R', 'S', 'U', 'V', 'W', 'Y', 'a', 'b', 'c', 'd', 'g', 'h', 'k', 'm', 'n', 'r', 's', 'u', 'v', 'w', 'y', '!'}) + + // values of complementTableRNA plus ' ' + complement := string([]rune{'U', 'V', 'G', 'H', 'C', 'D', 'M', 'K', 'N', 'Y', 'S', 'A', 'B', 'W', 'R', 'u', 'v', 'g', 'h', 'c', 'd', 'm', 'k', 'n', 'y', 's', 'a', 'b', 'w', 'r', ' '}) + + result := ComplementRNA(sequence) + if result != complement { + t.Errorf("bad complement: got %q, expect %q", result, complement) + } +} + +func TestComplementRNA(t *testing.T) { + seed := rand.New(rand.NewSource(1)).Int63() + sequence, err := random.RNASequence(20, seed) + if err != nil { + t.Errorf("failed to generate random DNA sequence") + } + for _, testSequence := range []string{sequence} { // loop is unneeded but makes it easier to add more test cases in the future. + complementSequence := ComplementRNA(testSequence) + for index, complement := range complementSequence { + expectedBase := ComplementBaseRNA(rune(testSequence[index])) + if complement != expectedBase { + t.Errorf("bad %q complement: got %q, expect %q", testSequence[index], complement, expectedBase) + } + } + } +} + +func TestReverseComplementRNA(t *testing.T) { + seed := rand.New(rand.NewSource(1)).Int63() + sequence, err := random.RNASequence(20, seed) + if err != nil { + t.Errorf("failed to generate random RNA sequence") + } + evenAndOddLengthSequences := []string{sequence, sequence[:len(sequence)-1]} + for _, testSequence := range evenAndOddLengthSequences { + got := ReverseComplementRNA(testSequence) + expect := Reverse(ComplementRNA(testSequence)) + if got != expect { + t.Errorf("mismatch with individual Reverse and Complement call:\n%q\n%q", got, expect) + } + } +}