-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathspellchebk.go
166 lines (136 loc) · 4.22 KB
/
spellchebk.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
package spellchebk
import (
"fmt"
"sync"
)
// A BKtree is an n-ary BK-tree.
type BKtree struct {
Word string `json:"word"`
Children []*BKtree `json:"children"`
// Distance from parent to this node
Distance int `json:"distance"`
}
// A SearchResult is one of possibly many results found during a search.
type SearchResult struct {
Word string `json:"word"`
Distance int `json:"distance"`
}
// A DistFunc returns an integer of how different two string are.
type DistFunc func(first, second string) (distance int)
// A SpellChecker builds and queries a spell checker to find similar words.
//
// It uses a string distance function to build an internal BK-tree.
// The default distance function is the True Damerau–Levenshtein distance which
// this package implements as TrueDLDistance.
// SpellCheckers are safe to use across threads.
type SpellChecker struct {
mu sync.Mutex
Tree *BKtree
// DistanceFunc will be used to find the distance between two words.
// For every word added or query, this will likely be run several times.
DistanceFunc DistFunc
}
// NewSpellChecker returns a new BKtree with the initial node root.
func NewSpellChecker() *SpellChecker {
return &SpellChecker{
DistanceFunc: TrueDLDistance,
Tree: &BKtree{},
}
}
// Add inserts a word into the spell checker.
// It is safe to be used across threads.
func (s *SpellChecker) Add(input string) (err error) {
if input == "" {
return fmt.Errorf("attempted to add empty string")
}
s.mu.Lock()
defer s.mu.Unlock()
return s.Tree.add(input, s.DistanceFunc)
}
// add recursively traverses a BKtree to find a suitable spot for the given input.
// When it reaches across an empty string, the input will be placed there.
// For that reason (and from a practicallity standpoint), an empty string cannot be added to the tree.
func (b *BKtree) add(input string, df DistFunc) (err error) {
if b.Word == "" {
b.Word = input
return err
}
dist := df(b.Word, input)
if dist == 0 {
// Distance == 0 implies that the word is already in the tree
return fmt.Errorf("word %s already exists in spell checker", input)
}
for i := range b.Children {
if b.Children[i].Distance == dist {
return b.Children[i].add(input, df)
}
}
b.Children = append(b.Children, &BKtree{Word: input, Distance: dist})
return err
}
// Search queries the tree and returns SearchResult structs of matching words.
func (s *SpellChecker) Search(query string, tolerance int) (found []SearchResult) {
return s.Tree.search(query, tolerance, s.DistanceFunc)
}
// search recursively traverses a BKtree.
// It returns a slice of SearchResults that fall within the given tolerance.
func (b *BKtree) search(query string, tolerance int, df DistFunc) (found []SearchResult) {
dist := df(b.Word, query)
if dist <= tolerance {
found = append(found, SearchResult{Word: b.Word, Distance: dist})
}
for i := range b.Children {
if b.Children[i].Distance >= dist-tolerance && b.Children[i].Distance <= dist+tolerance {
found = append(found, b.Children[i].search(query, tolerance, df)...)
}
}
return found
}
// min returns the minimun of two ints.
func min(x, y int) int {
if x < y {
return x
}
return y
}
// TrueDLDistance finds the True Damerau–Levenshtein distance between two words.
// It's similar to the Damerau-Levenshtein distance but it also accounts for adjacent transpositions.
// Importantly, it also needs to know the size of the input alphabet.
func TrueDLDistance(first string, second string) int {
// Size of our alphabet
var da [255]int
d := make([][]int, len(first)+2)
for i := range d {
d[i] = make([]int, len(second)+2)
}
max := len(first) + len(second)
d[0][0] = max
for i := 0; i <= len(first); i++ {
d[i+1][0] = max
d[i+1][1] = i
}
for i := 0; i <= len(second); i++ {
d[0][i+1] = max
d[1][i+1] = i
}
for i := 1; i <= len(first); i++ {
db := 0
for j := 1; j <= len(second); j++ {
k := da[second[j-1]]
l := db
cost := 0
if first[i-1] == second[j-1] {
db = j
} else {
cost = 1
}
sub := d[i][j] + cost
ins := d[i+1][j] + 1
del := d[i][j+1] + 1
trans := d[k][l] + (i - k - 1) + 1 + (j - l - 1)
d[i+1][j+1] = min(sub, min(ins, min(del, trans)))
}
da[first[i-1]] = i
}
return d[len(first)+1][len(second)+1]
}