-
Notifications
You must be signed in to change notification settings - Fork 1
/
tokenize.go
71 lines (63 loc) · 1.55 KB
/
tokenize.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
package nlp
import (
"strings"
)
// NGram splits s into overlapping character n-grams, where a "character" is
// one UTF-8 encoded rune. Like the rest of this function's contract, it
// assumes that s only contains valid UTF-8 letters.
//
// Special cases:
//   - n <= 0 yields an empty, non-nil slice.
//   - If s holds n runes or fewer (including the empty string), the result is
//     a single-element slice containing s itself.
//
// Every returned n-gram is a substring of s, so no string data is copied.
func NGram(n int, s string) []string {
	if n <= 0 {
		return []string{}
	}

	// Upper bound on the number of n-grams, computed in bytes. With
	// multi-byte runes the real count is smaller, so the slice is trimmed
	// before it is returned.
	maxGrams := len(s) - n + 1
	if maxGrams <= 0 { // fewer bytes than n: s cannot hold n runes
		return []string{s}
	}
	grams := make([]string, maxGrams)

	// starts is a ring buffer holding the byte offsets at which the most
	// recent n runes begin; slot seen%n always refers to the oldest entry.
	starts := make([]int, n)
	seen := 0  // runes visited so far
	count := 0 // n-grams written so far

	for pos := range s {
		slot := seen % n
		if seen >= n {
			// The rune starting at pos ends the window that began at the
			// oldest remembered offset. Direct indexing is used on purpose
			// here; the original author found append slower.
			grams[count] = s[starts[slot]:pos]
			count++
		}
		starts[slot] = pos
		seen++
	}

	// Nothing was emitted: s has at most n runes even though it has at least
	// n bytes, which happens with multi-byte characters.
	if count == 0 {
		return []string{s}
	}

	// The final window runs to the end of the string.
	grams[count] = s[starts[seen%n]:]
	return grams[:count+1]
}
// WordNGram builds word n-grams from words: each result element is n
// consecutive words joined with sep, which may be any string.
//
// Special cases:
//   - n <= 0 yields an empty, non-nil slice.
//   - If words has n elements or fewer, the result is a single-element slice
//     holding all of words joined with sep.
func WordNGram(n int, words []string, sep string) []string {
	if n <= 0 {
		return []string{}
	}

	// Number of length-n windows that fit into words.
	windows := len(words) - n + 1
	if windows <= 1 {
		return []string{strings.Join(words, sep)}
	}

	grams := make([]string, windows)
	for start := range grams {
		grams[start] = strings.Join(words[start:start+n], sep)
	}
	return grams
}
// RemoveEmptyWord returns a new slice holding the non-empty strings of a in
// their original order. The input slice is not modified, and the result is
// never nil.
func RemoveEmptyWord(a []string) []string {
	// Reserve room for the worst case, in which nothing is filtered out.
	kept := make([]string, 0, len(a))
	for _, word := range a {
		if word == "" {
			continue
		}
		kept = append(kept, word)
	}
	return kept
}