-
Notifications
You must be signed in to change notification settings - Fork 1
/
lexer.go
92 lines (83 loc) · 2.35 KB
/
lexer.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
package main
// Lexer provides a flexible way of splitting a string into several parts
// by repeatedly chopping off a prefix that matches a string, a function
// or a set of byte values.
//
// The Next* methods chop off and return the matched portion.
//
// The Skip* methods chop off the matched portion and return whether something
// matched.
//
// PeekByte and TestByteSet look at the next byte without chopping it off.
// They are typically used in switch statements, which don't allow variable
// declarations.
type Lexer struct {
rest string
}
// ByteSet is a subset of all 256 possible byte values.
// It is used for matching byte strings efficiently.
//
// It cannot match Unicode code points individually and is therefore
// usually used with ASCII characters.
type ByteSet struct {
bits [256]bool
}
func NewLexer(text string) *Lexer {
return &Lexer{text}
}
// Rest returns the part of the string that has not yet been chopped off.
func (l *Lexer) Rest() string { return l.rest }
// SkipHspace chops off the longest prefix (possibly empty) consisting
// solely of horizontal whitespace, which is the ASCII space (U+0020)
// and tab (U+0009).
func (l *Lexer) SkipHspace() bool {
i := 0
rest := l.rest
for i < len(rest) && (rest[i] == ' ' || rest[i] == '\t') {
i++
}
if i > 0 {
l.rest = rest[i:]
return true
}
return false
}
// NextBytesSet chops off the longest prefix (possibly empty) consisting
// solely of bytes from the given set.
func (l *Lexer) NextBytesSet(bytes *ByteSet) string {
i := 0
rest := l.rest
for i < len(rest) && bytes.Contains(rest[i]) {
i++
}
if i != 0 {
l.rest = rest[i:]
}
return rest[:i]
}
// NewByteSet creates a bit mask out of a string like "0-9A-Za-z_".
// To add an actual hyphen to the bit mask, write it either at the beginning
// or at the end of the string, or directly after a range like "a-z".
//
// The bit mask can be used with Lexer.NextBytesSet.
func NewByteSet(chars string) *ByteSet {
var set ByteSet
i := 0
for i < len(chars) {
switch {
case i+2 < len(chars) && chars[i+1] == '-':
min := uint(chars[i])
max := uint(chars[i+2]) // inclusive
for c := min; c <= max; c++ {
set.bits[c] = true
}
i += 3
default:
set.bits[chars[i]] = true
i++
}
}
return &set
}
// Contains tests whether the byte set contains the given byte.
func (bs *ByteSet) Contains(b byte) bool { return bs.bits[b] }