Skip to content

Commit

Permalink
fix: token representation
Browse files Browse the repository at this point in the history
  • Loading branch information
muktihari committed Jun 16, 2024
1 parent 2d79d59 commit 79f6653
Show file tree
Hide file tree
Showing 7 changed files with 240 additions and 148 deletions.
2 changes: 1 addition & 1 deletion testdata/copyright_header.xml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
<!--
<!--
Copyright 2024 Example Licence Authors.
-->
<?xml version="1.0" encoding="UTF-8"?>
14 changes: 14 additions & 0 deletions testdata/dtd.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE note [
<!ENTITY nbsp "&#xA0;">
<!ENTITY writer "Writer: Donald Duck.">
<!ENTITY copyright "Copyright: W3Schools.">
]>

<note>
<to>Tove</to>
<from>Jani</from>
<heading>Reminder</heading>
<body>Don't forget me this weekend!</body>
<footer>&writer;&nbsp;&copyright;</footer>
</note>
3 changes: 3 additions & 0 deletions testdata/self_closing.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
<?xml version="1.0" encoding="UTF-8"?>
<a />
<b/>
14 changes: 9 additions & 5 deletions token.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,15 +14,19 @@ func PutToken(t *Token) { pool.Put(t) }
// - <?xml version="1.0" encoding="UTF-8"?>
// - <name attr="value" attr="value">
// - <name attr="value" attr="value">CharData
// - <name attr="value" attr="value"><![CDATA[ CharData ]]>
// - <name attr="value" attr="value"/>
// - </name>
// - <!-- comment -->
// - <![CDATA[ some text ]]>
// - <!-- a comment -->
// - <!DOCTYPE library [
// <!ELEMENT library (book+)>
// <!ELEMENT book (title, author, year)>
// ]>
type Token struct {
Name Name // Name: ProcInst: "?xml", StartElement: "name", EndElement: "/name", Comment: "<!--", CDATA: "<![[CDATA", etc.
Name Name // Name can be a StartElement: "name", an EndElement: "/name", or empty when a tag starts with "<?" or "<!" (except "<![CDATA").
Attrs []Attr // Attrs exist when len(Attrs) > 0.
Data []byte // Data could be a CharData, a comment, a CDATA, etc. Depends on identifier in Name.
SelfClosing bool // True when tag end with "/>"" e.g. <c r="E3" s="1" /> instead of <c r="E3" s="1"></c>
Data []byte // Data could be a CharData or a CDATA, or maybe a RawToken if a tag starts with "<?" or "<!" (except "<![CDATA").
SelfClosing bool // True when a tag ends with "/>" e.g. <c r="E3" s="1" />. Also true when a tag starts with "<?" or "<!" (except "<![CDATA").
}

// IsEndElement checks whether the given token represents an end element (closing tag),
Expand Down
217 changes: 137 additions & 80 deletions tokenizer.go
Original file line number Diff line number Diff line change
Expand Up @@ -99,26 +99,27 @@ func (t *Tokenizer) Reset(r io.Reader, opts ...Option) {
t.options.autoGrowBufferMaxLimitSize = t.options.readBufferSize
}

switch {
case cap(t.buf) >= t.options.readBufferSize+defaultReadBufferSize:
t.buf = t.buf[:t.options.readBufferSize:cap(t.buf)]
switch size := t.options.readBufferSize; {
case cap(t.buf) >= size+defaultReadBufferSize:
t.buf = t.buf[:size:cap(t.buf)]
default:
t.buf = nil // Directs the Tokenizer to re-alloc.
// Create buffer with additional cap since we need to memmove remaining bytes
t.buf = make([]byte, size, size+defaultReadBufferSize)
}
}

// Token returns either a valid token or an error.
// The returned token is only valid until the next
// Token or RawToken method invocation.
func (t *Tokenizer) Token() (Token, error) {
func (t *Tokenizer) Token() (token Token, err error) {
if t.err != nil {
return Token{}, t.err
return token, t.err
}

b, err := t.RawToken()
if err != nil {
if !errors.Is(err, io.EOF) {
return Token{}, err
return token, err
}
t.err = io.EOF
}
Expand All @@ -129,10 +130,18 @@ func (t *Tokenizer) Token() (Token, error) {
if len(b) > 0 {
b = t.consumeTagName(b)
b = t.consumeAttrs(b)
t.token.Data = trim(b)
t.consumeCharData(b)
}

return t.token, nil
token = t.token
if len(token.Attrs) == 0 {
token.Attrs = nil
}
if len(token.Data) == 0 {
token.Data = nil
}

return token, nil
}

// RawToken returns token in its raw bytes. At the end,
Expand All @@ -148,34 +157,92 @@ func (t *Tokenizer) RawToken() (b []byte, err error) {
var off int
for {
if pos >= t.last {
t.memmoveRemainingBytes(off)
pos, off = t.last, 0
off, pos = t.memmoveRemainingBytes(off)
if err = t.manageBuffer(); err != nil {
return nil, err
t.err = err
return t.buf[off:pos], err
}
}
switch t.buf[pos] {
case '<':
off = pos

// Check if tag represents Document Type Definition (DTD)
const prefix, _ = "<!DOCTYPE", "]>"
dtdOff := 0
var k int = 1
for i := pos + 1; ; i++ {
if i >= t.last {
prevLast := t.last
off, i = t.memmoveRemainingBytes(off)
dtdOff = dtdOff - (prevLast - t.last)
if err = t.manageBuffer(); err != nil {
t.err = err
break
}
}
if k < len(prefix) {
if t.buf[i] != prefix[k] {
k = 0
break
}
k++
continue
}
switch t.buf[i] {
case ']':
dtdOff = i
case '>':
if dtdOff == i-1 && t.buf[dtdOff] == ']' {
buf := trim(t.buf[off : i+1 : cap(t.buf)])
t.cur = i + 1
return buf, err
}
}
}
case '>':
loop:
// If next char represent CharData, include it.
// If next char represents CharData, include it.
for i := pos + 1; ; i++ {
if i >= t.last {
t.memmoveRemainingBytes(off)
i, pos, off = t.last, 0, 0
off, i = t.memmoveRemainingBytes(off)
pos = i - 1
if err = t.manageBuffer(); err != nil {
pos = i
t.err = err
break
}
}
if t.buf[i] == '<' {
pos = i
break loop
pos = i - 1
// Might be in the form of <![CDATA[ CharData ]]>
const prefix = "<![CDATA["
var k int = 1
for j := i + 1; ; j++ {
if j >= t.last {
prevLast := t.last
off, j = t.memmoveRemainingBytes(off)
pos = pos - (prevLast - t.last)
if err = t.manageBuffer(); err != nil {
t.err = err
break
}
}
if k < len(prefix) {
if t.buf[j] != prefix[k] {
break
}
k++
continue
}
if t.buf[j] == '>' {
pos = j
break
}
}
break
}
}
buf := trim(t.buf[off:pos:cap(t.buf)])
t.cur = pos
buf := trim(t.buf[off : pos+1 : cap(t.buf)])
t.cur = pos + 1
return buf, err
}
pos++
Expand All @@ -191,76 +258,47 @@ func (t *Tokenizer) clearToken() {
t.token.SelfClosing = false
}

func (t *Tokenizer) memmoveRemainingBytes(off int) {
// memmoveRemainingBytes shifts the bytes of t.buf starting at off to the
// front of the buffer so the space behind them can be reused.
//
// When off is zero there is nothing to shift and the current t.cur and
// t.last are returned untouched. Otherwise t.buf is resliced to the number
// of bytes copied (capacity is kept), t.cur is reset to 0 and t.last is set
// to the copied length; the updated pair is returned.
func (t *Tokenizer) memmoveRemainingBytes(off int) (cur, last int) {
	if off != 0 {
		kept := copy(t.buf, t.buf[off:])
		t.buf = t.buf[:kept:cap(t.buf)]
		t.cur = 0
		t.last = kept
	}
	return t.cur, t.last
}

func (t *Tokenizer) manageBuffer() error {
var start, end int
bufferSize := t.options.readBufferSize
switch {
case t.buf == nil:
// Create buffer with additional cap in case we need to memmove remaining bytes
t.buf = make([]byte, bufferSize, bufferSize+defaultReadBufferSize)
end = bufferSize
case t.last+bufferSize <= cap(t.buf):
// Grow by reslice
t.buf = t.buf[: t.last+bufferSize : cap(t.buf)]
start, end = t.last, t.last+bufferSize
default:
if t.last+bufferSize > t.options.autoGrowBufferMaxLimitSize {
return fmt.Errorf("could not grow buffer, max limit is set to %d: %w",
t.options.autoGrowBufferMaxLimitSize, errAutoGrowBufferExceedMaxLimit)
switch growSize := t.last + t.options.readBufferSize; {
case growSize <= cap(t.buf): // Grow by reslice
t.buf = t.buf[:growSize:cap(t.buf)]
start, end = t.last, growSize
default: // Grow by make new alloc
if growSize > t.options.autoGrowBufferMaxLimitSize {
return fmt.Errorf("could not grow buffer to %d, max limit is set to %d: %w",
growSize, t.options.autoGrowBufferMaxLimitSize, errAutoGrowBufferExceedMaxLimit)
}
// Grow by make new alloc
buf := make([]byte, t.last+bufferSize)
buf := make([]byte, growSize)
n := copy(buf, t.buf)
t.buf = buf
start, end = n, cap(t.buf)
}

n, err := io.ReadAtLeast(t.r, t.buf[start:end], 1)
if err != nil {
return err
}
t.buf = t.buf[: start+n : cap(t.buf)]
t.last = len(t.buf)

return nil
return err
}

// consumeNonTagIdentifier consumes identifier starts with "<!": maybe a comment, maybe a CDATA, etc.
// consumeNonTagIdentifier consumes an identifier that starts with "<?" or "<!" and keeps it as raw data.
func (t *Tokenizer) consumeNonTagIdentifier(b []byte) []byte {
if len(b) < 2 || string(b[:2]) != "<!" {
if len(b) < 2 || (string(b[:2]) != "<?" && string(b[:2]) != "<!") {
return b
}

var start int
for i := range b {
if b[i] == ' ' {
break
}
start++
}

// Identifier <!-- , <![CDATA[ , etc.
t.token.Name.Local = b[:start]
t.token.Name.Full = b[:start]

var end int = len(b)
for i := len(b) - 1; i >= 0; i-- {
switch b[i] {
case '>', '-', ']':
end--
continue
}
break
}

t.token.Data = trim(b[start:end])

t.token.Data = b
t.token.SelfClosing = true
return nil
}

Expand All @@ -275,9 +313,12 @@ func (t *Tokenizer) consumeTagName(b []byte) []byte {
t.token.Name.Space = trim(b[pos:i])
pos = i + 1
case '>', ' ': // e.g. <gpx>, <trkpt lat="-7.1872750" lon="110.3450230">
if b[i] == '>' && b[i-1] == '/' { // In case we encounter <name/>
i--
}
t.token.Name.Local = trim(b[pos:i])
t.token.Name.Full = trim(b[fullpos:i])
return b[i+1:]
return b[i:]
}
}
return b
Expand Down Expand Up @@ -312,19 +353,35 @@ func (t *Tokenizer) consumeAttrs(b []byte) []byte {
pos = i + 1
fullpos = i + 1
}
case '/':
t.token.SelfClosing = true
case '>':
if b[i-1] == '/' {
t.token.SelfClosing = true
}
return b[i+1:]
}
}
return b
}

// consumeCharData stores the character data held in b into t.token.Data.
//
// b is first passed through trimPrefix. A leading "<![CDATA[" marker and a
// trailing "]]>" marker are then each removed when present (the two checks
// are independent of one another), and what is left is passed through trim
// before being assigned.
func (t *Tokenizer) consumeCharData(b []byte) {
	const (
		cdataOpen  = "<![CDATA["
		cdataClose = "]]>"
	)

	data := trimPrefix(b)
	if len(data) >= len(cdataOpen) && string(data[:len(cdataOpen)]) == cdataOpen {
		data = data[len(cdataOpen):]
	}
	if n := len(data); n >= len(cdataClose) && string(data[n-len(cdataClose):]) == cdataClose {
		data = data[:n-len(cdataClose)]
	}

	t.token.Data = trim(data)
}

func trim(b []byte) []byte {
b = trimPrefix(b)
b = trimSuffix(b)
return b
}

func trimPrefix(b []byte) []byte {
var start int
start:
for i := range b {
switch b[i] {
case '\r':
Expand All @@ -334,13 +391,14 @@ start:
case '\n', ' ':
start++
default:
break start
return b[start:]
}
}
b = b[start:]
return b
}

func trimSuffix(b []byte) []byte {
var end int = len(b)
end:
for i := len(b) - 1; i >= 0; i-- {
switch b[i] {
case '\n':
Expand All @@ -351,9 +409,8 @@ end:
case ' ':
end--
default:
break end
return b[:end]
}
}

return b[:end]
return b
}
Loading

0 comments on commit 79f6653

Please sign in to comment.