forked from jf-tech/omniparser
-
Notifications
You must be signed in to change notification settings - Fork 0
/
xmlreader.go
229 lines (216 loc) · 8.35 KB
/
xmlreader.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
package idr
import (
"encoding/xml"
"fmt"
"io"
"reflect"
"strings"
"github.com/antchfx/xpath"
"github.com/jf-tech/go-corelib/caches"
"golang.org/x/net/html/charset"
)
// XMLStreamReader reads XML tokens from an xml.Decoder and incrementally builds
// a *Node tree, handing back nodes selected by an xpath one at a time.
type XMLStreamReader struct {
	// d is the token source; parse pulls one token per loop iteration from it.
	d *xml.Decoder
	// space2prefix maps a namespace URI to the prefix declared for it. The
	// default namespace is stored with an empty prefix.
	space2prefix map[string]string
	// xpathExpr is the streaming xpath with its last filter removed; it is used
	// to spot a stream candidate as soon as its start element has been read.
	xpathExpr *xpath.Expr
	// xpathFilterExpr is the complete streaming xpath, or nil when the xpath has
	// no trailing filter. It is checked once a candidate is fully constructed.
	xpathFilterExpr *xpath.Expr
	// root is the document node at the top of the tree being built.
	root *Node
	// cur is the node new children are currently attached to.
	cur *Node
	// stream is the current stream candidate/target, or nil if there is none.
	stream *Node
	// err remembers the error of the last parse so later Read calls repeat it.
	err error
}
// streamCandidateCheck records sp.cur as the stream candidate when no candidate
// is tracked yet and MatchAny(sp.root, sp.xpathExpr) reports a match. It does
// nothing if sp.xpathExpr is nil or a candidate already exists.
// The reasoning behind this check is explained in
// JSONStreamReader.streamCandidateCheck.
func (sp *XMLStreamReader) streamCandidateCheck() {
	if sp.xpathExpr == nil || sp.stream != nil {
		return
	}
	if MatchAny(sp.root, sp.xpathExpr) {
		sp.stream = sp.cur
	}
}
// wrapUpCurAndTargetCheck finishes the node sp.cur points at and decides whether
// that node is a stream target. The finished node is returned as the target when:
// - it is the stream candidate (it equals sp.stream), and
// - there is no filter xpath, or the filter xpath matches.
// In every other case nil is returned.
func (sp *XMLStreamReader) wrapUpCurAndTargetCheck() *Node {
	finished := sp.cur
	// Whatever the verdict below, this node is complete, so construction moves
	// back up to its parent.
	sp.cur = finished.Parent
	// Nodes other than the stream candidate never need the target check.
	if finished != sp.stream {
		return nil
	}
	if sp.xpathFilterExpr != nil && !MatchAny(sp.root, sp.xpathFilterExpr) {
		// The candidate was picked by the filter-less xpath, but now that it is
		// fully built the filter xpath does not match, so it is not a target.
		// Drop it from the tree so it cannot cause mismatches for later
		// candidates, and clear sp.stream.
		RemoveAndReleaseTree(sp.stream)
		sp.stream = nil
		return nil
	}
	return sp.stream
}
// updateNamespaces scans the attributes of a start element for namespace
// declarations and records them in sp.space2prefix (namespace URI -> prefix).
//
// Two attribute shapes are treated as declarations:
//   - `xmlns="uri"`: Name.Space is empty and Name.Local is "xmlns"; the URI is
//     registered with an empty prefix (default namespace).
//   - `xmlns:pfx="uri"`: Name.Space is "xmlns" and Name.Local is "pfx"; the URI
//     is registered with prefix "pfx".
//
// All other attributes are ignored. In particular a namespaced attribute whose
// local name merely happens to be "xmlns" (e.g. `foo:xmlns="v"`) is NOT a
// declaration; requiring an empty Name.Space for the default-namespace form
// keeps such attributes from adding a bogus entry to the map.
//
// See https://www.w3.org/TR/xml-names/#scoping-defaulting
func (sp *XMLStreamReader) updateNamespaces(attrs []xml.Attr) {
	for _, attr := range attrs {
		switch {
		case attr.Name.Space == "" && attr.Name.Local == "xmlns":
			sp.space2prefix[attr.Value] = ""
		case attr.Name.Space == "xmlns":
			sp.space2prefix[attr.Value] = attr.Name.Local
		}
	}
}
// addNonTextChild builds an XML node of type ntype from tok (which must hold an
// xml.Name) and appends it as a child of sp.cur. On success sp.cur is moved to
// the new child. An error is returned, and nothing is added, when the name
// carries a namespace that has no known prefix.
func (sp *XMLStreamReader) addNonTextChild(ntype NodeType, tok interface{}) error {
	name := tok.(xml.Name)
	var specific XMLSpecific
	// When a default namespace is declared, e.g. `<root xmlns="uri://blah">`, the
	// xml.Name reported by xml.Decoder for an element like `<abc>` is:
	//   xml.Name.Space = "uri://blah"
	//   xml.Name.Local = "abc"
	// so the prefix has to be looked up from the URI. With no namespace declared
	// at all, xml.Name.Space is empty and no lookup is required.
	if uri := name.Space; uri != "" {
		prefix, known := sp.space2prefix[uri]
		switch {
		case known:
			// Declared namespace: use the URI and the prefix found for it.
		case ntype == AttributeNode && uri == "xmlns":
			// Attribute names coming out of xml.Decoder fall in two groups:
			// 1. regular attributes, which follow the same rule as elements:
			//    xml.Name.Space is the full namespace URI (if any) and
			//    xml.Name.Local is the attribute name.
			// 2. namespace declaration attributes, typically found on the root:
			//      <lb0:library xmlns:lb0="uri://xyz">
			//         ...
			//      </lb0:library>
			//    which declare that prefix 'lb0' stands for URI 'uri://xyz' in the
			//    rest of the document. For 'xmlns:lb0' xml.Decoder reports:
			//      xml.Name.Space = "xmlns"
			//      xml.Name.Local = "lb0"
			// Group 2 is the one situation where "xmlns" has no prefix entry, so
			// "xmlns" itself is kept as the prefix and the URI is left empty.
			prefix, uri = uri, ""
		default:
			// Anything else with an unmapped namespace is an undeclared
			// namespace prefix, which is fatal.
			return fmt.Errorf("unknown namespace '%s' on %s '%s'", uri, ntype, name.Local)
		}
		specific.NamespaceURI = uri
		specific.NamespacePrefix = prefix
	}
	node := CreateXMLNode(ntype, name.Local, specific)
	AddChild(sp.cur, node)
	sp.cur = node
	return nil
}
// addTextChild appends a TextNode holding text as a child of sp.cur.
// Since nothing is ever added beneath a TextNode, sp.cur is left where it is
// (unlike addNonTextChild, which moves sp.cur to the new child).
func (sp *XMLStreamReader) addTextChild(text string) {
	AddChild(sp.cur, CreateXMLNode(TextNode, text, XMLSpecific{}))
}
// parse consumes decoder tokens, growing the node tree, until an end element
// completes a stream target, which is then returned. Any decoder error
// (io.EOF included) or namespace error is returned with a nil node.
// Token kinds other than start element, end element and character data are
// skipped.
func (sp *XMLStreamReader) parse() (*Node, error) {
	for {
		token, err := sp.d.Token()
		if err != nil {
			// io.EOF ends up here as well.
			return nil, err
		}
		switch t := token.(type) {
		case xml.CharData:
			sp.addTextChild(string(t))
		case xml.EndElement:
			if target := sp.wrapUpCurAndTargetCheck(); target != nil {
				return target, nil
			}
		case xml.StartElement:
			// Namespace declarations on this element must be known before the
			// element and its attributes are resolved.
			sp.updateNamespaces(t.Attr)
			if err := sp.addNonTextChild(ElementNode, t.Name); err != nil {
				return nil, err
			}
			for i := range t.Attr {
				if err := sp.addNonTextChild(AttributeNode, t.Attr[i].Name); err != nil {
					return nil, err
				}
				sp.addTextChild(t.Attr[i].Value)
				// addNonTextChild moved sp.cur onto the attribute node while
				// addTextChild left it there. The attribute and its text are
				// complete, so step back up to the element.
				sp.cur = sp.cur.Parent
			}
			sp.streamCandidateCheck()
		}
	}
}
// Read returns the next *Node that satisfies the xpath streaming criteria.
// Errors are sticky: once a Read has failed (io.EOF included), every later
// call returns that same error without touching the decoder.
func (sp *XMLStreamReader) Read() (n *Node, err error) {
	if err = sp.err; err != nil {
		return nil, err
	}
	// This is a streaming read, so the previously returned stream node has to be
	// detached from the tree to free memory. If the caller already called
	// Release() then sp.stream is nil; this covers the case where it did not.
	if prev := sp.stream; prev != nil {
		RemoveAndReleaseTree(prev)
		sp.stream = nil
	}
	n, err = sp.parse()
	sp.err = err
	return n, err
}
// Release removes and releases n, a *Node previously returned by Read(),
// together with its subtree. Calling Release is optional: the next Read() call
// releases the current streaming target node anyway.
func (sp *XMLStreamReader) Release(n *Node) {
	isCurrentTarget := sp.stream == n
	if isCurrentTarget {
		// Forget the target so the next Read() does not release it again.
		sp.stream = nil
	}
	RemoveAndReleaseTree(n)
}
// AtLine returns the **rough** line number the XML decoder is currently at.
// The value is read via reflection from the decoder's unexported "line" field.
func (sp *XMLStreamReader) AtLine() int {
	// Library versions are pinned through go modules, so relying on the
	// unexported field is acceptable. Should a future upgrade change it, test
	// failures are expected to flag the breakage.
	decoder := reflect.ValueOf(sp.d).Elem()
	line := decoder.FieldByName("line")
	return int(line.Int())
}
// NewXMLStreamReader creates an XML streaming reader over r.
// xpathStr (trimmed of surrounding whitespace) selects the nodes Read returns;
// an error is returned if it cannot be compiled.
func NewXMLStreamReader(r io.Reader, xpathStr string) (*XMLStreamReader, error) {
	xpathStr = strings.TrimSpace(xpathStr)
	noFilterStr := removeLastFilterInXPath(xpathStr)
	fullExpr, err := caches.GetXPathExpr(xpathStr)
	if err != nil {
		return nil, fmt.Errorf("invalid xpath '%s', err: %s", xpathStr, err.Error())
	}
	// A valid xpath stays valid once its last filter is stripped, hence the
	// error is deliberately not checked here.
	noFilterExpr, _ := caches.GetXPathExpr(noFilterStr)

	// The full xpath only serves as a filter when it actually differs from the
	// filter-less form; otherwise there is nothing extra to check.
	var filterExpr *xpath.Expr
	if noFilterStr != xpathStr {
		filterExpr = fullExpr
	}

	decoder := xml.NewDecoder(r)
	decoder.CharsetReader = charset.NewReaderLabel
	docRoot := CreateXMLNode(DocumentNode, "", XMLSpecific{})

	return &XMLStreamReader{
		d: decoder,
		// http://www.w3.org/XML/1998/namespace is bound by definition to the
		// prefix xml.
		space2prefix: map[string]string{
			"http://www.w3.org/XML/1998/namespace": "xml",
		},
		xpathExpr:       noFilterExpr,
		xpathFilterExpr: filterExpr,
		root:            docRoot,
		cur:             docRoot,
	}, nil
}