Skip to content

Commit

Permalink
internal/testing/htmlcheck: replace cascadia for htmlcheck.NotIn
Browse files Browse the repository at this point in the history
This change reimplements a subset of cascadia's query functionality so
we can replace it for htmlcheck.NotIn. It adds code that parses a
subset of the css selector syntax (currently it's more than we need
for NotIn but not enough for In) and adds a query function to query
the selector in the given html node. Unlike cascadia, our query
function can match the node itself.

Future CLs will add support for more selector syntax so we can support
everything htmlcheck.In is used for, but we intend to support only
what we need. I also limit the selector syntax to ASCII for
simplicity.

For #61399

Change-Id: Ia03cc8a9ab42ae11d445a650e1ca0a07ee8a391f
Reviewed-on: https://go-review.googlesource.com/c/pkgsite/+/541437
LUCI-TryBot-Result: Go LUCI <[email protected]>
kokoro-CI: kokoro <[email protected]>
Reviewed-by: Jonathan Amsterdam <[email protected]>
  • Loading branch information
matloob committed Nov 15, 2023
1 parent 545ce2a commit a7a0e8d
Show file tree
Hide file tree
Showing 3 changed files with 469 additions and 4 deletions.
18 changes: 14 additions & 4 deletions internal/testing/htmlcheck/htmlcheck.go
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ func Run(reader io.Reader, checker Checker) error {
//
// A nil Checker is valid and always succeeds.
func In(selector string, checkers ...Checker) Checker {
sel := mustParseSelector(selector)
sel := mustParseCascadiaSelector(selector)
return func(n *html.Node) error {
var m *html.Node
// cascadia.Query does not test against its argument node.
Expand All @@ -64,7 +64,7 @@ func In(selector string, checkers ...Checker) Checker {
func NotIn(selector string) Checker {
sel := mustParseSelector(selector)
return func(n *html.Node) error {
if sel.Match(n) || cascadia.Query(n, sel) != nil {
if query(n, sel) != nil {
return fmt.Errorf("%q matched one or more elements", selector)
}
return nil
Expand All @@ -84,9 +84,9 @@ func check(n *html.Node, Checkers []Checker) error {
return nil
}

// mustParseSelector parses the given CSS selector. An empty string
// mustParseCascadiaSelector parses the given CSS selector. An empty string
// is treated as "*" (match everything).
func mustParseSelector(s string) cascadia.Sel {
func mustParseCascadiaSelector(s string) cascadia.Sel {
if s == "" {
s = "*"
}
Expand All @@ -97,6 +97,16 @@ func mustParseSelector(s string) cascadia.Sel {
return sel
}

// mustParseSelector converts the CSS selector string s into a *selector
// by calling parse. It panics, quoting the offending selector text, if
// parse reports an error. An empty string is treated as matching
// everything.
func mustParseSelector(s string) *selector {
	parsed, err := parse(s)
	if err != nil {
		msg := fmt.Sprintf("parsing %q: %v", s, err)
		panic(msg)
	}
	return parsed
}

// HasText returns a Checker that checks whether the given regexp matches the node's text.
// The text of a node n is the concatenated contents of all text nodes in n's subtree.
// HasText panics if the argument doesn't compile.
Expand Down
270 changes: 270 additions & 0 deletions internal/testing/htmlcheck/query.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,270 @@
// Copyright 2023 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package htmlcheck

import (
"errors"
"fmt"
"strings"

"golang.org/x/net/html"
)

// A selector represents a parsed css selector that can be used in a query.
// The atoms all match against a given element and next matches against
// descendants of that element (query searches the whole subtree below the
// matched element, not only its direct children). So, for example "div#id a"
// parses into a selector that has atoms for matching the div and the id and
// a next that points to another selector that has an atom for "a".
type selector struct {
// atoms must all match the same node for this selector to match it.
// An empty atoms list matches every node.
atoms []selectorAtom
// next, if non-nil, is the selector that must match somewhere below
// the node matched by atoms. A nil next ends the chain.
next *selector
}

// String returns a string used for debugging test failures.
//
// Each selector in the chain is rendered as its atoms in %#v form,
// comma-separated inside square brackets and followed by "->". The chain
// is terminated by "nil", so a nil receiver renders as just "nil" and an
// atom-less selector with no next renders as "[]->nil".
func (s *selector) String() string {
	// A Builder avoids re-copying the accumulated text on every append,
	// and walking the chain iteratively avoids one stack frame per link.
	var b strings.Builder
	for cur := s; cur != nil; cur = cur.next {
		b.WriteByte('[')
		for i, atom := range cur.atoms {
			if i > 0 {
				b.WriteByte(',')
			}
			fmt.Fprintf(&b, "%#v", atom)
		}
		b.WriteString("]->")
	}
	b.WriteString("nil")
	return b.String()
}

// selectorAtom represents a part of a selector that individually
// matches a single element name, id, class, or attribute value.
type selectorAtom interface {
// match reports whether the node n satisfies this atom.
match(n *html.Node) bool
}

// query searches the tree rooted at n, n itself included, for a node that
// satisfies selector and returns the first one found, or nil if there is
// none.
//
// A node satisfies a selector when every atom matches it. If the selector
// has a next link, the node only anchors the search: the result is then
// the first node below it that satisfies the rest of the chain. Nodes are
// visited parent first, then children from first to last sibling.
func query(n *html.Node, selector *selector) *html.Node {
	matched := true
	for i := 0; matched && i < len(selector.atoms); i++ {
		matched = selector.atoms[i].match(n)
	}
	if matched {
		if selector.next == nil {
			return n
		}
		if found := queryChildren(n, selector.next); found != nil {
			return found
		}
	}
	// Either n did not match or nothing below n completed the chain;
	// retry the whole selector on n's children.
	return queryChildren(n, selector)
}

// queryChildren runs query on each child of n in sibling order and returns
// the first non-nil result, or nil if no child's subtree yields a match.
// The node n itself is not tested.
func queryChildren(n *html.Node, selector *selector) *html.Node {
	for c := n.FirstChild; c != nil; c = c.NextSibling {
		if found := query(c, selector); found != nil {
			return found
		}
	}
	return nil
}

// parse parses the string into a selector. It matches the following
// atoms: element, #id, .class, [attribute="value"]. It allows the atoms
// to be combined where they all need to match (for example, a#id) and
// for nested selectors to be combined with a space.
// For simplicity, the selector must not have any non-ASCII bytes.
//
// Spaces at the beginning or end of s are ignored, and a run of spaces
// between two groups of atoms counts as a single separator. An empty (or
// all-space) string yields a selector with no atoms, which matches
// everything.
//
// parse returns an error if s contains a non-ASCII byte, a '.' or '#'
// that is not followed by an identifier, a malformed attribute selector,
// or any character that cannot start an atom.
func parse(s string) (*selector, error) {
	if !isAscii(s) {
		return nil, errors.New("non ascii byte found in selector string")
	}
	// Leading spaces separate nothing. Without this they would wrap the
	// real selector in an atom-less parent, so the queried node itself
	// would never be tested against it.
	s = strings.TrimLeft(s, " ")
	sel := &selector{}
	for len(s) > 0 {
		switch c := s[0]; {
		case isLetter(c):
			ident, rest := consumeIdentifier(s)
			sel.atoms = append(sel.atoms, &elementSelector{ident})
			s = rest
		case c == '.':
			ident, rest := consumeIdentifier(s[1:])
			if len(ident) == 0 {
				return nil, errors.New("no class name after '.'")
			}
			sel.atoms = append(sel.atoms, &classSelector{ident})
			s = rest
		case c == '#':
			ident, rest := consumeIdentifier(s[1:])
			if len(ident) == 0 {
				return nil, errors.New("no id name after '#'")
			}
			sel.atoms = append(sel.atoms, &idSelector{ident})
			s = rest
		case c == '[':
			attr, rest, err := parseAttributeSelector(s)
			if err != nil {
				return nil, err
			}
			sel.atoms = append(sel.atoms, attr)
			s = rest
		case c == ' ':
			s = strings.TrimLeft(s, " ")
			if len(s) == 0 {
				// Trailing spaces: there is no nested selector. Creating
				// an empty one would match any child node at all.
				return sel, nil
			}
			// Everything after the spaces is a nested selector.
			next, err := parse(s)
			if err != nil {
				return nil, err
			}
			sel.next = next
			return sel, nil
		default:
			return nil, fmt.Errorf("unexpected character %q in input", c)
		}
	}
	return sel, nil
}

// parseAttributeSelector parses an attribute selector of the form
// [attribute-name="attribute-value"] from the beginning of s.
//
// On success it returns the parsed selector and the remainder of s that
// follows the closing ']'. The value is taken verbatim up to the next '"';
// there is no escape syntax, so the value itself cannot contain a double
// quote. An empty value ([name=""]) is accepted.
//
// It returns an error if s is empty or any piece of the syntax is missing.
func parseAttributeSelector(s string) (*attributeSelector, string, error) {
	// The length check keeps an empty input from panicking on s[0].
	if len(s) == 0 || s[0] != '[' {
		return nil, "", errors.New("expected '[' at beginning of attribute selector")
	}
	name, rest := consumeIdentifier(s[1:])
	if len(name) == 0 {
		return nil, "", errors.New("expected attribute name after '[' in attribute selector")
	}
	if len(rest) == 0 || rest[0] != '=' {
		return nil, "", errors.New("expected '=' after attribute name in attribute selector")
	}
	rest = rest[1:]
	if len(rest) == 0 || rest[0] != '"' {
		return nil, "", errors.New("expected '\"' after = in attribute selector")
	}
	rest = rest[1:]
	end := strings.IndexByte(rest, '"')
	if end < 0 {
		return nil, "", errors.New("expected '\"' after attribute value")
	}
	value := rest[:end]
	rest = rest[end+1:]
	if len(rest) == 0 || rest[0] != ']' {
		return nil, "", errors.New("expected ']' at end of attribute selector")
	}
	return &attributeSelector{attribute: name, value: value}, rest[1:], nil
}

// isLetter reports whether b is an ASCII letter, 'a'-'z' or 'A'-'Z'.
func isLetter(b byte) bool {
	switch {
	case 'a' <= b && b <= 'z':
		return true
	case 'A' <= b && b <= 'Z':
		return true
	}
	return false
}

// isNumber reports whether b is an ASCII decimal digit, '0'-'9'.
func isNumber(b byte) bool {
	if b < '0' {
		return false
	}
	return b <= '9'
}

// consumeIdentifier splits s into the identifier at its beginning and
// whatever follows it. If s does not start with an identifier, letters is
// empty and rest is all of s.
//
// An identifier starts with a letter, hyphen or underscore and continues
// with letters, digits, hyphens and underscores. CSS doesn't allow
// identifiers to start with two hyphens or a hyphen followed by a digit,
// but both are accepted here.
func consumeIdentifier(s string) (letters, rest string) {
	end := 0
	for end < len(s) {
		c := s[end]
		// Digits are allowed everywhere except the first position.
		valid := isLetter(c) || c == '-' || c == '_' || (end > 0 && isNumber(c))
		if !valid {
			break
		}
		end++
	}
	return s[:end], s[end:]
}

// isAscii reports whether every byte of s is in the ASCII range
// (0 through 127). The empty string is ASCII.
func isAscii(s string) bool {
	for _, b := range []byte(s) {
		if b >= 0x80 {
			return false
		}
	}
	return true
}

// elementSelector matches a node that has the given element name.
type elementSelector struct {
	name string
}

// match reports whether n is an element node whose Data is exactly s.name.
// The comparison is case-sensitive.
func (s *elementSelector) match(n *html.Node) bool {
	return n.Type == html.ElementNode && n.Data == s.name
}

// attributeSelector matches a node with an attribute that has a given value.
type attributeSelector struct {
	attribute, value string
}

// match reports whether n is an element node carrying the attribute
// s.attribute with a value exactly equal to s.value. Only the first
// attribute with that key is examined.
func (s *attributeSelector) match(n *html.Node) bool {
	if n.Type != html.ElementNode {
		return false
	}
	for i := range n.Attr {
		if a := &n.Attr[i]; a.Key == s.attribute {
			return a.Val == s.value
		}
	}
	return false
}

// idSelector matches an element that has the given id.
type idSelector struct {
	id string
}

// match reports whether n is an element node whose "id" attribute is
// exactly s.id. Only the first "id" attribute is examined.
func (s *idSelector) match(n *html.Node) bool {
	if n.Type != html.ElementNode {
		return false
	}
	for i := range n.Attr {
		if a := &n.Attr[i]; a.Key == "id" {
			return a.Val == s.id
		}
	}
	return false
}

// classSelector matches an element that has the given class set on it.
type classSelector struct {
	class string
}

// match reports whether n is an element node whose "class" attribute,
// split on white space, contains s.class as one of its entries. Only the
// first "class" attribute is examined.
func (s *classSelector) match(n *html.Node) bool {
	if n.Type != html.ElementNode {
		return false
	}
	for _, attr := range n.Attr {
		if attr.Key != "class" {
			continue
		}
		for _, name := range strings.Fields(attr.Val) {
			if name == s.class {
				return true
			}
		}
		// The class attribute was found but does not list s.class.
		return false
	}
	return false
}
Loading

0 comments on commit a7a0e8d

Please sign in to comment.