Skip to content

Commit

Permalink
Improved support for Unicode code point escapes (\u{...}) in regexes,…
Browse files Browse the repository at this point in the history
… improved number parsing compatibility.
  • Loading branch information
dop251 committed Aug 28, 2024
1 parent 9387d12 commit 7ece585
Show file tree
Hide file tree
Showing 9 changed files with 120 additions and 64 deletions.
4 changes: 2 additions & 2 deletions builtin_regexp.go
Original file line number Diff line number Diff line change
Expand Up @@ -241,7 +241,7 @@ func compileRegexp(patternStr, flags string) (p *regexpPattern, err error) {
patternStr = convertRegexpToUtf16(patternStr)
}

re2Str, err1 := parser.TransformRegExp(patternStr, dotAll)
re2Str, err1 := parser.TransformRegExp(patternStr, dotAll, unicode)
if err1 == nil {
re2flags := ""
if multiline {
Expand All @@ -268,7 +268,7 @@ func compileRegexp(patternStr, flags string) (p *regexpPattern, err error) {
err = err1
return
}
wrapper2, err = compileRegexp2(patternStr, multiline, dotAll, ignoreCase)
wrapper2, err = compileRegexp2(patternStr, multiline, dotAll, ignoreCase, unicode)
if err != nil {
err = fmt.Errorf("Invalid regular expression (regexp2): %s (%v)", patternStr, err)
return
Expand Down
5 changes: 0 additions & 5 deletions parser/expression.go
Original file line number Diff line number Diff line change
Expand Up @@ -249,11 +249,6 @@ func (self *_parser) parseRegExpLiteral() *ast.RegExpLiteral {

literal := self.str[offset:endOffset]

// Unicode CodePoint sequence
if flags == "u" && strings.Contains(pattern, "_") {
self.error(offset, "Invalid Unicode escape sequence")
}

return &ast.RegExpLiteral{
Idx: idx,
Literal: literal,
Expand Down
17 changes: 10 additions & 7 deletions parser/regexp.go
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,8 @@ type _RegExp_parser struct {
goRegexp strings.Builder
passOffset int

dotAll bool // Enable dotAll mode
dotAll bool // Enable dotAll mode
unicode bool
}

// TransformRegExp transforms a JavaScript pattern into a Go "regexp" pattern.
Expand All @@ -57,16 +58,17 @@ type _RegExp_parser struct {
//
// If the pattern is invalid (not valid even in JavaScript), then this function
// returns an empty string and a generic error.
func TransformRegExp(pattern string, dotAll bool) (transformed string, err error) {
func TransformRegExp(pattern string, dotAll, unicode bool) (transformed string, err error) {

if pattern == "" {
return "", nil
}

parser := _RegExp_parser{
str: pattern,
length: len(pattern),
dotAll: dotAll,
str: pattern,
length: len(pattern),
dotAll: dotAll,
unicode: unicode,
}
err = parser.parse()
if err != nil {
Expand Down Expand Up @@ -292,7 +294,7 @@ func (self *_RegExp_parser) scanEscape(inClass bool) {

case 'u':
self.read()
if self.chr == '{' {
if self.chr == '{' && self.unicode {
self.read()
length, base = 0, 16
} else {
Expand Down Expand Up @@ -392,7 +394,8 @@ func (self *_RegExp_parser) scanEscape(inClass bool) {
digit := uint32(digitValue(self.chr))
if digit >= base {
// Not a valid digit
goto skip
self.error(true, "Invalid Unicode escape")
return
}
self.read()
}
Expand Down
14 changes: 7 additions & 7 deletions parser/regexp_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ func TestRegExp(t *testing.T) {
{
// err
test := func(input string, expect interface{}) {
_, err := TransformRegExp(input, false)
_, err := TransformRegExp(input, false, false)
_, incompat := err.(RegexpErrorIncompatible)
is(incompat, false)
is(err, expect)
Expand All @@ -33,7 +33,7 @@ func TestRegExp(t *testing.T) {
{
// incompatible
test := func(input string, expectErr interface{}) {
_, err := TransformRegExp(input, false)
_, err := TransformRegExp(input, false, false)
_, incompat := err.(RegexpErrorIncompatible)
is(incompat, true)
is(err, expectErr)
Expand All @@ -54,7 +54,7 @@ func TestRegExp(t *testing.T) {
{
// err
test := func(input string, expect string) {
result, err := TransformRegExp(input, false)
result, err := TransformRegExp(input, false, false)
is(err, nil)
_, incompat := err.(RegexpErrorIncompatible)
is(incompat, false)
Expand Down Expand Up @@ -151,18 +151,18 @@ func TestRegExp(t *testing.T) {

func TestTransformRegExp(t *testing.T) {
tt(t, func() {
pattern, err := TransformRegExp(`\s+abc\s+`, false)
pattern, err := TransformRegExp(`\s+abc\s+`, false, false)
is(err, nil)
is(pattern, `[`+WhitespaceChars+`]+abc[`+WhitespaceChars+`]+`)
is(regexp.MustCompile(pattern).MatchString("\t abc def"), true)
})
tt(t, func() {
pattern, err := TransformRegExp(`\u{1d306}`, false)
pattern, err := TransformRegExp(`\u{1d306}`, false, true)
is(err, nil)
is(pattern, `\x{1d306}`)
})
tt(t, func() {
pattern, err := TransformRegExp(`\u1234`, false)
pattern, err := TransformRegExp(`\u1234`, false, false)
is(err, nil)
is(pattern, `\x{1234}`)
})
Expand All @@ -173,7 +173,7 @@ func BenchmarkTransformRegExp(b *testing.B) {
b.ResetTimer()
b.ReportAllocs()
for i := 0; i < b.N; i++ {
_, _ = TransformRegExp(reStr, false)
_, _ = TransformRegExp(reStr, false, false)
}
}

Expand Down
7 changes: 5 additions & 2 deletions regexp.go
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ type regexpPattern struct {
regexp2Wrapper *regexp2Wrapper
}

func compileRegexp2(src string, multiline, dotAll, ignoreCase bool) (*regexp2Wrapper, error) {
func compileRegexp2(src string, multiline, dotAll, ignoreCase, unicode bool) (*regexp2Wrapper, error) {
var opts regexp2.RegexOptions = regexp2.ECMAScript
if multiline {
opts |= regexp2.Multiline
Expand All @@ -78,6 +78,9 @@ func compileRegexp2(src string, multiline, dotAll, ignoreCase bool) (*regexp2Wra
if ignoreCase {
opts |= regexp2.IgnoreCase
}
if unicode {
opts |= regexp2.Unicode
}
regexp2Pattern, err1 := regexp2.Compile(src, opts)
if err1 != nil {
return nil, fmt.Errorf("Invalid regular expression (regexp2): %s (%v)", src, err1)
Expand All @@ -90,7 +93,7 @@ func (p *regexpPattern) createRegexp2() {
if p.regexp2Wrapper != nil {
return
}
rx, err := compileRegexp2(p.src, p.multiline, p.dotAll, p.ignoreCase)
rx, err := compileRegexp2(p.src, p.multiline, p.dotAll, p.ignoreCase, p.unicode)
if err != nil {
// At this point the regexp should have been successfully converted to re2; if it fails now, it's a bug.
panic(err)
Expand Down
26 changes: 26 additions & 0 deletions regexp_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -721,6 +721,32 @@ func TestRegexpDotAll(t *testing.T) {

}

// TestRegexpNumSeparators checks how underscores interact with \u{...} code
// point escapes in regular expressions that carry the "u" flag:
//   - an underscore placed after a complete escape (\u{65}_) is an ordinary
//     pattern character, so the regex matches "ae_" but not "e_";
//   - an underscore inside the braces (\u{6_5}) makes the RegExp constructor
//     throw a SyntaxError, both with and without a leading lookbehind.
//
// NOTE(review): the (?<=a) lookbehind presumably forces the regexp2 code path,
// since Go's RE2-based regexp package has no lookbehind — confirm against
// compileRegexp.
func TestRegexpNumSeparators(t *testing.T) {
const SCRIPT = `
const re = /(?<=a)\u{65}_/u;
assert(re.test("ae_") && !re.test("e_"));
assert.throws(SyntaxError, () => {
new RegExp("(?<=a)\\u{6_5}", "u");
});
assert.throws(SyntaxError, () => {
new RegExp("a\\u{6_5}", "u");
});
`
testScriptWithTestLib(SCRIPT, _undefined, t)
}

// TestRegexpUnicodeEscape checks that \u{...} is only treated as a code point
// escape when the regex has the "u" flag:
//   - without the flag, /\u{0_2}/ matches the literal text "u{0_2}";
//   - with the flag, /\u{2}/u matches the single character U+0002;
//   - without the flag, /\u{2}/ matches "uu", i.e. the letter "u" with a {2}
//     quantifier.
func TestRegexpUnicodeEscape(t *testing.T) {
const SCRIPT = `
assert.sameValue("u{0_2}".match(/\u{0_2}/)[0], "u{0_2}");
assert.sameValue("uu\x02".match(/\u{2}/u)[0], '\x02');
assert.sameValue("uu\x02".match(/\u{2}/)[0], "uu");
`
testScriptWithTestLib(SCRIPT, _undefined, t)
}

func BenchmarkRegexpSplitWithBackRef(b *testing.B) {
const SCRIPT = `
"aaaaaaaaaaaaaaaaaaaaaaaaa++bbbbbbbbbbbbbbbbbbbbbb+-ccccccccccccccccccccccc".split(/([+-])\1/)
Expand Down
25 changes: 25 additions & 0 deletions runtime_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2979,6 +2979,31 @@ func TestDestructAssignToSymbol(t *testing.T) {
testScriptWithTestLib(SCRIPT, _undefined, t)
}

// TestToNumber checks string-to-number conversion as seen from script code:
//   - a lone sign ("+", "++", "-") converts to NaN;
//   - hexadecimal-float-looking strings ("0xfp1", with either case of the
//     prefix, an optional sign, and leading or embedded whitespace) convert
//     to NaN;
//   - "0." converts to 0, a whitespace-only string converts to 0, and
//     "Infinity" with leading whitespace converts to Infinity;
//   - using such a hex-float string as the argument to Array.prototype.at
//     selects element 0.
func TestToNumber(t *testing.T) {
const SCRIPT = `
assert(isNaN(Number("+")));
assert(isNaN(Number("++")));
assert(isNaN(Number("-")));
assert(isNaN(Number("0xfp1")));
assert(isNaN(Number("0Xfp1")));
assert(isNaN(Number("+0xfp1")));
assert(isNaN(Number(" +0xfp1")));
assert(isNaN(Number(" + 0xfp1")));
assert(isNaN(Number(" 0xfp1")));
assert(isNaN(Number("-0xfp1")));
assert(isNaN(Number("- 0xfp1")));
assert(isNaN(Number(" - 0xfp1")));
assert.sameValue(Number("0."), 0);
assert.sameValue(Number(" "), 0);
assert.sameValue(Number(" Infinity"), Infinity);
let a = [1];
assert.sameValue(1, a.at("0xfp1"));
assert.sameValue(1, a.at(" 0xfp1"));
`
testScriptWithTestLib(SCRIPT, _undefined, t)
}

/*
func TestArrayConcatSparse(t *testing.T) {
function foo(a,b,c)
Expand Down
75 changes: 45 additions & 30 deletions string_ascii.go
Original file line number Diff line number Diff line change
Expand Up @@ -103,8 +103,8 @@ func stringToInt(ss string) (int64, error) {
return strconv.ParseInt(ss, 10, 64)
}

func (s asciiString) _toInt() (int64, error) {
return stringToInt(strings.TrimSpace(string(s)))
func (s asciiString) _toInt(trimmed string) (int64, error) {
return stringToInt(trimmed)
}

func isRangeErr(err error) bool {
Expand All @@ -114,18 +114,36 @@ func isRangeErr(err error) bool {
return false
}

func (s asciiString) _toFloat() (float64, error) {
ss := strings.ToLower(strings.TrimSpace(string(s)))
if ss == "" {
func (s asciiString) _toFloat(trimmed string) (float64, error) {
if trimmed == "" {
return 0, nil
}
if ss == "-0" {
if trimmed == "-0" {
var f float64
return -f, nil
}

f, err := strconv.ParseFloat(ss, 64)
// Go allows underscores in numbers when parsing them as floats, but ECMAScript expects such strings to be interpreted as NaN.
if strings.ContainsRune(trimmed, '_') {
return 0, strconv.ErrSyntax
}

// Hexadecimal floats are not supported by ECMAScript.
if len(trimmed) >= 2 {
var prefix string
if trimmed[0] == '-' || trimmed[0] == '+' {
prefix = trimmed[1:]
} else {
prefix = trimmed
}
if len(prefix) >= 2 && prefix[0] == '0' && (prefix[1] == 'x' || prefix[1] == 'X') {
return 0, strconv.ErrSyntax
}
}

f, err := strconv.ParseFloat(trimmed, 64)
if err == nil && math.IsInf(f, 0) {
ss := strings.ToLower(trimmed)
if strings.HasPrefix(ss, "inf") || strings.HasPrefix(ss, "-inf") || strings.HasPrefix(ss, "+inf") {
// "Infinity" is handled separately; prevent other spellings from being parsed as Infinity due to strconv.ParseFloat()'s permissive syntax.
return 0, strconv.ErrSyntax
Expand All @@ -138,18 +156,19 @@ func (s asciiString) _toFloat() (float64, error) {
}

func (s asciiString) ToInteger() int64 {
if s == "" {
ss := strings.TrimSpace(string(s))
if ss == "" {
return 0
}
if s == "Infinity" || s == "+Infinity" {
if ss == "Infinity" || ss == "+Infinity" {
return math.MaxInt64
}
if s == "-Infinity" {
if ss == "-Infinity" {
return math.MinInt64
}
i, err := s._toInt()
i, err := s._toInt(ss)
if err != nil {
f, err := s._toFloat()
f, err := s._toFloat(ss)
if err == nil {
return int64(f)
}
Expand All @@ -170,18 +189,19 @@ func (s asciiString) String() string {
}

func (s asciiString) ToFloat() float64 {
if s == "" {
ss := strings.TrimSpace(string(s))
if ss == "" {
return 0
}
if s == "Infinity" || s == "+Infinity" {
if ss == "Infinity" || ss == "+Infinity" {
return math.Inf(1)
}
if s == "-Infinity" {
if ss == "-Infinity" {
return math.Inf(-1)
}
f, err := s._toFloat()
f, err := s._toFloat(ss)
if err != nil {
i, err := s._toInt()
i, err := s._toInt(ss)
if err == nil {
return float64(i)
}
Expand All @@ -195,27 +215,22 @@ func (s asciiString) ToBoolean() bool {
}

func (s asciiString) ToNumber() Value {
if s == "" {
ss := strings.TrimSpace(string(s))
if ss == "" {
return intToValue(0)
}
if s == "Infinity" || s == "+Infinity" {
if ss == "Infinity" || ss == "+Infinity" {
return _positiveInf
}
if s == "-Infinity" {
if ss == "-Infinity" {
return _negativeInf
}

// Go allows underscores in numbers, when parsed as floats,
// as in s._toFloat(), but JS expect them to be interpreted as NaN.
if strings.Contains(string(s), "_") {
return _NaN
}

if i, err := s._toInt(); err == nil {
if i, err := s._toInt(ss); err == nil {
return intToValue(i)
}

if f, err := s._toFloat(); err == nil {
if f, err := s._toFloat(ss); err == nil {
return floatToValue(f)
}

Expand All @@ -236,7 +251,7 @@ func (s asciiString) Equals(other Value) bool {
}

if o, ok := other.(valueInt); ok {
if o1, e := s._toInt(); e == nil {
if o1, e := s._toInt(strings.TrimSpace(string(s))); e == nil {
return o1 == int64(o)
}
return false
Expand All @@ -247,7 +262,7 @@ func (s asciiString) Equals(other Value) bool {
}

if o, ok := other.(valueBool); ok {
if o1, e := s._toFloat(); e == nil {
if o1, e := s._toFloat(strings.TrimSpace(string(s))); e == nil {
return o1 == o.ToFloat()
}
return false
Expand Down
Loading

0 comments on commit 7ece585

Please sign in to comment.