From c0d06687a52bbc597c5cdd6ef870b33eb8fe6a79 Mon Sep 17 00:00:00 2001 From: Adam Simon Date: Wed, 21 Feb 2024 20:41:26 +0100 Subject: [PATCH] Fix line number tracking for invalid tagged template tokens + minor perf. improvement to template parsing --- src/Acornima/Parser.Expression.cs | 4 -- src/Acornima/Parser.Statement.cs | 6 +-- src/Acornima/Tokenizer.cs | 75 +++++++++++++++++++++---------- 3 files changed, 52 insertions(+), 33 deletions(-) diff --git a/src/Acornima/Parser.Expression.cs b/src/Acornima/Parser.Expression.cs index 571430b..ab1550e 100644 --- a/src/Acornima/Parser.Expression.cs +++ b/src/Acornima/Parser.Expression.cs @@ -478,8 +478,6 @@ private static bool IsLocalVariableAccess(Expression expr) case Identifier: return true; - // Original acornjs implementation doesn't handle the ParenthesizedExpression case. - // TODO: report bug case ParenthesizedExpression parenthesizedExpression: expr = parenthesizedExpression.Expression; continue; @@ -504,8 +502,6 @@ private static bool IsPrivateFieldAccess(Expression expr) expr = chainExpression.Expression; continue; - // Original acornjs implementation doesn't handle the ParenthesizedExpression case. - // TODO: report bug case ParenthesizedExpression parenthesizedExpression: expr = parenthesizedExpression.Expression; continue; diff --git a/src/Acornima/Parser.Statement.cs b/src/Acornima/Parser.Statement.cs index 3cb4e59..673065e 100644 --- a/src/Acornima/Parser.Statement.cs +++ b/src/Acornima/Parser.Statement.cs @@ -59,7 +59,7 @@ private bool IsLet(StatementContext context = StatementContext.Default) // is allowed. However, `let [` is an explicit negative lookahead for // ExpressionStatement, so special-case it first. - if (nextCh is '[' or '\\') // TODO: Acorn comment says '/' - report bug + if (nextCh is '[' or '\\') { return true; } @@ -1965,10 +1965,6 @@ private Expression ParseModuleExportName() private ArrayList ParseDirectivePrologue(bool allowStrictDirective) { - // NOTE: Original acornjs implementation of strict mode detection is fragile and buggy at the moment - // (e.g.: `() => { 'a'[0]; 'use strict'; 00 }` is rejected while valid). - // TODO: report bug - if (_tokenizerOptions._ecmaVersion < EcmaVersion.ES5) { return new ArrayList(); diff --git a/src/Acornima/Tokenizer.cs b/src/Acornima/Tokenizer.cs index 64b56c7..dabfbe2 100644 --- a/src/Acornima/Tokenizer.cs +++ b/src/Acornima/Tokenizer.cs @@ -972,6 +972,7 @@ private bool ReadString(int quote) { // https://github.com/acornjs/acorn/blob/8.11.3/acorn/src/tokenize.js > `pp.readString = function` + Unsafe.SkipInit(out bool normalizeRaw); _legacyOctalPosition = -1; AcquireStringBuilder(out var sb); try @@ -994,7 +995,7 @@ private bool ReadString(int quote) { case '\\': sb.Append(_input, chunkStart, _position - chunkStart); - if (ReadEscapedChar(sb, inTemplate: false) is null) + if (ReadEscapedChar(sb, inTemplate: false, ref normalizeRaw) is null) { return false; } @@ -1071,8 +1072,9 @@ private bool TryReadTemplateToken() _inTemplateElement = true; - var success = ReadTemplateToken(out var invalidTemplate) - && (!invalidTemplate || ReadInvalidTemplateToken()); + var normalizeRaw = false; + var success = ReadTemplateToken(ref normalizeRaw, out var invalidTemplate) + && (!invalidTemplate || ReadInvalidTemplateToken(ref normalizeRaw)); _inTemplateElement = false; @@ -1100,7 +1102,7 @@ private void InvalidStringToken(int pos, string message) Raise(pos, message); } - private bool ReadTemplateToken(out bool invalidTemplate) + private bool ReadTemplateToken(ref bool normalizeRaw, out bool invalidTemplate) { // https://github.com/acornjs/acorn/blob/8.11.3/acorn/src/tokenize.js > `pp.readTmplToken = function` @@ -1138,13 +1140,13 @@ private bool ReadTemplateToken(out bool invalidTemplate) var templateCooked = DeduplicateString(value, ref _stringPool, NonIdentifierDeduplicationThreshold); sb.Clear(); - var templateRaw = DeduplicateString(ReadTemplateRaw(sb), ref _stringPool, NonIdentifierDeduplicationThreshold); + var templateRaw = DeduplicateString(ReadTemplateRaw(sb, normalizeRaw), ref _stringPool, NonIdentifierDeduplicationThreshold); return FinishToken(TokenType.Template, new TemplateValue(templateCooked, templateRaw)); case '\\': sb.Append(_input, chunkStart, _position - chunkStart); - if (ReadEscapedChar(sb, inTemplate: true) is null) + if (ReadEscapedChar(sb, inTemplate: true, ref normalizeRaw) is null) { invalidTemplate = true; return true; @@ -1153,6 +1155,8 @@ private bool ReadTemplateToken(out bool invalidTemplate) break; case '\r': + normalizeRaw = true; + ++_position; sb.Append(_input, chunkStart, _position - chunkStart); sb[sb.Length - 1] = '\n'; @@ -1188,11 +1192,11 @@ private bool ReadTemplateToken(out bool invalidTemplate) } // Reads a template token to search for the end, without validating any escape sequences - private bool ReadInvalidTemplateToken() + private bool ReadInvalidTemplateToken(ref bool normalizeRaw) { // https://github.com/acornjs/acorn/blob/8.11.3/acorn/src/tokenize.js > `pp.readInvalidTemplateToken = function` - for (int ch; (ch = CharCodeAtPosition()) >= 0; _position++) + for (int ch; (ch = CharCodeAtPosition()) >= 0;) { switch (ch) { @@ -1210,45 +1214,67 @@ private bool ReadInvalidTemplateToken() goto case '`'; case '`': - // Original acornjs implementation doesn't normalize line endings in invalid raw strings. - // TODO: report bug - AcquireStringBuilder(out var sb); try { - var templateRaw = DeduplicateString(ReadTemplateRaw(sb), ref _stringPool, NonIdentifierDeduplicationThreshold); + var templateRaw = DeduplicateString(ReadTemplateRaw(sb, normalizeRaw), ref _stringPool, NonIdentifierDeduplicationThreshold); return FinishToken(TokenType.InvalidTemplate, new TemplateValue(null, templateRaw)); } finally { ReleaseStringBuilder(ref sb); } + + case '\r': + normalizeRaw = true; + + if (CharCodeAtPosition(1) == '\n') + { + ++_position; + } + + goto case '\n'; + + case '\n': + case '\u2028' or '\u2029': + ++_position; + ++_currentLine; + _lineStart = _position; + continue; } + + _position++; } return Raise(_start, "Unterminated template"); } - private ReadOnlySpan ReadTemplateRaw(StringBuilder sb) + private ReadOnlySpan ReadTemplateRaw(StringBuilder sb, bool normalizeRaw) { var chunkStart = _start; - for (int index; (index = _input.IndexOf('\r', chunkStart, _position - chunkStart)) >= 0;) + if (normalizeRaw) { - sb.Append(_input, chunkStart, index - chunkStart).Append('\n'); - chunkStart = index + 1; - if (_input.CharCodeAt(index + 1) == '\n') + for (int index; (index = _input.IndexOf('\r', chunkStart, _position - chunkStart)) >= 0;) { - chunkStart++; + sb.Append(_input, chunkStart, index - chunkStart).Append('\n'); + chunkStart = index + 1; + if (_input.CharCodeAt(index + 1) == '\n') + { + chunkStart++; + } } - } - return chunkStart == _start - ? _input.SliceBetween(chunkStart, _position) - : sb.Append(_input, chunkStart, _position - chunkStart).ToString().AsSpan(); + if (chunkStart != _start) + { + return sb.Append(_input, chunkStart, _position - chunkStart).ToString().AsSpan(); + } + } + return _input.SliceBetween(chunkStart, _position); } // Used to read escaped characters - private StringBuilder? ReadEscapedChar(StringBuilder sb, bool inTemplate) + private StringBuilder? ReadEscapedChar(StringBuilder sb, bool inTemplate, ref bool normalizeRaw) { // https://github.com/acornjs/acorn/blob/8.11.3/acorn/src/tokenize.js > `pp.readEscapedChar = function` + ++_position; var ch = CharCodeAtPosition(); ++_position; @@ -1264,6 +1290,8 @@ private ReadOnlySpan ReadTemplateRaw(StringBuilder sb) case 'f': return sb.Append('\f'); case '\r': + normalizeRaw = true; + if (CharCodeAtPosition() == '\n') // '\r\n' { ++_position; @@ -1274,7 +1302,6 @@ private ReadOnlySpan ReadTemplateRaw(StringBuilder sb) case '\n': // Unicode new line characters after \ get removed from output in both // template literals and strings - // TODO: looks like LineStart and CurrentLine update is missing from Acorn - report bug case '\u2028' or '\u2029': ++_currentLine; _lineStart = _position;