Merge pull request #43 from DiscreteTom/next

v0.15.0
DiscreteTom · Dec 30, 2023 · 94595ac · 94595ac
2 parents 9382110 + ce91f84
commit 94595ac
Show file tree

Hide file tree

Showing 94 changed files with 9,016 additions and 1,524 deletions.
diff --git a/.eslintrc.json b/.eslintrc.json
@@ -8,7 +8,7 @@
     "no-constant-condition": ["error", { "checkLoops": false }],
     "@typescript-eslint/no-unused-vars": [
       "error",
-      { "varsIgnorePattern": "^_$", "argsIgnorePattern": "^_" }
+      { "varsIgnorePattern": "^_+$", "argsIgnorePattern": "^_" }
     ],
     "eqeqeq": "warn"
   }

diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
@@ -11,7 +11,8 @@ jobs:
       - uses: actions/checkout@v3
       - uses: actions/setup-node@v3
         with:
-          node-version: 18
+          # Node.js v20.0.0 or later is needed for some ECMAScript features used in tests, like the regex `v` flag
+          node-version: 20
 
       - name: Install Dependencies
         run: yarn

diff --git a/.vscode/settings.json b/.vscode/settings.json
@@ -1,3 +1,3 @@
 {
-  "cSpell.words": ["anothers", "traversers", "Whitespaces"]
+  "cSpell.words": ["anothers", "subaction", "traversers", "Whitespaces"]
 }
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,5 +1,30 @@
 # CHANGELOG
 
+## v0.15.0
+
+- Lexer
+  - **_Breaking Change_**: remove `IReadonlyLexerCore.actions`, add `IReadonlyLexerCore.getTokenKinds`.
+  - **_Breaking Change_**: remove `Builder.getTokenKinds`.
+  - **_Breaking Change_**: remove `Lexer.fromTo`.
+  - **_Breaking Change_**: rewrite `stringLiteral/numericLiteral/comment`.
+    - Add `StringLiteralOptions/StringLiteralData/EscapeHandler`.
+  - **_Breaking Change_**: rewrite `javascript.regexLiteral`.
+  - **_Breaking Change_**: `Lexer.take/takeUntil` will reset the lexer's action state by default. [#42](https://github.com/DiscreteTom/retsac/issues/42)
+  - Feat: allow multiple calls for `Builder.state` and `Builder.error`.
+  - Feat: add `ITrimmedLexer/IReadonlyTrimmedLexer`. [#37](https://github.com/DiscreteTom/retsac/issues/37)
+  - Feat: `Builder.append` can accept actions with different data bindings in one call.
+  - Feat: add `Lexer.javascript.evalStringContent/simpleStringLiteral/singleQuoteStringLiteral/doubleQuoteStringLiteral/templateStringLiteralLeft/templateStringLiteralRight`.
+  - Feat: add `Lexer.integerLiteral/binaryIntegerLiteral/octalIntegerLiteral/hexIntegerLiteral` and `IntegerLiteralOptions/IntegerLiteralData`.
+    - Add `Lexer.javascript.binaryIntegerLiteral/octalIntegerLiteral/hexIntegerLiteral/identifier`.
+  - Feat: add `Lexer.json`.
+  - Feat: add `Lexer.SubAction`, `Lexer.IntoSubAction`.
+  - Feat: add `Lexer.Action.prevent`.
+  - Feat: add `Lexer.invalidRejecter`.
+  - Fix: `Lexer.trimStart` shouldn't collect non-muted errors.
+- Parser
+  - **_Breaking Change_**: the lexer will be trimmed after every parse. This should improve performance.
+    - `GrammarRuleContext.lexer` will be `IReadonlyTrimmedLexer` instead of `IReadonlyLexer`.
+
 ## v0.14.0
 
 - Lexer

diff --git a/README.md b/README.md
@@ -6,13 +6,15 @@
 ![license](https://img.shields.io/github/license/DiscreteTom/retsac?style=flat-square)
 [![Visual Studio Marketplace Version](https://img.shields.io/visual-studio-marketplace/v/DiscreteTom.vscode-retsac?label=VSCode%20extension&style=flat-square)](https://marketplace.visualstudio.com/items?itemName=DiscreteTom.vscode-retsac)
 
-> **Warning**
+> [!WARNING]
 > This project is still in early development stage, the API may change frequently.
 
 Text lexer and parser. Compiler frontend framework.
 
 This can be used to **_fast prototype_** your own programming language compiler/translator frontend, or parse your domain specific language.
 
+Try it online in the [playground](https://dttk.discretetom.com/js-playground?crushed=%28%27XpenXncieV%27https%253A%252F%252Fcdn.jsXlivr.net%252Fnpm%252FN%25400.15.0%252Fdist%252FN.min.js%27%255D%7EcellVHPYpaY9ULJ%252C%2520ELRI6NOtrue%7Eid%210%29%252CHWrite%2520the%2520PKr9lJZLJ.QXfine%257BUaM%252F123%252F_q%253B--GUpKrIZELR.AdvancedQlJ%257BlJ*XfineD%255C%27a%255C%27_D%2522entry%2522%252C%2520checkAllMtrueI%257D%253BC4418%29%252CHPK9Ys6pKr.pKAll%257B%2522123%2522%257D-Groot6Ys.buffer%255B0%255D--console.log%257Broot.toTYeStringq%257DC5544%29%255D%7EpanelVF5544%252CF4418%255D%29*%257D-zz.-%255Cr%255Cn6%2520%253D%25209%27%7EcoX%21%27GCOfalse%7Eid%21FD%257BUentryMF170372543Gconst%2520H%28%27name%21%27I%2520%29JexerKarseM%253A%2520NYtsacO%27%7EYadonly%21QBuilXr%257B*U%28%2520Vs%21%255BXdeYreZ6new%2520_I*buildq%257B%257Dz%2520%2520%2501zq_ZYXVUQONMKJIHGFDC96-*_).
+
 ## Installation
 
 ```bash
@@ -21,29 +23,33 @@ yarn add retsac
 
 ## Features
 
-- The Lexer, turns a text string to a [token](https://github.com/DiscreteTom/retsac/blob/main/src/lexer/model.ts) list.
-  - Regex support. See [examples](https://github.com/DiscreteTom/retsac#examples) below.
+- The Lexer, which yields [tokens](https://github.com/DiscreteTom/retsac/blob/main/src/lexer/model.ts) from the input text.
+  - Regex support. See [examples](#examples) below.
   - [Built-in util functions](https://github.com/DiscreteTom/retsac/blob/main/src/lexer/utils).
-  - Support custom functions to yield tokens from the input string.
+    - JavaScript's string literal, numeric literal, integer literal, identifier, etc.
+    - JSON's string literal, numeric literal.
+  - Support custom functions.
 - The Parser, co-work with the lexer and produce an [AST (Abstract Syntax Tree)](https://github.com/DiscreteTom/retsac/blob/main/src/parser/ast.ts).
   - ELR(Expectational LR) parser.
     - **_Meta characters_** like `+*?` when defining a grammar rule.
     - **_Conflict detection_**, try to **_auto resolve conflicts_**.
-    - Query children nodes by using `$('name')` instead of `children[0]`.
+    - Query children nodes by using `$('name')` instead of `children[index]`.
     - Top-down traverse the AST.
     - Bottom-up reduce data.
     - Expect lexer to yield specific token type and/or content.
     - Try to **_re-lex_** the input if parsing failed.
-    - **_DFA serialization_** to accelerate future building.
-  - Serializable AST to co-work with other tools (e.g. compiler backend libs).
+    - **_DFA serialization & hydration_** to accelerate future building.
+  - Serializable AST to co-work with other tools (e.g. compiler backend libs like LLVM).
 - Strict type checking with TypeScript.
-  - Including string literal type checking for token kinds and grammar kinds.
+  - _This is amazing; you should try it out yourself._
 
 ## Resources
 
-- [Documentation & API reference. (WIP)](https://discretetom.github.io/retsac/)
-- [VSCode extension.](https://github.com/DiscreteTom/vscode-retsac)
-- [Demo programming language which compiles to WebAssembly.](https://github.com/DiscreteTom/dt0)
+- [Documentation & API reference. (Deprecated. Working on a new one.)](https://discretetom.github.io/retsac/)
+- [A demo programming language which compiles to WebAssembly.](https://github.com/DiscreteTom/dt0)
+- [Build tmLanguage.json file in TypeScript with `tmlb`.](https://github.com/DiscreteTom/tmlb)
+- [Compose `RegExp` in JavaScript in a readable and maintainable way with `r-compose`.](https://github.com/DiscreteTom/r-compose)
+<!-- - [VSCode extension.](https://github.com/DiscreteTom/vscode-retsac) -->
 
 ## [Examples](https://github.com/DiscreteTom/retsac/tree/main/examples)
 
@@ -59,34 +65,41 @@ All conflicts are auto resolved.
 const lexer = new Lexer.Builder()
   .ignore(Lexer.whitespaces()) // ignore blank characters
   .define({
-    string: Lexer.stringLiteral(`"`), // double quote string literal
-    number: /-?(?:0|[1-9]\d*)(?:\.\d+)?(?:[eE][+-]?\d+)?/,
+    // built-in support for JSON
+    string: Lexer.json.stringLiteral(),
+    number: Lexer.json.numericLiteral(),
   })
-  .define(Lexer.wordKind("true", "false", "null")) // type's name is the literal value
-  .anonymous(Lexer.exact(..."[]{},:")) // single char borders
+  .define(Lexer.wordKind("true", "false", "null")) // token's kind name equals the literal value
+  .anonymous(Lexer.exact(..."[]{},:")) // single char borders without a kind name
   .build();
 
 export const builder = new ELR.AdvancedBuilder()
   .lexer(lexer)
   .data<unknown>()
   .define(
     { value: `string | number | true | false | null` },
-    // for string use `eval` to process escaped characters like `\n`
+    // eval the only child's text to get the value
     (d) => d.traverser(({ children }) => eval(children[0].text!)),
   )
-  .define({ value: `object | array` }, (d) =>
-    d.traverser(({ children }) => children[0].traverse()),
+  .define(
+    { value: `object | array` },
+    // call the only child's traverse method to get the object/array value
+    (d) => d.traverser(({ children }) => children[0].traverse()),
   )
   .define(
+    // `?` for zero or one, `*` for zero or more, use `()` to group
+    // quote literal values with `'` or `"`
     { array: `'[' (value (',' value)*)? ']'` },
-    // use `$$` to select all children with the given kind
+    // use `$$` to select all children with the given name
+    // traverse all values in the array and return the result as an array
     (d) => d.traverser(({ $$ }) => $$(`value`).map((v) => v.traverse())),
   )
   .define({ object: `'{' (object_item (',' object_item)*)? '}'` }, (d) =>
     d.traverser(({ $$ }) => {
       // every object_item's traverse result is an object, we need to merge them
-      const result: { [key: string]: unknown } = {};
+      const result: Record<string, unknown> = {};
       $$(`object_item`).forEach((item) => {
+        // traverse the child object_item to get the value, then merge the result
         Object.assign(result, item.traverse());
       });
       return result;
@@ -97,10 +110,10 @@ export const builder = new ELR.AdvancedBuilder()
     { object_item: `string@key ':' value` },
     // return an object
     (d) =>
-      // use `$` to select the first child with the given kind
+      // use `$` to select the first child with the given name
       d.traverser(({ $ }) => {
-        const result: { [key: string]: unknown } = {};
-        // remove the double quotes in the key string
+        const result: Record<string, unknown> = {};
+        // remove the double quotes in the key string, then traverse child to get the value
         result[$(`key`)!.text!.slice(1, -1)] = $(`value`)!.traverse();
         return result;
       }),
@@ -154,44 +167,6 @@ export const builder = new ELR.ParserBuilder()
 
 </details>
 
-### [Function Definition](https://github.com/DiscreteTom/retsac/blob/main/examples/parser/advanced-builder/advanced-builder.ts)
-
-This example shows you how to define a simple `fn_def` grammar rule if you want to build a programming language compiler.
-
-<details><summary>Click to Expand</summary>
-
-```ts
-const lexer = new Lexer.Builder()
-  .ignore(Lexer.whitespaces()) // ignore blank chars
-  .define(Lexer.wordKind("pub", "fn", "return", "let")) // keywords
-  .define({
-    integer: /([1-9][0-9]*|0)/,
-    identifier: /[a-zA-Z_]\w*/,
-  })
-  .anonymous(Lexer.exact(..."+-*/():{};=,")) // single char operator
-  .build();
-
-export const builder = new ELR.AdvancedBuilder()
-  .lexer(lexer)
-  .define({
-    // use `@` to rename a node
-    fn_def: `
-      pub fn identifier@funcName '(' (param (',' param)*)? ')' ':' identifier@retType '{'
-        stmt*
-      '}'
-    `,
-  })
-  .define({ param: `identifier ':' identifier` })
-  .define({ stmt: `assign_stmt | ret_stmt` }, (d) => d.commit()) // commit to prevent re-lex, optimize performance
-  .define({ assign_stmt: `let identifier ':' identifier '=' exp ';'` })
-  .define({ ret_stmt: `return exp ';'` })
-  .define({ exp: `integer | identifier` })
-  .define({ exp: `exp '+' exp` })
-  .priority({ exp: `exp '+' exp` });
-```
-
-</details>
-
 ## Contribute
 
 All issues and pull requests are highly welcomed.

diff --git a/README.src.md b/README.src.md
@@ -6,13 +6,15 @@
 ![license](https://img.shields.io/github/license/DiscreteTom/retsac?style=flat-square)
 [![Visual Studio Marketplace Version](https://img.shields.io/visual-studio-marketplace/v/DiscreteTom.vscode-retsac?label=VSCode%20extension&style=flat-square)](https://marketplace.visualstudio.com/items?itemName=DiscreteTom.vscode-retsac)
 
-> **Warning**
+> [!WARNING]
 > This project is still in early development stage, the API may change frequently.
 
 Text lexer and parser. Compiler frontend framework.
 
 This can be used to **_fast prototype_** your own programming language compiler/translator frontend, or parse your domain specific language.
 
+Try it online in the [playground](https://dttk.discretetom.com/js-playground?crushed=%28%27XpenXncieV%27https%253A%252F%252Fcdn.jsXlivr.net%252Fnpm%252FN%25400.15.0%252Fdist%252FN.min.js%27%255D%7EcellVHPYpaY9ULJ%252C%2520ELRI6NOtrue%7Eid%210%29%252CHWrite%2520the%2520PKr9lJZLJ.QXfine%257BUaM%252F123%252F_q%253B--GUpKrIZELR.AdvancedQlJ%257BlJ*XfineD%255C%27a%255C%27_D%2522entry%2522%252C%2520checkAllMtrueI%257D%253BC4418%29%252CHPK9Ys6pKr.pKAll%257B%2522123%2522%257D-Groot6Ys.buffer%255B0%255D--console.log%257Broot.toTYeStringq%257DC5544%29%255D%7EpanelVF5544%252CF4418%255D%29*%257D-zz.-%255Cr%255Cn6%2520%253D%25209%27%7EcoX%21%27GCOfalse%7Eid%21FD%257BUentryMF170372543Gconst%2520H%28%27name%21%27I%2520%29JexerKarseM%253A%2520NYtsacO%27%7EYadonly%21QBuilXr%257B*U%28%2520Vs%21%255BXdeYreZ6new%2520_I*buildq%257B%257Dz%2520%2520%2501zq_ZYXVUQONMKJIHGFDC96-*_).
+
 ## Installation
 
 ```bash
@@ -21,29 +23,33 @@ yarn add retsac
 
 ## Features
 
-- The Lexer, turns a text string to a [token](https://github.com/DiscreteTom/retsac/blob/main/src/lexer/model.ts) list.
-  - Regex support. See [examples](https://github.com/DiscreteTom/retsac#examples) below.
+- The Lexer, which yields [tokens](https://github.com/DiscreteTom/retsac/blob/main/src/lexer/model.ts) from the input text.
+  - Regex support. See [examples](#examples) below.
   - [Built-in util functions](https://github.com/DiscreteTom/retsac/blob/main/src/lexer/utils).
-  - Support custom functions to yield tokens from the input string.
+    - JavaScript's string literal, numeric literal, integer literal, identifier, etc.
+    - JSON's string literal, numeric literal.
+  - Support custom functions.
 - The Parser, co-work with the lexer and produce an [AST (Abstract Syntax Tree)](https://github.com/DiscreteTom/retsac/blob/main/src/parser/ast.ts).
   - ELR(Expectational LR) parser.
     - **_Meta characters_** like `+*?` when defining a grammar rule.
     - **_Conflict detection_**, try to **_auto resolve conflicts_**.
-    - Query children nodes by using `$('name')` instead of `children[0]`.
+    - Query children nodes by using `$('name')` instead of `children[index]`.
     - Top-down traverse the AST.
     - Bottom-up reduce data.
     - Expect lexer to yield specific token type and/or content.
     - Try to **_re-lex_** the input if parsing failed.
-    - **_DFA serialization_** to accelerate future building.
-  - Serializable AST to co-work with other tools (e.g. compiler backend libs).
+    - **_DFA serialization & hydration_** to accelerate future building.
+  - Serializable AST to co-work with other tools (e.g. compiler backend libs like LLVM).
 - Strict type checking with TypeScript.
-  - Including string literal type checking for token kinds and grammar kinds.
+  - _This is amazing; you should try it out yourself._
 
 ## Resources
 
-- [Documentation & API reference. (WIP)](https://discretetom.github.io/retsac/)
-- [VSCode extension.](https://github.com/DiscreteTom/vscode-retsac)
-- [Demo programming language which compiles to WebAssembly.](https://github.com/DiscreteTom/dt0)
+- [Documentation & API reference. (Deprecated. Working on a new one.)](https://discretetom.github.io/retsac/)
+- [A demo programming language which compiles to WebAssembly.](https://github.com/DiscreteTom/dt0)
+- [Build tmLanguage.json file in TypeScript with `tmlb`.](https://github.com/DiscreteTom/tmlb)
+- [Compose `RegExp` in JavaScript in a readable and maintainable way with `r-compose`.](https://github.com/DiscreteTom/r-compose)
+<!-- - [VSCode extension.](https://github.com/DiscreteTom/vscode-retsac) -->
 
 ## [Examples](https://github.com/DiscreteTom/retsac/tree/main/examples)
 
@@ -55,7 +61,7 @@ All conflicts are auto resolved.
 
 <details open>
 <summary>Click to Expand</summary>
-<include path="./examples/parser/json/json.ts" from="6" to="54" />
+<include path="./examples/parser/json/json.ts" from="6" to="61" />
 </details>
 
 ### [Calculator](https://github.com/DiscreteTom/retsac/blob/main/examples/parser/calculator/calculator.ts)
@@ -69,15 +75,6 @@ There are conflicts introduced by those grammar rules, we use the high-level res
 <include path="./examples/parser/calculator/calculator.ts" from="8" to="39" />
 </details>
 
-### [Function Definition](https://github.com/DiscreteTom/retsac/blob/main/examples/parser/advanced-builder/advanced-builder.ts)
-
-This example shows you how to define a simple `fn_def` grammar rule if you want to build a programming language compiler.
-
-<details>
-<summary>Click to Expand</summary>
-<include path="./examples/parser/advanced-builder/advanced-builder.ts" from="8" to="34" />
-</details>
-
 ## Contribute
 
 All issues and pull requests are highly welcomed.

diff --git a/examples/lexer/lexer-error-handling/lexer-error-handling.test.ts b/examples/lexer/lexer-error-handling/lexer-error-handling.test.ts
@@ -21,18 +21,14 @@ describe("lexer error handling", () => {
     expect(token1!.error).toBeUndefined();
 
     const token2 = lexer.reset().lex(`12e34e56`);
-    expect(token2!.content).toBe(`12e34e56`);
+    expect(token2!.content).toBe(`12e34`);
     expect(token2!.error).toBe("invalid numeric literal");
   });
 
   test("invalid identifier", () => {
     const token1 = lexer.reset().lex(`abc`);
     expect(token1!.content).toBe(`abc`);
     expect(token1!.error).toBeUndefined();
-
-    const token2 = lexer.reset().lex(`123abc`);
-    expect(token2!.content).toBe(`123abc`);
-    expect(token2!.error).toBe("identifier should not starts with a number");
   });
 
   test("fallback handler", () => {

diff --git a/examples/lexer/lexer-error-handling/lexer-error-handling.ts b/examples/lexer/lexer-error-handling/lexer-error-handling.ts
@@ -10,20 +10,20 @@ export const lexer = new Lexer.Builder()
     // built-in utils will check common errors and flag it in output.data
     // you can set error by checking output.data
     string: Lexer.stringLiteral(`"`).check(({ output }) =>
-      output.data.unclosed ? ("unclosed string literal" as string) : undefined,
+      output.data.unclosed ? "unclosed string literal" : undefined,
     ),
     number: Lexer.javascript
       .numericLiteral()
       .check(({ output }) =>
-        output.data.invalid ? ("invalid numeric literal" as string) : undefined,
+        output.data.invalid ? "invalid numeric literal" : undefined,
       ),
     // you can customize your own error handling function using `check`
     identifier: (a) =>
       a
         .from(/\w+/)
         .check(({ output }) =>
           output.content.match(/\d/)
-            ? ("identifier should not starts with a number" as string)
+            ? "identifier should not starts with a number"
             : undefined,
         ),
   })

diff --git a/examples/lexer/string-literal/string-literal.test.ts b/examples/lexer/string-literal/string-literal.test.ts
@@ -12,25 +12,10 @@ test("basic string", () => {
   assertEq("`123`");
 });
 
-test("escaped string", () => {
-  assertEq(`'123\\'456'`);
-  assertEq(`'123\\\\'`);
-  assertEq(`"123\\"456"`);
-  assertEq(`"123\\\\"`);
-  assertEq("`123\n456`");
-  assertEq("`123\\`456`");
-  assertEq("`123\\\\`");
-});
-
 test("custom quotes", () => {
   assertEq(`*123*`);
-  assertEq(`*123\\*456*`);
-  assertEq(`*123\\\\*`);
 });
 
 test("custom boundary", () => {
   assertEq(`^123$$`);
-  assertEq(`^123\\$$456$$`);
-  assertEq(`^123$\\$456$$`);
-  assertEq(`^123\\\\$$`);
 });
diff --git a/examples/parser/ignore-entry-follow/ignore-entry-follow.test.ts b/examples/parser/ignore-entry-follow/ignore-entry-follow.test.ts
@@ -148,7 +148,7 @@ test("with ignoreEntryFollow", () => {
   // now we can use `parse` to get the first top-level statement
   let res = parser.reset().parse("fn foo(); fn bar();");
   expect(res.accept).toBe(true);
-  expect(parser.lexer.getRest()).toBe(" fn bar();");
+  expect(parser.lexer.getRest()).toBe("fn bar();");
   // and take it out from the parser buffer
   parser.take(1);
   // then parse the second top-level statement
@@ -207,7 +207,7 @@ test("abuse", () => {
   // thus the `b` will never be parsed
   res = parser.reset().parse("a b");
   expect(res.accept).toBe(true);
-  expect(parser.lexer.getRest()).toBe(" b");
+  expect(parser.lexer.getRest()).toBe("b");
 
   // as you can see, if the entry NT can be early accepted if we ignore entry follow,
   // we shouldn't use `ignoreEntryFollow`