diff --git a/CHANGELOG.md b/CHANGELOG.md index fb7d5e0b..d5203ceb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,7 @@ ### Breaking changes: +- [#424]: `any` now consumes an entire code point (i.e., a full Unicode character), not just a single, 16-bit code unit. - [55c787b]: The namespace helpers (`namespace`, `extendNamespace`) have been removed. (These were always optional.) - [bea0be9]: When used as an ES module, the main 'ohm-js' module now has _only_ named exports (i.e., no default export). The same is true for `ohm-js/extras`. - [#395]: In generated type definitions, action dictionary types now inherit from `BaseActionDict`, a new supertype of `ActionDict`. diff --git a/doc/releases/ohm-js-17.0.md b/doc/releases/ohm-js-17.0.md index a350f027..637f0220 100644 --- a/doc/releases/ohm-js-17.0.md +++ b/doc/releases/ohm-js-17.0.md @@ -6,6 +6,24 @@ This version also has experimental support for indentation-sensitive grammars. ## Upgrading +### `any` now consumes a full code point + +In JavaScript, a string is a sequence of 16-bit code units. Some Unicode characters, such as emoji, are encoded as pairs of 16-bit values. For example, the string '😆' has length 2, but contains a single Unicode code point. Previously, `any` matched a single 16-bit code unit — even if that unit was part of a surrogate pair. In v17, `any` now matches a full Unicode character. + +Old behaviour: + +```js +const g = ohm.grammar('OneChar { start = any }'); +g.match('😆').succeeded(); // false +``` + +New behaviour (Ohm v17+): + +```js +const g = ohm.grammar('OneChar { start = any }'); +g.match('😆').succeeded(); // true +``` + ### Namespace helpers removed The top-level `namespace` and `extendNamespace` functions have been removed. They were never required — it was always possible to use a plain old object in any API that asked for a namespace. 
diff --git a/doc/syntax-reference.md b/doc/syntax-reference.md index 552cd6db..d50bc788 100644 --- a/doc/syntax-reference.md +++ b/doc/syntax-reference.md @@ -146,7 +146,9 @@ as well as multiline (`/* */`) comments like: (See [src/built-in-rules.ohm](https://github.com/harc/ohm/blob/main/packages/ohm-js/src/built-in-rules.ohm).) -`any`: Matches the next character in the input stream, if one exists. +`any`: Matches the next Unicode character — i.e., a single code point — in the input stream, if one exists. + +**NOTE:** A JavaScript string is a sequence of 16-bit _code units_. Some Unicode characters, such as emoji, are encoded as pairs of 16-bit values. For example, the string `'😆'` has length 2, but contains a single Unicode code point. Prior to Ohm v17, `any` always consumed a single 16-bit code unit, rather than a full Unicode character. `letter`: Matches a single character which is a letter (either uppercase or lowercase). diff --git a/packages/ohm-js/package.json b/packages/ohm-js/package.json index 390501a6..ca133d91 100644 --- a/packages/ohm-js/package.json +++ b/packages/ohm-js/package.json @@ -1,6 +1,6 @@ { "name": "ohm-js", - "version": "17.0.0", + "version": "17.0.1", "description": "An object-oriented language for parsing and pattern matching", "repository": "https://github.com/harc/ohm", "keywords": [ diff --git a/packages/ohm-js/src/pexprs-eval.js b/packages/ohm-js/src/pexprs-eval.js index 231c7fff..32f74727 100644 --- a/packages/ohm-js/src/pexprs-eval.js +++ b/packages/ohm-js/src/pexprs-eval.js @@ -29,9 +29,9 @@ pexprs.PExpr.prototype.eval = common.abstract('eval'); // function(state) { ... 
pexprs.any.eval = function(state) { const {inputStream} = state; const origPos = inputStream.pos; - const ch = inputStream.next(); - if (ch) { - state.pushBinding(new TerminalNode(ch.length), origPos); + const cp = inputStream.nextCodePoint(); + if (cp !== undefined) { + state.pushBinding(new TerminalNode(String.fromCodePoint(cp).length), origPos); return true; } else { state.processFailure(origPos, this); diff --git a/packages/ohm-js/test/test-ohm-syntax.js b/packages/ohm-js/test/test-ohm-syntax.js index 1f3c60fc..e619ccba 100644 --- a/packages/ohm-js/test/test-ohm-syntax.js +++ b/packages/ohm-js/test/test-ohm-syntax.js @@ -256,6 +256,28 @@ test('ranges w/ code points > 0xFFFF, special cases', t => { assertSucceeds(t, g2.match('\u{D83D}x')); }); +test('any consumes an entire code point', t => { + const g = ohm.grammar('G { start = any any }'); + const re = /../u; // The regex equivalent of `any any`. + + t.is('😇'.length, 2); + t.is('😇!'.length, 3); + t.is('😇😇'.length, 4); + + t.is(g.match('😇😇').succeeded(), true); + t.truthy(re.exec('😇😇')); + + t.is(g.match('😇!').succeeded(), true); + t.truthy(re.exec('😇!')); + + t.is(g.match('!😇').succeeded(), true); + t.truthy(re.exec('!😇')); + + t.is('👋🏿'.length, 4); // Skin color modifier is a separate code point. + t.is(g.match('👋🏿').succeeded(), true); + t.truthy(re.exec('👋🏿')); +}); + describe('alt', test => { const m = ohm.grammar('M { altTest = "a" | "b" }'); const s = m.createSemantics().addAttribute('v', {