Skip to content

Commit

Permalink
chore(fe): upgrade to Unicode 15.1
Browse files Browse the repository at this point in the history
This fixes failing Test_Lex_Unicode tests on Arch Linux, whose newer icu4c
package ships newer Unicode tables.
  • Loading branch information
strager committed Dec 20, 2023
1 parent 13740a1 commit 2f5df3f
Show file tree
Hide file tree
Showing 6 changed files with 66 additions and 52 deletions.
1 change: 1 addition & 0 deletions docs/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ Semantic Versioning.

### Added

* Unicode 15.1 is now supported, including new CJK code points in identifiers.
* Missing `break;` (or `return;` or `throw ...;` or `// fallthrough`) after a
clause in `switch` statement now reports [E0427][] ("missing 'break;' or '//
fallthrough' comment between statement and 'case'"). (Implemented by [Yash
Expand Down
74 changes: 39 additions & 35 deletions src/quick-lint-js/fe/lex-unicode-generated.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -468,7 +468,11 @@ const std::uint8_t Lexer::unicode_tables_chunks[] = {
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0xff, 0xff, 0xff, 0xff, 0x01, 0x00, 0x00, 0x00,
0xff, 0xff, 0xff, 0xff, 0x01, 0x00, 0xff, 0xff,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0xff, 0xff, 0xff, 0x3f, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0xff, 0xff, 0xff, 0x3f, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
Expand Down Expand Up @@ -584,7 +588,7 @@ const std::uint8_t Lexer::unicode_tables_chunks[] = {
0xe0, 0x00, 0x00, 0x00, 0xfe, 0xff, 0x3e, 0x1f,
0xfe, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0xff, 0xff, 0x7f, 0xfe, 0xfe, 0xff, 0xff, 0xff,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xf7,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0xff, 0x1f, 0xff, 0xff, 0xff, 0x0f, 0x00, 0x00,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xf0, 0xbf,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
Expand Down Expand Up @@ -614,7 +618,7 @@ const std::uint8_t Lexer::unicode_tables_chunks[] = {
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x1f,
0x00, 0x00, 0xff, 0x03, 0xfe, 0xff, 0xff, 0x87,
0xfe, 0xff, 0xff, 0x07, 0xc0, 0xff, 0xff, 0xff,
0xfe, 0xff, 0xff, 0x07, 0xe0, 0xff, 0xff, 0xff,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x7f,
0xfc, 0xfc, 0xfc, 0x1c, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
Expand Down Expand Up @@ -877,25 +881,25 @@ const Lexer::Unicode_Table_Chunk_Index_Type Lexer::identifier_start_chunk_indexe
0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
0x01, 0x01, 0x01, 0x71, 0x1f, 0x1f, 0x1f, 0x1f,
0x01, 0x01, 0x01, 0x71, 0x01, 0x01, 0x72, 0x1f,
0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f,
0x01, 0x01, 0x72, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f,
0x01, 0x01, 0x73, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f,
0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
0x01, 0x01, 0x01, 0x73, 0x01, 0x01, 0x01, 0x01,
0x01, 0x01, 0x01, 0x74, 0x01, 0x01, 0x01, 0x01,
0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
0x01, 0x01, 0x01, 0x74,
0x01, 0x01, 0x01, 0x75,
};
static_assert(Lexer::identifier_start_chunk_indexes_size == sizeof(Lexer::identifier_start_chunk_indexes));

const Lexer::Unicode_Table_Chunk_Index_Type Lexer::identifier_part_chunk_indexes[] = {
0x75, 0x01, 0x02, 0x76, 0x77, 0x78, 0x79, 0x7a,
0x7b, 0x7c, 0x7d, 0x7e, 0x7f, 0x80, 0x81, 0x82,
0x83, 0x01, 0x11, 0x84, 0x13, 0x01, 0x14, 0x85,
0x86, 0x87, 0x88, 0x89, 0x8a, 0x01, 0x01, 0x1c,
0x8b, 0x1e, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f,
0x1f, 0x1f, 0x1f, 0x1f, 0x8c, 0x8d, 0x1f, 0x1f,
0x8e, 0x23, 0x1f, 0x1f, 0x01, 0x01, 0x01, 0x01,
0x76, 0x01, 0x02, 0x77, 0x78, 0x79, 0x7a, 0x7b,
0x7c, 0x7d, 0x7e, 0x7f, 0x80, 0x81, 0x82, 0x83,
0x84, 0x01, 0x11, 0x85, 0x13, 0x01, 0x14, 0x86,
0x87, 0x88, 0x89, 0x8a, 0x8b, 0x01, 0x01, 0x1c,
0x8c, 0x1e, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f,
0x1f, 0x1f, 0x1f, 0x1f, 0x8d, 0x8e, 0x1f, 0x1f,
0x8f, 0x23, 0x1f, 0x1f, 0x01, 0x01, 0x01, 0x01,
0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
0x01, 0x01, 0x01, 0x01, 0x01, 0x1b, 0x01, 0x01,
Expand All @@ -909,8 +913,8 @@ const Lexer::Unicode_Table_Chunk_Index_Type Lexer::identifier_part_chunk_indexes
0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
0x01, 0x01, 0x01, 0x01, 0x24, 0x01, 0x8f, 0x26,
0x90, 0x91, 0x92, 0x93, 0x01, 0x01, 0x01, 0x01,
0x01, 0x01, 0x01, 0x01, 0x24, 0x01, 0x90, 0x26,
0x91, 0x92, 0x93, 0x94, 0x01, 0x01, 0x01, 0x01,
0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
Expand All @@ -920,21 +924,21 @@ const Lexer::Unicode_Table_Chunk_Index_Type Lexer::identifier_part_chunk_indexes
0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f,
0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f,
0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f,
0x1f, 0x01, 0x2c, 0x94, 0x01, 0x2e, 0x95, 0x96,
0x31, 0x97, 0x98, 0x99, 0x9a, 0x36, 0x01, 0x37,
0x38, 0x39, 0x9b, 0x3b, 0x3c, 0x9c, 0x9d, 0x9e,
0x9f, 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6,
0xa7, 0xa8, 0xa9, 0x1f, 0xaa, 0xab, 0xac, 0xad,
0x1f, 0x01, 0x2c, 0x95, 0x01, 0x2e, 0x96, 0x97,
0x31, 0x98, 0x99, 0x9a, 0x9b, 0x36, 0x01, 0x37,
0x38, 0x39, 0x9c, 0x3b, 0x3c, 0x9d, 0x9e, 0x9f,
0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7,
0xa8, 0xa9, 0xaa, 0x1f, 0xab, 0xac, 0xad, 0xae,
0x01, 0x01, 0x01, 0x4f, 0x50, 0x51, 0x1f, 0x1f,
0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x52,
0x01, 0x01, 0x01, 0x01, 0xae, 0x1f, 0x1f, 0x1f,
0x01, 0x01, 0x01, 0x01, 0xaf, 0x1f, 0x1f, 0x1f,
0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f,
0x1f, 0x1f, 0x1f, 0x1f, 0x01, 0x01, 0x54, 0x1f,
0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f,
0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f,
0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f,
0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f,
0x01, 0x01, 0xaf, 0xb0, 0x1f, 0x1f, 0x57, 0xb1,
0x01, 0x01, 0xb0, 0xb1, 0x1f, 0x1f, 0x57, 0xb2,
0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x59,
Expand All @@ -944,15 +948,15 @@ const Lexer::Unicode_Table_Chunk_Index_Type Lexer::identifier_part_chunk_indexes
0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f,
0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x5c,
0x01, 0x5d, 0x5e, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f,
0x1f, 0x1f, 0x1f, 0x1f, 0xb2, 0x1f, 0x1f, 0x1f,
0x1f, 0x1f, 0x1f, 0x1f, 0xb3, 0x1f, 0x1f, 0x1f,
0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f,
0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0xb3,
0x1f, 0xb4, 0xb5, 0x1f, 0x60, 0x61, 0x62, 0xb6,
0x1f, 0x1f, 0xb7, 0x1f, 0x1f, 0x1f, 0x1f, 0x64,
0xb8, 0xb9, 0xba, 0x1f, 0xbb, 0x1f, 0x1f, 0x69,
0xbc, 0xbd, 0x1f, 0x1f, 0x1f, 0x1f, 0x6c, 0x1f,
0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0xb4,
0x1f, 0xb5, 0xb6, 0x1f, 0x60, 0x61, 0x62, 0xb7,
0x1f, 0x1f, 0xb8, 0x1f, 0x1f, 0x1f, 0x1f, 0x64,
0xb9, 0xba, 0xbb, 0x1f, 0xbc, 0x1f, 0x1f, 0x69,
0xbd, 0xbe, 0x1f, 0x1f, 0x1f, 0x1f, 0x6c, 0x1f,
0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f,
0x1f, 0x1f, 0x1f, 0xbe, 0x1f, 0x1f, 0x1f, 0x1f,
0x1f, 0x1f, 0x1f, 0xbf, 0x1f, 0x1f, 0x1f, 0x1f,
0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
Expand Down Expand Up @@ -982,14 +986,14 @@ const Lexer::Unicode_Table_Chunk_Index_Type Lexer::identifier_part_chunk_indexes
0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
0x01, 0x01, 0x01, 0x71, 0x1f, 0x1f, 0x1f, 0x1f,
0x01, 0x01, 0x01, 0x71, 0x01, 0x01, 0x72, 0x1f,
0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f,
0x01, 0x01, 0x72, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f,
0x01, 0x01, 0x73, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f,
0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
0x01, 0x01, 0x01, 0x73, 0x01, 0x01, 0x01, 0x01,
0x01, 0x01, 0x01, 0x74, 0x01, 0x01, 0x01, 0x01,
0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
0x01, 0x01, 0x01, 0x74, 0x1f, 0x1f, 0x1f, 0x1f,
0x01, 0x01, 0x01, 0x75, 0x1f, 0x1f, 0x1f, 0x1f,
0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f,
0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f,
0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f,
Expand Down Expand Up @@ -1337,7 +1341,7 @@ const Lexer::Unicode_Table_Chunk_Index_Type Lexer::identifier_part_chunk_indexes
0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f,
0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f,
0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f,
0x1f, 0xbf,
0x1f, 0xc0,
};
static_assert(Lexer::identifier_part_chunk_indexes_size == sizeof(Lexer::identifier_part_chunk_indexes));
}
Expand Down
11 changes: 7 additions & 4 deletions test/test-lex-unicode.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,11 +24,14 @@ std::string pretty(char32_t c) {
}

bool icu_data_is_valid() {
std::uint8_t minimum_unicode_version = 15;
std::uint8_t minimum_unicode_major_version = 15;
std::uint8_t minimum_unicode_minor_version = 1;

UVersionInfo version;
::u_getUnicodeVersion(version);
if (version[0] >= minimum_unicode_version) {
if (version[0] > minimum_unicode_major_version ||
(version[0] == minimum_unicode_major_version &&
version[1] >= minimum_unicode_minor_version)) {
return true;
}

Expand All @@ -37,9 +40,9 @@ bool icu_data_is_valid() {
std::fprintf(stderr,
"warning: The ICU library has data for Unicode version "
"%u.%u.%u.%u, which is too old. Upgrade ICU to Unicode "
"version %u or newer. Skipping tests...\n",
"version %u.%u or newer. Skipping tests...\n",
version[0], version[1], version[2], version[3],
minimum_unicode_version);
minimum_unicode_major_version, minimum_unicode_minor_version);
did_log_warning = true;
}
return false;
Expand Down
18 changes: 9 additions & 9 deletions tools/generate-lex-unicode/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -169,34 +169,34 @@ function isIDContinue(codePoint) {
}

let ID_START_CODE_POINTS = new Set(
require("@unicode/unicode-15.0.0/Binary_Property/ID_Start/code-points.js")
require("@unicode/unicode-15.1.0/Binary_Property/ID_Start/code-points.js")
);
let ID_CONTINUE_CODE_POINTS = new Set(
require("@unicode/unicode-15.0.0/Binary_Property/ID_Continue/code-points.js")
require("@unicode/unicode-15.1.0/Binary_Property/ID_Continue/code-points.js")
);

// Pattern_White_Space
// https://www.unicode.org/Public/11.0.0/ucd/PropList.txt
// https://www.unicode.org/Public/15.1.0/ucd/PropList.txt
let PATTERN_WHITE_SPACE_CODE_POINTS = new Set([
0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x0020, 0x0085, 0x200e, 0x200f,
0x2028, 0x2029,
]);

// Other_ID_Start
// https://www.unicode.org/Public/11.0.0/ucd/PropList.txt
// https://www.unicode.org/Public/15.1.0/ucd/PropList.txt
let OTHER_ID_START_CODE_POINTS = new Set([
0x1885, 0x1886, 0x2118, 0x212e, 0x309b, 0x309c,
]);

// Other_ID_Continue
// https://www.unicode.org/Public/11.0.0/ucd/PropList.txt
// https://www.unicode.org/Public/15.1.0/ucd/PropList.txt
let OTHER_ID_CONTINUE_CODE_POINTS = new Set([
0x00b7, 0x0387, 0x1369, 0x136a, 0x136b, 0x136c, 0x136d, 0x136e, 0x136f,
0x1370, 0x1371, 0x19da,
0x1370, 0x1371, 0x19da, 0x200c, 0x200d, 0x30fb, 0xff65,
]);

// Pattern_Syntax
// https://www.unicode.org/Public/11.0.0/ucd/PropList.txt
// https://www.unicode.org/Public/15.1.0/ucd/PropList.txt
let PATTERN_SYNTAX_CODE_POINTS = new Set([
0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027, 0x0028, 0x0029,
0x002a, 0x002b, 0x002c, 0x002d, 0x002e, 0x002f, 0x003a, 0x003b, 0x003c,
Expand Down Expand Up @@ -537,6 +537,8 @@ function testIsIDStart() {
assert.ok(isIDStart(0x309b)); // KATAKANA-HIRAGANA VOICED SOUND MARK
assert.ok(isIDStart(0x309c)); // KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK

assert.ok(isIDStart(0x2ee5d));

// Pattern_Syntax (disallowed)
for (let codePoint of PATTERN_SYNTAX_CODE_POINTS) {
assert.ok(!isIDStart(codePoint), codePoint.toString(16));
Expand Down Expand Up @@ -618,8 +620,6 @@ function testIsIDContinue() {
assert.ok(!isIDContinue(0x20a0)); // EURO-CURRENCY SIGN (Sc)
assert.ok(!isIDContinue(0x0024)); // DOLLAR SIGN (Sc)
assert.ok(!isIDContinue(0x20dd)); // COMBINING ENCLOSING CIRCLE (Me)
assert.ok(!isIDContinue(0x200c)); // ZERO WIDTH NON-JOINER (Cf)
assert.ok(!isIDContinue(0x200d)); // ZERO WIDTH JOINER (Cf)
assert.ok(!isIDContinue(0x202c)); // POP DIRECTIONAL FORMATTING (Cf)
}

Expand Down
3 changes: 2 additions & 1 deletion tools/generate-lex-unicode/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,8 @@
"fmt": "prettier --write '*.js' '*.json'"
},
"dependencies": {
"@unicode/unicode-15.0.0": "*"
"@unicode/unicode-15.0.0": "*",
"@unicode/unicode-15.1.0": "*"
},
"devDependencies": {
"prettier": "^2.8.4"
Expand Down
11 changes: 8 additions & 3 deletions tools/generate-lex-unicode/yarn.lock
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,14 @@


"@unicode/unicode-15.0.0@*":
version "1.3.1"
resolved "https://registry.yarnpkg.com/@unicode/unicode-15.0.0/-/unicode-15.0.0-1.3.1.tgz#6d2d9b4ac73a5227122ede626a7977556decf81d"
integrity sha512-zxm5Cx0v9vGxFOM8tVuArWHxxJTk+stiLA+ZHKt2mJO3HHmM6uN8OFcDGuvcix3MqguQ75am0XvpUgEz4P4vFw==
version "1.5.2"
resolved "https://registry.yarnpkg.com/@unicode/unicode-15.0.0/-/unicode-15.0.0-1.5.2.tgz#5350ad022050ca5e165f8352d9f971d418c99deb"
integrity sha512-PepMvMxf9j4sp4bZn7W9JJoMxynHk66ZPCsx6n3v47T5vmM+qfIy0z1MMU+EDmRZr2cvs1aT9ZwUEMRPVXR23g==

"@unicode/unicode-15.1.0@*":
version "1.5.2"
resolved "https://registry.yarnpkg.com/@unicode/unicode-15.1.0/-/unicode-15.1.0-1.5.2.tgz#0358d05ab99a7d05c12c68cc2123cf124c34beb9"
integrity sha512-7PAgnShDr8ziK6XeHB/TUVFboDFEhaQKKyrw55/Kx9o6AQDy1s7dJ9KRpRerW9nrR5qMGUQvOqTXOAek6ZIXkg==

prettier@^2.8.4:
version "2.8.4"
Expand Down

0 comments on commit 2f5df3f

Please sign in to comment.