Skip to content

Commit

Permalink
feat(fe): error on ';' confusables; treat as ';'
Browse files Browse the repository at this point in the history
GREEK QUESTION MARK (U+037E) looks like a semicolon:

      ; GREEK QUESTION MARK
      ; SEMICOLON

Report a more helpful diagnostic when a Greek question mark is
encountered in code. Also, treat the symbol as a proper semicolon
during parsing.
  • Loading branch information
strager committed Mar 3, 2024
1 parent a429162 commit d09997c
Show file tree
Hide file tree
Showing 15 changed files with 128 additions and 15 deletions.
2 changes: 2 additions & 0 deletions docs/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@ Semantic Versioning.
* VS Code: You can now make quick-lint-js messages fun and insulting with the
`quick-lint-js.snarky` setting (disabled by default). (Implemented by
[vegerot][].)
* Using Greek question mark (;, U+037E) instead of a semicolon (;, U+003B) now
reports [E0457][] ("this is a Greek Question Mark, not a semicolon (';')").
* TypeScript: Decorators on abstract classes are now parsed. ([#1194][])

### Fixed
Expand Down
4 changes: 4 additions & 0 deletions po/messages.pot
Original file line number Diff line number Diff line change
Expand Up @@ -717,6 +717,10 @@ msgstr ""
msgid "unknown JSX mode; try \"none\" or \"react\""
msgstr ""

#: src/quick-lint-js/diag/diagnostic-metadata-generated.cpp
msgid "this is a {1}, not a {2} ('{3}')"
msgstr ""

#: src/quick-lint-js/diag/diagnostic-metadata-generated.cpp
msgid "depth limit exceeded"
msgstr ""
Expand Down
2 changes: 2 additions & 0 deletions src/quick-lint-js/diag/diagnostic-formatter.h
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,8 @@ inline void Diagnostic_Formatter<Derived>::format_message(
expanded_parameter = this->expand_argument_singular(args, diagnostic, 1);
} else if (curly_content == u8"2"_sv) {
expanded_parameter = this->expand_argument(args, diagnostic, 2);
} else if (curly_content == u8"3"_sv) {
expanded_parameter = this->expand_argument(args, diagnostic, 3);
} else {
QLJS_ASSERT(false && "invalid message format: unrecognized placeholder");
QLJS_UNREACHABLE();
Expand Down
17 changes: 17 additions & 0 deletions src/quick-lint-js/diag/diagnostic-metadata-generated.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1392,6 +1392,23 @@ const QLJS_CONSTINIT Diagnostic_Info all_diagnostic_infos[] = {
},
},

// Diag_Confusable_Symbol
{
.code = 457,
.severity = Diagnostic_Severity::error,
.message_formats = {
QLJS_TRANSLATABLE("this is a {1}, not a {2} ('{3}')"),
},
.message_args = {
{
Diagnostic_Message_Arg_Info(offsetof(Diag_Confusable_Symbol, confusable), Diagnostic_Arg_Type::source_code_span),
Diagnostic_Message_Arg_Info(offsetof(Diag_Confusable_Symbol, confusable_name), Diagnostic_Arg_Type::string8_view),
Diagnostic_Message_Arg_Info(offsetof(Diag_Confusable_Symbol, symbol_name), Diagnostic_Arg_Type::string8_view),
Diagnostic_Message_Arg_Info(offsetof(Diag_Confusable_Symbol, symbol), Diagnostic_Arg_Type::char8),
},
},
},

// Diag_Depth_Limit_Exceeded
{
.code = 203,
Expand Down
3 changes: 2 additions & 1 deletion src/quick-lint-js/diag/diagnostic-metadata-generated.h
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,7 @@ namespace quick_lint_js {
QLJS_DIAG_TYPE_NAME(Diag_Config_Globals_Type_Mismatch) \
QLJS_DIAG_TYPE_NAME(Diag_Config_JSX_Mode_Type_Mismatch) \
QLJS_DIAG_TYPE_NAME(Diag_Config_JSX_Mode_Unrecognized) \
QLJS_DIAG_TYPE_NAME(Diag_Confusable_Symbol) \
QLJS_DIAG_TYPE_NAME(Diag_Depth_Limit_Exceeded) \
QLJS_DIAG_TYPE_NAME(Diag_Dot_Not_Allowed_After_Generic_Arguments_In_Type) \
QLJS_DIAG_TYPE_NAME(Diag_Dot_Dot_Is_Not_An_Operator) \
Expand Down Expand Up @@ -475,7 +476,7 @@ namespace quick_lint_js {
/* END */
// clang-format on

inline constexpr int Diag_Type_Count = 461;
inline constexpr int Diag_Type_Count = 462;

extern const Diagnostic_Info all_diagnostic_infos[Diag_Type_Count];
}
Expand Down
10 changes: 10 additions & 0 deletions src/quick-lint-js/diag/diagnostic-types-2.h
Original file line number Diff line number Diff line change
Expand Up @@ -746,6 +746,16 @@ struct Diag_Config_JSX_Mode_Unrecognized {
Source_Code_Span value;
};

struct Diag_Confusable_Symbol {
[[qljs::diag("E0457", Diagnostic_Severity::error)]] //
[[qljs::message("this is a {1}, not a {2} ('{3}')", ARG(confusable),
ARG(confusable_name), ARG(symbol_name), ARG(symbol))]] //
Source_Code_Span confusable;
String8_View confusable_name;
Char8 symbol;
String8_View symbol_name;
};

struct Diag_Depth_Limit_Exceeded {
[[qljs::diag("E0203", Diagnostic_Severity::error)]] //
[[qljs::message("depth limit exceeded", ARG(token))]] //
Expand Down
2 changes: 1 addition & 1 deletion src/quick-lint-js/diag/diagnostic.h
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ struct Diagnostic_Message_Arg_Info {
Diagnostic_Arg_Type type : (8 - offset_bits) QLJS_WORK_AROUND_GCC_BUG_105191;
};

using Diagnostic_Message_Args = std::array<Diagnostic_Message_Arg_Info, 3>;
using Diagnostic_Message_Args = std::array<Diagnostic_Message_Arg_Info, 4>;

struct Diagnostic_Info {
std::array<char, 5> code_string() const;
Expand Down
59 changes: 51 additions & 8 deletions src/quick-lint-js/fe/lex.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,20 @@ constexpr char32_t left_double_quote = U'\u201c';
constexpr char32_t right_single_quote = U'\u2019';
constexpr char32_t right_double_quote = U'\u201d';

struct Confusable_Symbol {
char32_t confusable;
Char8 confusable_name[20];
Char8 symbol;
Char8 symbol_name[20];
Token_Type symbol_token_type;
};

Confusable_Symbol confusable_symbols[] = {
{0x037e, u8"Greek Question Mark", u8';', u8"semicolon",
Token_Type::semicolon},
// TODO(strager): Add more.
};

bool look_up_in_unicode_table(const std::uint8_t* table, std::size_t table_size,
char32_t code_point) {
constexpr int bits_per_byte = 8;
Expand Down Expand Up @@ -1817,7 +1831,9 @@ Lexer::Parsed_Identifier Lexer::parse_identifier_slow(
: this->is_identifier_character(code_point, kind);
if (!is_legal_character) {
if (this->is_ascii_character(code_point) ||
this->is_non_ascii_whitespace_character(code_point)) {
this->is_non_ascii_whitespace_character(code_point) ||
// Confusable symbols are handled by parse_non_ascii.
this->is_confusable_symbol_character(code_point)) {
break;
} else {
this->diag_reporter_->report(Diag_Character_Disallowed_In_Identifiers{
Expand Down Expand Up @@ -1850,21 +1866,39 @@ QLJS_WARNING_POP
void Lexer::parse_non_ascii() {
Decode_UTF8_Result character = decode_utf_8(Padded_String_View(
this->input_, this->original_input_.null_terminator()));
// FIXME(strager): We probably need to check character.ok.

if (character.code_point == left_single_quote ||
character.code_point == right_single_quote ||
character.code_point == left_double_quote ||
character.code_point == right_double_quote) {
this->input_ = this->parse_smart_quote_string_literal(character);
this->last_token_.type = Token_Type::string;
this->last_token_.end = this->input_;
} else {
Parsed_Identifier ident = this->parse_identifier_slow(
this->input_, this->input_, Identifier_Kind::javascript);
this->input_ = ident.after;
this->last_token_.normalized_identifier = ident.normalized;
this->last_token_.end = ident.after;
this->last_token_.type = Token_Type::identifier;
return;
}

for (const Confusable_Symbol& confusable : confusable_symbols) {
if (character.code_point == confusable.confusable) {
this->input_ += character.size;
this->last_token_.end = this->input_;
this->last_token_.type = confusable.symbol_token_type;
this->diag_reporter_->report(Diag_Confusable_Symbol{
.confusable = this->last_token_.span(),
.confusable_name = confusable.confusable_name,
.symbol = confusable.symbol,
.symbol_name = confusable.symbol_name,
});
return;
}
}

Parsed_Identifier ident = this->parse_identifier_slow(
this->input_, this->input_, Identifier_Kind::javascript);
this->input_ = ident.after;
this->last_token_.normalized_identifier = ident.normalized;
this->last_token_.end = ident.after;
this->last_token_.type = Token_Type::identifier;
}

QLJS_WARNING_PUSH
Expand Down Expand Up @@ -2319,6 +2353,15 @@ bool Lexer::is_ascii_character(char32_t code_point) {
return code_point < 0x80;
}

bool Lexer::is_confusable_symbol_character(char32_t code_point) {
for (const Confusable_Symbol& confusable : confusable_symbols) {
if (code_point == confusable.confusable) {
return true;
}
}
return false;
}

int Lexer::newline_character_size(const Char8* input) {
if (input[0] == u8'\n' || input[0] == u8'\r') {
return 1;
Expand Down
2 changes: 2 additions & 0 deletions src/quick-lint-js/fe/lex.h
Original file line number Diff line number Diff line change
Expand Up @@ -327,6 +327,8 @@ class Lexer {
static bool is_ascii_character(Char8 code_unit);
static bool is_ascii_character(char32_t code_point);

static bool is_confusable_symbol_character(char32_t code_point);

static int newline_character_size(const Char8*);
static bool is_newline_character(char32_t code_point);

Expand Down
2 changes: 2 additions & 0 deletions src/quick-lint-js/i18n/translation-table-generated.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -532,6 +532,7 @@ const Translation_Table translation_data = {
{46, 25, 63, 57, 39, 51}, //
{0, 0, 0, 0, 0, 52}, //
{0, 0, 0, 0, 0, 27}, //
{0, 0, 0, 0, 0, 33}, //
{0, 0, 0, 61, 0, 61}, //
{50, 25, 0, 70, 0, 78}, //
{33, 21, 74, 25, 44, 21}, //
Expand Down Expand Up @@ -2402,6 +2403,7 @@ const Translation_Table translation_data = {
u8"switch statement is missing '{1}' around condition\0"
u8"switch statement needs parentheses around condition\0"
u8"this case will run instead\0"
u8"this is a {1}, not a {2} ('{3}')\0"
u8"this required parameter appears after the optional parameter\0"
u8"this tuple type is a named tuple type because at least one element has a name\0"
u8"this {0} looks fishy\0"
Expand Down
5 changes: 3 additions & 2 deletions src/quick-lint-js/i18n/translation-table-generated.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,8 @@ namespace quick_lint_js {
using namespace std::literals::string_view_literals;

constexpr std::uint32_t translation_table_locale_count = 5;
constexpr std::uint16_t translation_table_mapping_table_size = 605;
constexpr std::size_t translation_table_string_table_size = 82408;
constexpr std::uint16_t translation_table_mapping_table_size = 606;
constexpr std::size_t translation_table_string_table_size = 82441;
constexpr std::size_t translation_table_locale_table_size = 35;

QLJS_CONSTEVAL std::uint16_t translation_table_const_look_up(
Expand Down Expand Up @@ -546,6 +546,7 @@ QLJS_CONSTEVAL std::uint16_t translation_table_const_look_up(
"switch statement is missing '{1}' around condition"sv,
"switch statement needs parentheses around condition"sv,
"this case will run instead"sv,
"this is a {1}, not a {2} ('{3}')"sv,
"this required parameter appears after the optional parameter"sv,
"this tuple type is a named tuple type because at least one element has a name"sv,
"this {0} looks fishy"sv,
Expand Down
13 changes: 12 additions & 1 deletion src/quick-lint-js/i18n/translation-table-test-generated.h
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ struct Translated_String {
};

// clang-format off
inline const Translated_String test_translation_table[604] = {
inline const Translated_String test_translation_table[605] = {
{
"\"global-groups\" entries must be strings"_translatable,
u8"\"global-groups\" entries must be strings",
Expand Down Expand Up @@ -5748,6 +5748,17 @@ inline const Translated_String test_translation_table[604] = {
u8"this case will run instead",
},
},
{
"this is a {1}, not a {2} ('{3}')"_translatable,
u8"this is a {1}, not a {2} ('{3}')",
{
u8"this is a {1}, not a {2} ('{3}')",
u8"this is a {1}, not a {2} ('{3}')",
u8"this is a {1}, not a {2} ('{3}')",
u8"this is a {1}, not a {2} ('{3}')",
u8"this is a {1}, not a {2} ('{3}')",
},
},
{
"this required parameter appears after the optional parameter"_translatable,
u8"this required parameter appears after the optional parameter",
Expand Down
2 changes: 1 addition & 1 deletion test/quick-lint-js/diag-collector.h
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ struct Diag_Collector : public Diag_Reporter {
explicit Diag(Diag_Type type, const void *data);

Diag_Type type_;
alignas(std::uint64_t) std::uint8_t storage_[48];
alignas(std::uint64_t) std::uint8_t storage_[56];

friend void diag_collector_static_assertions();
friend Diag_Collector;
Expand Down
2 changes: 1 addition & 1 deletion test/quick-lint-js/diagnostic-assertion.h
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,7 @@ struct Diagnostic_Assertion {
};

Diag_Type type = Diag_Type();
Fixed_Vector<Member, 3> members;
Fixed_Vector<Member, 4> members;

// Whether adjusted_for_escaped_characters should be called to compensate for
// control characters.
Expand Down
18 changes: 18 additions & 0 deletions test/test-lex.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1989,6 +1989,24 @@ TEST_F(Test_Lex, lex_symbols_separated_by_whitespace) {
{Token_Type::dot, Token_Type::dot, Token_Type::dot});
}

// https://www.unicode.org/reports/tr39/#Confusable_Detection
TEST_F(Test_Lex, confusable_symbols) {
this->check_tokens_with_errors(
u8"\u037e"_sv,
u8"^^^^^^ Diag_Confusable_Symbol.confusable"_diag
u8"{.confusable_name=Greek Question Mark}"_diag
u8"{.symbol=;}"_diag
u8"{.symbol_name=semicolon}"_diag,
{Token_Type::semicolon});
this->check_tokens_with_errors(
u8"x\u037e"_sv,
u8" ^^^^^^ Diag_Confusable_Symbol.confusable"_diag
u8"{.confusable_name=Greek Question Mark}"_diag
u8"{.symbol=;}"_diag
u8"{.symbol_name=semicolon}"_diag,
{Token_Type::identifier, Token_Type::semicolon});
}

TEST_F(Test_Lex, question_followed_by_number_is_not_question_dot) {
this->check_tokens(u8"?.3"_sv, {Token_Type::question, Token_Type::number});
}
Expand Down

0 comments on commit d09997c

Please sign in to comment.