Skip to content

Commit

Permalink
feat(fe): error on more confusables like ǃ and ﴾
Browse files Browse the repository at this point in the history
Also remove some assumptions that certain symbols in expressions are
only one byte.
  • Loading branch information
strager committed Mar 3, 2024
1 parent 2990b04 commit 374801c
Show file tree
Hide file tree
Showing 5 changed files with 206 additions and 61 deletions.
6 changes: 4 additions & 2 deletions docs/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,10 @@ Semantic Versioning.
* VS Code: You can now make quick-lint-js messages fun and insulting with the
`quick-lint-js.snarky` setting (disabled by default). (Implemented by
[vegerot][].)
* Using Greek question mark (;, U+037E) instead of a semicolon (;, U+003B) now
reports [E0457][] ("this is a Greek Question Mark, not a semicolon (';')").
* Using Greek question mark (`;`, U+037E) instead of a semicolon (`;`, U+003B)
now reports [E0457][] ("this is a Greek Question Mark, not a semicolon
(';')"). This diagnostic is also reported for similar-looking characters like
`ǃ` (which should be `!`) and `﴾` (which should be `(`).
* TypeScript: Decorators on abstract classes are now parsed. ([#1194][])

### Fixed
Expand Down
85 changes: 34 additions & 51 deletions src/quick-lint-js/fe/expression.h
Original file line number Diff line number Diff line change
Expand Up @@ -615,23 +615,19 @@ class Expression::Call final : public Expression {
Source_Code_Span left_paren_span, const Char8 *span_end,
std::optional<Source_Code_Span> optional_chaining_operator)
: Expression(kind),
call_left_paren_begin_(left_paren_span.begin()),
call_left_paren_(left_paren_span),
span_end_(span_end),
children_(children),
optional_chaining_operator_begin_(
optional_chaining_operator.has_value()
? optional_chaining_operator->begin()
: nullptr) {
QLJS_ASSERT(left_paren_span.size() == 1);
if (optional_chaining_operator.has_value()) {
QLJS_ASSERT(optional_chaining_operator->size() == 2);
}
}

Source_Code_Span left_paren_span() const {
return Source_Code_Span(this->call_left_paren_begin_,
this->call_left_paren_begin_ + 1);
}
Source_Code_Span left_paren_span() const { return this->call_left_paren_; }

std::optional<Source_Code_Span> optional_chaining_operator_span() const {
if (this->optional_chaining_operator_begin_ == nullptr) {
Expand All @@ -641,7 +637,7 @@ class Expression::Call final : public Expression {
this->optional_chaining_operator_begin_ + 2);
}

const Char8 *call_left_paren_begin_;
Source_Code_Span call_left_paren_;
const Char8 *span_end_;
Expression_Arena::Array_Ptr<Expression *> children_;
const Char8 *optional_chaining_operator_begin_ = nullptr;
Expand Down Expand Up @@ -857,15 +853,11 @@ class Expression::Non_Null_Assertion final : public Expression {
static constexpr Expression_Kind kind = Expression_Kind::Non_Null_Assertion;

explicit Non_Null_Assertion(Expression *child, Source_Code_Span bang_span)
: Expression(kind), bang_end_(bang_span.end()), child_(child) {
QLJS_ASSERT(same_pointers(this->bang_span(), bang_span));
}
: Expression(kind), bang_(bang_span), child_(child) {}

Source_Code_Span bang_span() const {
return Source_Code_Span(this->bang_end_ - 1, this->bang_end_);
}
Source_Code_Span bang_span() const { return this->bang_; }

const Char8 *bang_end_;
Source_Code_Span bang_;
Expression *child_;
};
static_assert(Expression_Arena::is_allocatable<Expression::Non_Null_Assertion>);
Expand Down Expand Up @@ -895,16 +887,12 @@ class Expression::Optional final : public Expression {
static constexpr Expression_Kind kind = Expression_Kind::Optional;

explicit Optional(Expression *child, Source_Code_Span question_span)
: Expression(kind), child_(child), question_end_(question_span.end()) {
QLJS_ASSERT(question_span.end() - question_span.begin() == 1);
}
: Expression(kind), child_(child), question_(question_span) {}

Source_Code_Span question_span() const {
return Source_Code_Span(this->question_end_ - 1, this->question_end_);
}
Source_Code_Span question_span() const { return this->question_; }

Expression *child_;
const Char8 *question_end_;
Source_Code_Span question_;
};
static_assert(Expression_Arena::is_allocatable<Expression::Optional>);

Expand All @@ -924,25 +912,26 @@ class Expression::Paren_Empty final : public Expression {
public:
static constexpr Expression_Kind kind = Expression_Kind::Paren_Empty;

explicit Paren_Empty(Source_Code_Span span) : Expression(kind), span_(span) {}
explicit Paren_Empty(Source_Code_Span left_paren_span,
Source_Code_Span right_paren_span)
: Expression(kind),
left_paren_(left_paren_span),
right_paren_(right_paren_span) {}

Source_Code_Span left_paren_span() const {
return Source_Code_Span(this->span_.begin(), this->span_.begin() + 1);
}
Source_Code_Span left_paren_span() const { return this->left_paren_; }

Source_Code_Span right_paren_span() const {
return Source_Code_Span(this->span_.end() - 1, this->span_.end());
}
Source_Code_Span right_paren_span() const { return this->right_paren_; }

void report_missing_expression_error(Diag_Reporter *reporter) {
reporter->report(Diag_Missing_Expression_Between_Parentheses{
.left_paren_to_right_paren = this->span_,
.left_paren_to_right_paren = this->span(),
.left_paren = this->left_paren_span(),
.right_paren = this->right_paren_span(),
});
}

Source_Code_Span span_;
Source_Code_Span left_paren_;
Source_Code_Span right_paren_;
};
static_assert(Expression_Arena::is_allocatable<Expression::Paren_Empty>);

Expand Down Expand Up @@ -1072,16 +1061,12 @@ class Expression::Trailing_Comma final : public Expression {

explicit Trailing_Comma(Expression_Arena::Array_Ptr<Expression *> children,
Source_Code_Span comma_span)
: Expression(kind), children_(children), comma_end_(comma_span.end()) {
QLJS_ASSERT(comma_span.end() == comma_span.begin() + 1);
}
: Expression(kind), children_(children), comma_(comma_span) {}

Source_Code_Span comma_span() const {
return Source_Code_Span(this->comma_end_ - 1, this->comma_end_);
}
Source_Code_Span comma_span() const { return this->comma_; }

Expression_Arena::Array_Ptr<Expression *> children_;
const Char8 *comma_end_;
Source_Code_Span comma_;
};

class Expression::Type_Annotated final : public Expression {
Expand All @@ -1093,23 +1078,18 @@ class Expression::Type_Annotated final : public Expression {
const Char8 *span_end)
: Expression(kind),
child_(child),
colon_(colon_span.begin()),
colon_(colon_span),
type_visits_(std::move(type_visits)),
span_end_(span_end) {
QLJS_ASSERT(*colon_span.begin() == u8':');
QLJS_ASSERT(colon_span.size() == 1);
}
span_end_(span_end) {}

Source_Code_Span colon_span() const {
return Source_Code_Span(this->colon_, this->colon_ + 1);
}
Source_Code_Span colon_span() const { return this->colon_; }

void visit_type_annotation(Parse_Visitor_Base &v) {
std::move(this->type_visits_).move_into(v);
}

Expression *child_;
const Char8 *colon_;
Source_Code_Span colon_;
Buffering_Visitor type_visits_{nullptr};
const Char8 *span_end_;
};
Expand Down Expand Up @@ -1451,19 +1431,22 @@ inline Source_Code_Span Expression::span() const {
case Expression_Kind::Non_Null_Assertion: {
auto *assertion = expression_cast<const Non_Null_Assertion *>(this);
return Source_Code_Span(assertion->child_->span().begin(),
assertion->bang_end_);
assertion->bang_.end());
}
case Expression_Kind::Object:
return expression_cast<const Object *>(this)->span_;
case Expression_Kind::Optional: {
auto *optional = expression_cast<const Expression::Optional *>(this);
return Source_Code_Span(optional->child_->span().begin(),
optional->question_end_);
optional->question_.end());
}
case Expression_Kind::Paren:
return expression_cast<const Paren *>(this)->span_;
case Expression_Kind::Paren_Empty:
return expression_cast<const Paren_Empty *>(this)->span_;
case Expression_Kind::Paren_Empty: {
auto *paren_empty = expression_cast<const Paren_Empty *>(this);
return Source_Code_Span(paren_empty->left_paren_.begin(),
paren_empty->right_paren_.end());
}
case Expression_Kind::Private_Variable:
return expression_cast<const Private_Variable *>(this)
->variable_identifier_.span();
Expand All @@ -1489,7 +1472,7 @@ inline Source_Code_Span Expression::span() const {
case Expression_Kind::Trailing_Comma: {
auto *comma = expression_cast<const Trailing_Comma *>(this);
return Source_Code_Span(comma->children_.front()->span().begin(),
comma->comma_end_);
comma->comma_.end());
}
case Expression_Kind::Type_Annotated: {
auto *annotated = expression_cast<const Type_Annotated *>(this);
Expand Down
107 changes: 102 additions & 5 deletions src/quick-lint-js/fe/lex.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -93,16 +93,113 @@ constexpr char32_t right_double_quote = U'\u201d';

// One row of the confusable-symbol table: a non-ASCII code point that looks
// like an ASCII punctuation character, plus the ASCII character it resembles.
struct Confusable_Symbol {
  // The look-alike Unicode code point (e.g. 0x037e).
  char32_t confusable;
  // Name of the look-alike code point. The longest name in the table
  // ("Heavy Right-Pointing Angle Quotation Mark Ornament") is 50 characters,
  // so 51 leaves room for the null terminator.
  Char8 confusable_name[51];
  // The single ASCII character the code point is confused with (e.g. ';').
  Char8 symbol;
  // Human-readable name of the ASCII character. The longest name in the table
  // ("right square bracket") is 20 characters, so 21 leaves room for the null
  // terminator.
  Char8 symbol_name[21];
  // Token type corresponding to the ASCII character.
  Token_Type symbol_token_type;
};

// Table of non-ASCII code points which look like ASCII punctuation. Each row
// gives the look-alike code point and its name, followed by the ASCII symbol it
// resembles, that symbol's name, and that symbol's token type.
//
// Within a row, the ASCII symbol, the symbol name, and the token type must all
// describe the same character.
Confusable_Symbol confusable_symbols[] = {
    // clang-format off
    { 0x037e, u8"Greek Question Mark", u8';', u8"semicolon", Token_Type::semicolon},

    { 0x02d0, u8"Modifier Letter Triangular Colon", u8':', u8"colon", Token_Type::colon},
    { 0x02f8, u8"Modifier Letter Raised Colon", u8':', u8"colon", Token_Type::colon},
    { 0x0589, u8"Armenian Full Stop", u8':', u8"colon", Token_Type::colon},
    { 0x05c3, u8"Hebrew Punctuation Sof Pasuq", u8':', u8"colon", Token_Type::colon},
    { 0x0703, u8"Syriac Supralinear Colon", u8':', u8"colon", Token_Type::colon},
    { 0x0704, u8"Syriac Sublinear Colon", u8':', u8"colon", Token_Type::colon},
    { 0x0903, u8"Devanagari Sign Visarga", u8':', u8"colon", Token_Type::colon},
    { 0x0a83, u8"Gujarati Sign Visarga", u8':', u8"colon", Token_Type::colon},
    { 0x16ec, u8"Runic Multiple Punctuation", u8':', u8"colon", Token_Type::colon},
    { 0x1803, u8"Mongolian Full Stop", u8':', u8"colon", Token_Type::colon},
    { 0x1809, u8"Mongolian Manchu Full Stop", u8':', u8"colon", Token_Type::colon},
    { 0x205a, u8"Two Dot Punctuation", u8':', u8"colon", Token_Type::colon},
    { 0x2236, u8"Ratio", u8':', u8"colon", Token_Type::colon},
    { 0xa4fd, u8"Lisu Letter Tone Mya Jeu", u8':', u8"colon", Token_Type::colon},
    { 0xa789, u8"Modifier Letter Colon", u8':', u8"colon", Token_Type::colon},
    { 0xfe30, u8"Presentation Form For Vertical Two Dot Leader", u8':', u8"colon", Token_Type::colon},
    { 0xff1a, u8"Fullwidth Colon", u8':', u8"colon", Token_Type::colon},

    { 0x00b8, u8"Cedilla", u8',', u8"comma", Token_Type::comma},
    { 0x060d, u8"Arabic Date Separator", u8',', u8"comma", Token_Type::comma},
    { 0x066b, u8"Arabic Decimal Separator", u8',', u8"comma", Token_Type::comma},
    { 0x201a, u8"Single Low-9 Quotation Mark", u8',', u8"comma", Token_Type::comma},
    { 0xa4f9, u8"Lisu Letter Tone Na Po", u8',', u8"comma", Token_Type::comma},

    { 0x01c3, u8"Latin Letter Retroflex Click", u8'!', u8"exclamation mark", Token_Type::bang},
    { 0x2d51, u8"Tifinagh Letter Tuareg Yang", u8'!', u8"exclamation mark", Token_Type::bang},
    { 0xff01, u8"Fullwidth Exclamation Mark", u8'!', u8"exclamation mark", Token_Type::bang},

    // TODO(strager): Also match symbols like "․․․".
    { 0x0660, u8"Arabic-Indic Digit Zero", u8'.', u8"dot", Token_Type::dot},
    { 0x06f0, u8"Extended Arabic-Indic Digit Zero", u8'.', u8"dot", Token_Type::dot},
    { 0x0701, u8"Syriac Supralinear Full Stop", u8'.', u8"dot", Token_Type::dot},
    { 0x0702, u8"Syriac Sublinear Full Stop", u8'.', u8"dot", Token_Type::dot},
    { 0x2024, u8"One Dot Leader", u8'.', u8"dot", Token_Type::dot},
    { 0xa4f8, u8"Lisu Letter Tone Mya Ti", u8'.', u8"dot", Token_Type::dot},
    { 0xa60e, u8"Vai Full Stop", u8'.', u8"dot", Token_Type::dot},
    {0x10a50, u8"Kharoshthi Punctuation Dot", u8'.', u8"dot", Token_Type::dot},
    {0x1d16d, u8"Musical Symbol Combining Augmentation Dot", u8'.', u8"dot", Token_Type::dot},

    // NOTE(strager): We diverge from Unicode here. Unicode considers a few of these as parentheses.
    // The ASCII symbol for these rows is '[' / ']' so that it agrees with the
    // symbol name and token type in the same row.
    { 0x2772, u8"Light Left Tortoise Shell Bracket Ornament", u8'[', u8"left square bracket", Token_Type::left_square},
    { 0x2773, u8"Light Right Tortoise Shell Bracket Ornament", u8']', u8"right square bracket", Token_Type::right_square},
    { 0x3014, u8"Left Tortoise Shell Bracket", u8'[', u8"left square bracket", Token_Type::left_square},
    { 0x3015, u8"Right Tortoise Shell Bracket", u8']', u8"right square bracket", Token_Type::right_square},
    { 0xff3b, u8"Fullwidth Left Square Bracket", u8'[', u8"left square bracket", Token_Type::left_square},
    { 0xff3d, u8"Fullwidth Right Square Bracket", u8']', u8"right square bracket", Token_Type::right_square},

    { 0x2768, u8"Medium Left Parenthesis Ornament", u8'(', u8"left parenthesis", Token_Type::left_paren},
    { 0x2769, u8"Medium Right Parenthesis Ornament", u8')', u8"right parenthesis", Token_Type::right_paren},
    { 0xfd3e, u8"Ornate Left Parenthesis", u8'(', u8"left parenthesis", Token_Type::left_paren},
    { 0xfd3f, u8"Ornate Right Parenthesis", u8')', u8"right parenthesis", Token_Type::right_paren},

    { 0x2774, u8"Medium Left Curly Bracket Ornament", u8'{', u8"left curly bracket", Token_Type::left_curly},
    { 0x2775, u8"Medium Right Curly Bracket Ornament", u8'}', u8"right curly bracket", Token_Type::right_curly},
    {0x1d114, u8"Musical Symbol Brace", u8'{', u8"left curly bracket", Token_Type::left_curly},

    // TODO(strager): Also match symbols like "ꝸ=" and "᐀᐀".
    // NOTE(strager): 0x0294 is legal in identifiers.
    { 0x0294, u8"Latin Letter Glottal Stop", u8'?', u8"question mark", Token_Type::question},
    { 0x0241, u8"Latin Capital Letter Glottal Stop", u8'?', u8"question mark", Token_Type::question},
    { 0x097d, u8"Devanagari Letter Glottal Stop", u8'?', u8"question mark", Token_Type::question},
    { 0x13ae, u8"Cherokee Letter He", u8'?', u8"question mark", Token_Type::question},
    { 0xa6eb, u8"Bamum Letter Ntuu", u8'?', u8"question mark", Token_Type::question},

    { 0xa778, u8"Latin Small Letter Um", u8'&', u8"ampersand", Token_Type::ampersand},

    { 0x066d, u8"Arabic Five Pointed Star", u8'*', u8"asterisk", Token_Type::star},
    { 0x204e, u8"Low Asterisk", u8'*', u8"asterisk", Token_Type::star},
    { 0x2217, u8"Asterisk Operator", u8'*', u8"asterisk", Token_Type::star},
    {0x1031f, u8"Old Italic Letter Ess", u8'*', u8"asterisk", Token_Type::star},

    { 0x02c4, u8"Modifier Letter Up Arrowhead", u8'^', u8"circumflex", Token_Type::circumflex},
    { 0x02c6, u8"Modifier Letter Circumflex Accent", u8'^', u8"circumflex", Token_Type::circumflex},

    { 0x02c2, u8"Modifier Letter Left Arrowhead", u8'<', u8"less than", Token_Type::less},
    { 0x1438, u8"Canadian Syllabics Pa", u8'<', u8"less than", Token_Type::less},
    { 0x16b2, u8"Runic Letter Kauna", u8'<', u8"less than", Token_Type::less},
    { 0x2039, u8"Single Left-Pointing Angle Quotation Mark", u8'<', u8"less than", Token_Type::less},
    { 0x276e, u8"Heavy Left-Pointing Angle Quotation Mark Ornament", u8'<', u8"less than", Token_Type::less},
    {0x1d236, u8"Greek Instrumental Notation Symbol-40", u8'<', u8"less than", Token_Type::less},

    { 0x02c3, u8"Modifier Letter Right Arrowhead", u8'>', u8"greater than", Token_Type::greater},
    { 0x1433, u8"Canadian Syllabics Po", u8'>', u8"greater than", Token_Type::greater},
    { 0x203a, u8"Single Right-Pointing Angle Quotation Mark", u8'>', u8"greater than", Token_Type::greater},
    { 0x276f, u8"Heavy Right-Pointing Angle Quotation Mark Ornament", u8'>', u8"greater than", Token_Type::greater},
    {0x16f3f, u8"Miao Letter Archaic Zza", u8'>', u8"greater than", Token_Type::greater},
    {0x1d237, u8"Greek Instrumental Notation Symbol-42", u8'>', u8"greater than", Token_Type::greater},

    { 0x02dc, u8"Small Tilde", u8'~', u8"tilde", Token_Type::tilde},
    { 0x1fc0, u8"Greek Perispomeni", u8'~', u8"tilde", Token_Type::tilde},
    { 0x2053, u8"Swung Dash", u8'~', u8"tilde", Token_Type::tilde},
    { 0x223c, u8"Tilde Operator", u8'~', u8"tilde", Token_Type::tilde},

    { 0x1400, u8"Canadian Syllabics Hyphen", u8'=', u8"equals", Token_Type::equal},
    { 0x2e40, u8"Double Hyphen", u8'=', u8"equals", Token_Type::equal},
    { 0x30a0, u8"Katakana-Hiragana Double Hyphen", u8'=', u8"equals", Token_Type::equal},
    { 0xa4ff, u8"Lisu Punctuation Full Stop", u8'=', u8"equals", Token_Type::equal},
    // clang-format on
};

bool look_up_in_unicode_table(const std::uint8_t* table, std::size_t table_size,
Expand Down
6 changes: 3 additions & 3 deletions src/quick-lint-js/fe/parse-expression.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -563,8 +563,8 @@ Expression* Parser::parse_primary_expression(Parse_Visitor_Base& v,
// () => {}
Source_Code_Span right_paren_span = this->peek().span();
this->skip();
return this->make_expression<Expression::Paren_Empty>(
Source_Code_Span(left_paren_span.begin(), right_paren_span.end()));
return this->make_expression<Expression::Paren_Empty>(left_paren_span,
right_paren_span);
}

// (x) => {}
Expand Down Expand Up @@ -2581,7 +2581,7 @@ Expression* Parser::parse_arrow_function_expression_remainder(
paren_empty->report_missing_expression_error(this->diag_reporter_);
} else {
// () => {}
parameter_list_begin = paren_empty->span_.begin();
parameter_list_begin = paren_empty->left_paren_.begin();
}
break;
}
Expand Down
Loading

0 comments on commit 374801c

Please sign in to comment.