From 9a5e1a0467055d0bdcc3bb1a9e3288468e546eea Mon Sep 17 00:00:00 2001 From: tyfkda Date: Sun, 12 Nov 2023 09:32:49 +0900 Subject: [PATCH] wchar literal #134 --- src/cc/frontend/ast.h | 1 + src/cc/frontend/lexer.c | 97 +++++++++++++++++++++++++---------- src/cc/frontend/parser_expr.c | 3 ++ src/cpp/cpp.c | 3 ++ src/cpp/pp_parser.c | 3 +- tests/valtest.c | 3 ++ 6 files changed, 82 insertions(+), 28 deletions(-) diff --git a/src/cc/frontend/ast.h b/src/cc/frontend/ast.h index 9bfe7a616..1fe700a09 100644 --- a/src/cc/frontend/ast.h +++ b/src/cc/frontend/ast.h @@ -68,6 +68,7 @@ enum TokenKind { TK_UCHARLIT, // unsigned char literal TK_ULONGLIT, // unsigned long literal TK_ULLONGLIT, // unsigned long long literal + TK_WCHARLIT, // wide-char literal TK_FLOAT, TK_FLOATLIT, // float literal TK_DOUBLE, diff --git a/src/cc/frontend/lexer.c b/src/cc/frontend/lexer.c index dcbafb67f..19d65e972 100644 --- a/src/cc/frontend/lexer.c +++ b/src/cc/frontend/lexer.c @@ -538,9 +538,7 @@ const char *read_ident(const char *p_) { return (const char*)p; } -static Token *read_char(const char **pp) { - const char *p = *pp; - const char *begin = p++; +static const char *do_read_char(const char *p, int *result) { int c = *(unsigned char*)p; if (c == '\'') lex_error(p, "Empty character"); @@ -551,11 +549,49 @@ static Token *read_char(const char **pp) { else c = backslash(c, &p); } - if (*(++p) != '\'') +#ifndef __NO_WCHAR + else { + int ucc = isutf8first(c); + if (ucc > 0) { + c &= ((1 << (8 - ucc)) - 1); + for (int i = 1; i < ucc; ++i) { + int c2 = *(unsigned char*)(++p); + if (!isutf8follow(c2)) { + lex_error(p, "Illegal byte sequence"); + } + c = (c << 6) | (c2 & 0x3f); + } + } + } +#endif + *result = c; + return p + 1; +} + +static Token *read_char(const char **pp) { + const char *p = *pp; + const char *begin = p++; +#ifndef __NO_WCHAR + bool is_wide = false; + if (*begin == 'L') { + is_wide = true; + assert(*p == '\''); + ++p; + } +#endif + + int c; + p = do_read_char(p, &c); + if (*p != '\'') lex_error(p, "Character not closed"); ++p; - Token *tok = alloc_token(TK_CHARLIT, lexer.line, begin, p); + enum TokenKind kind = TK_CHARLIT; +#ifndef __NO_WCHAR + if (is_wide) + kind = TK_WCHARLIT; +#endif + Token *tok = alloc_token(kind, lexer.line, begin, p); tok->fixnum = c; *pp = p; return tok; @@ -662,14 +698,15 @@ static Token *get_token(void) { } Token *tok = NULL; - const char *begin = p; - const char *ident_end = read_ident(p); - if (ident_end != NULL) { - const Name *name = alloc_name(begin, ident_end, false); - enum TokenKind kind = reserved_word(name); - tok = kind != TK_EOF ? alloc_token(kind, lexer.line, begin, ident_end) - : alloc_ident(name, lexer.line, begin, ident_end); - p = ident_end; +#ifndef __NO_WCHAR + if (*p == 'L' && p[1] == '\'') { + tok = read_char(&p); + } else +#endif + if (*p == '\'') { + tok = read_char(&p); + } else if (*p == '"') { + tok = read_string(&p); } else if (isdigit(*p)) { tok = read_num(&p); #ifndef __NO_FLONUM @@ -678,23 +715,29 @@ static Token *get_token(void) { #endif } else if ((tok = get_op_token(&p)) != NULL) { // Ok. - } else if (*p == '\'') { - tok = read_char(&p); - } else if (*p == '"') { - tok = read_string(&p); } else { - if (!for_preprocess) { - lex_error(p, "Unexpected character `%c'(%d)", *p, *p); - } + const char *begin = p; + const char *ident_end = read_ident(p); + if (ident_end != NULL) { + const Name *name = alloc_name(begin, ident_end, false); + enum TokenKind kind = reserved_word(name); + tok = kind != TK_EOF ? alloc_token(kind, lexer.line, begin, ident_end) + : alloc_ident(name, lexer.line, begin, ident_end); + p = ident_end; + } else { + if (!for_preprocess) { + lex_error(p, "Unexpected character `%c'(%d)", *p, *p); + } - assert(*p != '\0'); - const char *q = p + 1; - if (isutf8first(*p)) { - for (; isutf8follow(*q); ++q) - ; + assert(*p != '\0'); + const char *q = p + 1; + if (isutf8first(*p)) { + for (; isutf8follow(*q); ++q) + ; + } + tok = alloc_token(PPTK_OTHERCHAR, lexer.line, p, q); + p = q; } - tok = alloc_token(PPTK_OTHERCHAR, lexer.line, p, q); - p = q; } assert(tok != NULL); diff --git a/src/cc/frontend/parser_expr.c b/src/cc/frontend/parser_expr.c index dd0cc7b92..6325add0c 100644 --- a/src/cc/frontend/parser_expr.c +++ b/src/cc/frontend/parser_expr.c @@ -813,6 +813,9 @@ static Expr *parse_prim(void) { {TK_UINTLIT, FX_INT, true}, {TK_ULONGLIT, FX_LONG, true}, {TK_ULLONGLIT, FX_LLONG, true}, +#ifndef __NO_WCHAR + {TK_WCHARLIT, FX_INT, true}, // TODO: Must match with target's wchar_t +#endif }; for (int i = 0, n = sizeof(TABLE) / sizeof(*TABLE); i < n; ++i) { if ((tok = match(TABLE[i].tk)) != NULL) { diff --git a/src/cpp/cpp.c b/src/cpp/cpp.c index 856aea1c7..dbb7aeb31 100644 --- a/src/cpp/cpp.c +++ b/src/cpp/cpp.c @@ -28,6 +28,9 @@ int main(int argc, char *argv[]) { define_macro("__NO_VLA"); define_macro("__STDC_NO_VLA__"); #endif +#if defined(__NO_WCHAR) + define_macro("__NO_WCHAR"); +#endif enum { OPT_ISYSTEM = 128, diff --git a/src/cpp/pp_parser.c b/src/cpp/pp_parser.c index 34e2b4cfd..8778bea16 100644 --- a/src/cpp/pp_parser.c +++ b/src/cpp/pp_parser.c @@ -174,7 +174,8 @@ static PpResult pp_prim(void) { (tok = pp_match(TK_UCHARLIT)) != NULL || (tok = pp_match(TK_UINTLIT)) != NULL || (tok = pp_match(TK_ULONGLIT)) != NULL || - (tok = pp_match(TK_ULLONGLIT)) != NULL) { + (tok = pp_match(TK_ULLONGLIT)) != NULL || + (tok = pp_match(TK_WCHARLIT)) != NULL) { return tok->fixnum; } //if ((tok = pp_match(TK_STR)) != NULL) diff --git a/tests/valtest.c b/tests/valtest.c index 4ae423773..29cdb01e0 100644 --- a/tests/valtest.c +++ b/tests/valtest.c @@ -122,6 +122,9 @@ TEST(all) { EXPECT("escape sequence octal", 28, '\034'); EXPECT("escape sequence hex", 27, '\x1b'); EXPECT("escape char in str", 19, "\023"[0]); +#ifndef __NO_WCHAR + EXPECT("wide character", 0x1f600, L'😀'); +#endif EXPECT("+-", 21, (x=5, x+20-4)); EXPECT("*+", 47, (x=6, 5+x*7)); EXPECT("()", 15, (x=9, 5*(x-6)));