Skip to content

Commit

Permalink
wchar literal
Browse files: browse the repository at this point in the history
  • Loading branch information
tyfkda committed Nov 12, 2023
1 parent d20d5c1 commit 9a5e1a0
Show file tree
Hide file tree
Showing 6 changed files with 82 additions and 28 deletions.
1 change: 1 addition & 0 deletions src/cc/frontend/ast.h
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@ enum TokenKind {
TK_UCHARLIT, // unsigned char literal
TK_ULONGLIT, // unsigned long literal
TK_ULLONGLIT, // unsigned long long literal
TK_WCHARLIT, // wide-char literal
TK_FLOAT,
TK_FLOATLIT, // float literal
TK_DOUBLE,
Expand Down
97 changes: 70 additions & 27 deletions src/cc/frontend/lexer.c
Original file line number Diff line number Diff line change
Expand Up @@ -538,9 +538,7 @@ const char *read_ident(const char *p_) {
return (const char*)p;
}

static Token *read_char(const char **pp) {
const char *p = *pp;
const char *begin = p++;
static const char *do_read_char(const char *p, int *result) {
int c = *(unsigned char*)p;
if (c == '\'')
lex_error(p, "Empty character");
Expand All @@ -551,11 +549,49 @@ static Token *read_char(const char **pp) {
else
c = backslash(c, &p);
}
if (*(++p) != '\'')
#ifndef __NO_WCHAR
else {
int ucc = isutf8first(c);
if (ucc > 0) {
c &= ((1 << (8 - ucc)) - 1);
for (int i = 1; i < ucc; ++i) {
int c2 = *(unsigned char*)(++p);
if (!isutf8follow(c2)) {
lex_error(p, "Illegal byte sequence");
}
c = (c << 6) | (c2 & 0x3f);
}
}
}
#endif
*result = c;
return p + 1;
}

static Token *read_char(const char **pp) {
const char *p = *pp;
const char *begin = p++;
#ifndef __NO_WCHAR
bool is_wide = false;
if (*begin == 'L') {
is_wide = true;
assert(*p == '\'');
++p;
}
#endif

int c;
p = do_read_char(p, &c);
if (*p != '\'')
lex_error(p, "Character not closed");

++p;
Token *tok = alloc_token(TK_CHARLIT, lexer.line, begin, p);
enum TokenKind kind = TK_CHARLIT;
#ifndef __NO_WCHAR
if (is_wide)
kind = TK_WCHARLIT;
#endif
Token *tok = alloc_token(kind, lexer.line, begin, p);
tok->fixnum = c;
*pp = p;
return tok;
Expand Down Expand Up @@ -662,14 +698,15 @@ static Token *get_token(void) {
}

Token *tok = NULL;
const char *begin = p;
const char *ident_end = read_ident(p);
if (ident_end != NULL) {
const Name *name = alloc_name(begin, ident_end, false);
enum TokenKind kind = reserved_word(name);
tok = kind != TK_EOF ? alloc_token(kind, lexer.line, begin, ident_end)
: alloc_ident(name, lexer.line, begin, ident_end);
p = ident_end;
#ifndef __NO_WCHAR
if (*p == 'L' && p[1] == '\'') {
tok = read_char(&p);
} else
#endif
if (*p == '\'') {
tok = read_char(&p);
} else if (*p == '"') {
tok = read_string(&p);
} else if (isdigit(*p)) {
tok = read_num(&p);
#ifndef __NO_FLONUM
Expand All @@ -678,23 +715,29 @@ static Token *get_token(void) {
#endif
} else if ((tok = get_op_token(&p)) != NULL) {
// Ok.
} else if (*p == '\'') {
tok = read_char(&p);
} else if (*p == '"') {
tok = read_string(&p);
} else {
if (!for_preprocess) {
lex_error(p, "Unexpected character `%c'(%d)", *p, *p);
}
const char *begin = p;
const char *ident_end = read_ident(p);
if (ident_end != NULL) {
const Name *name = alloc_name(begin, ident_end, false);
enum TokenKind kind = reserved_word(name);
tok = kind != TK_EOF ? alloc_token(kind, lexer.line, begin, ident_end)
: alloc_ident(name, lexer.line, begin, ident_end);
p = ident_end;
} else {
if (!for_preprocess) {
lex_error(p, "Unexpected character `%c'(%d)", *p, *p);
}

assert(*p != '\0');
const char *q = p + 1;
if (isutf8first(*p)) {
for (; isutf8follow(*q); ++q)
;
assert(*p != '\0');
const char *q = p + 1;
if (isutf8first(*p)) {
for (; isutf8follow(*q); ++q)
;
}
tok = alloc_token(PPTK_OTHERCHAR, lexer.line, p, q);
p = q;
}
tok = alloc_token(PPTK_OTHERCHAR, lexer.line, p, q);
p = q;
}

assert(tok != NULL);
Expand Down
3 changes: 3 additions & 0 deletions src/cc/frontend/parser_expr.c
Original file line number Diff line number Diff line change
Expand Up @@ -813,6 +813,9 @@ static Expr *parse_prim(void) {
{TK_UINTLIT, FX_INT, true},
{TK_ULONGLIT, FX_LONG, true},
{TK_ULLONGLIT, FX_LLONG, true},
#ifndef __NO_WCHAR
{TK_WCHARLIT, FX_INT, true}, // TODO: Must match with target's wchar_t
#endif
};
for (int i = 0, n = sizeof(TABLE) / sizeof(*TABLE); i < n; ++i) {
if ((tok = match(TABLE[i].tk)) != NULL) {
Expand Down
3 changes: 3 additions & 0 deletions src/cpp/cpp.c
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,9 @@ int main(int argc, char *argv[]) {
define_macro("__NO_VLA");
define_macro("__STDC_NO_VLA__");
#endif
#if defined(__NO_WCHAR)
define_macro("__NO_WCHAR");
#endif

enum {
OPT_ISYSTEM = 128,
Expand Down
3 changes: 2 additions & 1 deletion src/cpp/pp_parser.c
Original file line number Diff line number Diff line change
Expand Up @@ -174,7 +174,8 @@ static PpResult pp_prim(void) {
(tok = pp_match(TK_UCHARLIT)) != NULL ||
(tok = pp_match(TK_UINTLIT)) != NULL ||
(tok = pp_match(TK_ULONGLIT)) != NULL ||
(tok = pp_match(TK_ULLONGLIT)) != NULL) {
(tok = pp_match(TK_ULLONGLIT)) != NULL ||
(tok = pp_match(TK_WCHARLIT)) != NULL) {
return tok->fixnum;
}
//if ((tok = pp_match(TK_STR)) != NULL)
Expand Down
3 changes: 3 additions & 0 deletions tests/valtest.c
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,9 @@ TEST(all) {
EXPECT("escape sequence octal", 28, '\034');
EXPECT("escape sequence hex", 27, '\x1b');
EXPECT("escape char in str", 19, "\023"[0]);
#ifndef __NO_WCHAR
EXPECT("wide character", 0x1f600, L'😀');
#endif
EXPECT("+-", 21, (x=5, x+20-4));
EXPECT("*+", 47, (x=6, 5+x*7));
EXPECT("()", 15, (x=9, 5*(x-6)));
Expand Down

0 comments on commit 9a5e1a0

Please sign in to comment.