wchar literal

#134
tyfkda · Nov 12, 2023 · 9a5e1a0 · 9a5e1a0
1 parent d20d5c1
commit 9a5e1a0
Show file tree

Hide file tree

Showing 6 changed files with 82 additions and 28 deletions.
diff --git a/src/cc/frontend/ast.h b/src/cc/frontend/ast.h
@@ -68,6 +68,7 @@ enum TokenKind {
   TK_UCHARLIT,       // unsigned char literal
   TK_ULONGLIT,       // unsigned long literal
   TK_ULLONGLIT,      // unsigned long long literal
+  TK_WCHARLIT,       // wide-char literal
   TK_FLOAT,
   TK_FLOATLIT,       // float literal
   TK_DOUBLE,

diff --git a/src/cc/frontend/lexer.c b/src/cc/frontend/lexer.c
@@ -538,9 +538,7 @@ const char *read_ident(const char *p_) {
   return (const char*)p;
 }
 
-static Token *read_char(const char **pp) {
-  const char *p = *pp;
-  const char *begin = p++;
+static const char *do_read_char(const char *p, int *result) {
   int c = *(unsigned char*)p;
   if (c == '\'')
     lex_error(p, "Empty character");
@@ -551,11 +549,49 @@ static Token *read_char(const char **pp) {
     else
       c = backslash(c, &p);
   }
-  if (*(++p) != '\'')
+#ifndef __NO_WCHAR
+  else {
+    int ucc = isutf8first(c);
+    if (ucc > 0) {
+      c &= ((1 << (8 - ucc)) - 1);
+      for (int i = 1; i < ucc; ++i) {
+        int c2 = *(unsigned char*)(++p);
+        if (!isutf8follow(c2)) {
+          lex_error(p, "Illegal byte sequence");
+        }
+        c = (c << 6) | (c2 & 0x3f);
+      }
+    }
+  }
+#endif
+  *result = c;
+  return p + 1;
+}
+
+static Token *read_char(const char **pp) {
+  const char *p = *pp;
+  const char *begin = p++;
+#ifndef __NO_WCHAR
+  bool is_wide = false;
+  if (*begin == 'L') {
+    is_wide = true;
+    assert(*p == '\'');
+    ++p;
+  }
+#endif
+
+  int c;
+  p = do_read_char(p, &c);
+  if (*p != '\'')
     lex_error(p, "Character not closed");
 
   ++p;
-  Token *tok = alloc_token(TK_CHARLIT, lexer.line, begin, p);
+  enum TokenKind kind = TK_CHARLIT;
+#ifndef __NO_WCHAR
+  if (is_wide)
+    kind = TK_WCHARLIT;
+#endif
+  Token *tok = alloc_token(kind, lexer.line, begin, p);
   tok->fixnum = c;
   *pp = p;
   return tok;
@@ -662,14 +698,15 @@ static Token *get_token(void) {
   }
 
   Token *tok = NULL;
-  const char *begin = p;
-  const char *ident_end = read_ident(p);
-  if (ident_end != NULL) {
-    const Name *name = alloc_name(begin, ident_end, false);
-    enum TokenKind kind = reserved_word(name);
-    tok = kind != TK_EOF ? alloc_token(kind, lexer.line, begin, ident_end)
-                         : alloc_ident(name, lexer.line, begin, ident_end);
-    p = ident_end;
+#ifndef __NO_WCHAR
+  if (*p == 'L' && p[1] == '\'') {
+    tok = read_char(&p);
+  } else
+#endif
+  if (*p == '\'') {
+    tok = read_char(&p);
+  } else if (*p == '"') {
+    tok = read_string(&p);
   } else if (isdigit(*p)) {
     tok = read_num(&p);
 #ifndef __NO_FLONUM
@@ -678,23 +715,29 @@ static Token *get_token(void) {
 #endif
   } else if ((tok = get_op_token(&p)) != NULL) {
     // Ok.
-  } else if (*p == '\'') {
-    tok = read_char(&p);
-  } else if (*p == '"') {
-    tok = read_string(&p);
   } else {
-    if (!for_preprocess) {
-      lex_error(p, "Unexpected character `%c'(%d)", *p, *p);
-    }
+    const char *begin = p;
+    const char *ident_end = read_ident(p);
+    if (ident_end != NULL) {
+      const Name *name = alloc_name(begin, ident_end, false);
+      enum TokenKind kind = reserved_word(name);
+      tok = kind != TK_EOF ? alloc_token(kind, lexer.line, begin, ident_end)
+                          : alloc_ident(name, lexer.line, begin, ident_end);
+      p = ident_end;
+    } else {
+      if (!for_preprocess) {
+        lex_error(p, "Unexpected character `%c'(%d)", *p, *p);
+      }
 
-    assert(*p != '\0');
-    const char *q = p + 1;
-    if (isutf8first(*p)) {
-      for (; isutf8follow(*q); ++q)
-        ;
+      assert(*p != '\0');
+      const char *q = p + 1;
+      if (isutf8first(*p)) {
+        for (; isutf8follow(*q); ++q)
+          ;
+      }
+      tok = alloc_token(PPTK_OTHERCHAR, lexer.line, p, q);
+      p = q;
     }
-    tok = alloc_token(PPTK_OTHERCHAR, lexer.line, p, q);
-    p = q;
   }
 
   assert(tok != NULL);

diff --git a/src/cc/frontend/parser_expr.c b/src/cc/frontend/parser_expr.c
@@ -813,6 +813,9 @@ static Expr *parse_prim(void) {
       {TK_UINTLIT, FX_INT, true},
       {TK_ULONGLIT, FX_LONG, true},
       {TK_ULLONGLIT, FX_LLONG, true},
+#ifndef __NO_WCHAR
+      {TK_WCHARLIT, FX_INT, true},  // TODO: Must match the target's wchar_t type
+#endif
     };
     for (int i = 0, n = sizeof(TABLE) / sizeof(*TABLE); i < n; ++i) {
       if ((tok = match(TABLE[i].tk)) != NULL) {

diff --git a/src/cpp/cpp.c b/src/cpp/cpp.c
@@ -28,6 +28,9 @@ int main(int argc, char *argv[]) {
   define_macro("__NO_VLA");
   define_macro("__STDC_NO_VLA__");
 #endif
+#if defined(__NO_WCHAR)
+  define_macro("__NO_WCHAR");
+#endif
 
   enum {
     OPT_ISYSTEM = 128,

diff --git a/src/cpp/pp_parser.c b/src/cpp/pp_parser.c
@@ -174,7 +174,8 @@ static PpResult pp_prim(void) {
       (tok = pp_match(TK_UCHARLIT)) != NULL ||
       (tok = pp_match(TK_UINTLIT)) != NULL ||
       (tok = pp_match(TK_ULONGLIT)) != NULL ||
-      (tok = pp_match(TK_ULLONGLIT)) != NULL) {
+      (tok = pp_match(TK_ULLONGLIT)) != NULL ||
+      (tok = pp_match(TK_WCHARLIT)) != NULL) {
     return tok->fixnum;
   }
   //if ((tok = pp_match(TK_STR)) != NULL)

diff --git a/tests/valtest.c b/tests/valtest.c
@@ -122,6 +122,9 @@ TEST(all) {
   EXPECT("escape sequence octal", 28, '\034');
   EXPECT("escape sequence hex", 27, '\x1b');
   EXPECT("escape char in str", 19, "\023"[0]);
+#ifndef __NO_WCHAR
+  EXPECT("wide character", 0x1f600, L'😀');
+#endif
   EXPECT("+-", 21, (x=5, x+20-4));
   EXPECT("*+", 47, (x=6, 5+x*7));
   EXPECT("()", 15, (x=9, 5*(x-6)));