Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Berry add unicode encoding to string parsing #22713

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ All notable changes to this project will be documented in this file.
- Berry scroll to Leds_matrix (#22693)
- HASPmota support for `tabview` (#22707)
- Berry bit-shift operators to `int64` (#22709)
- Berry add unicode encoding to string parsing

### Breaking Changed

Expand Down
35 changes: 2 additions & 33 deletions lib/libesp32/berry/src/be_jsonlib.c
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
********************************************************************/
#include "be_object.h"
#include "be_mem.h"
#include "be_lexer.h"
#include <string.h>
#include <math.h>

Expand Down Expand Up @@ -116,38 +117,6 @@ static const char* parser_null(bvm *vm, const char *json)
return NULL;
}

/*
 * Decode the four hexadecimal digits that follow a JSON "\u" escape and
 * append the resulting code point to `dst` encoded as UTF-8.
 *
 * dst:  output cursor; between 1 and 3 bytes are written (no terminator).
 * json: points at the first of the four hex digits (upper or lower case).
 *
 * Returns the advanced output cursor (one past the last byte written), or
 * NULL when any of the four characters is not a hex digit. Scanning stops
 * at the first invalid character, so a string terminator is never skipped.
 * Each escape is encoded on its own; surrogate pairs are not combined.
 */
static char* load_unicode(char *dst, const char *json)
{
    int ucode = 0, i = 4;
    while (i--) {
        int ch = *json++;
        if (ch >= '0' && ch <= '9') {
            ucode = (ucode << 4) | (ch - '0');
        } else if (ch >= 'A' && ch <= 'F') {
            ucode = (ucode << 4) | (ch - 'A' + 0x0A);
        } else if (ch >= 'a' && ch <= 'f') {
            ucode = (ucode << 4) | (ch - 'a' + 0x0A);
        } else {
            return NULL;
        }
    }
    /* convert unicode to utf8; the range bounds are inclusive so that
     * U+007F and U+07FF get their shortest (only valid) encoding */
    if (ucode <= 0x007F) {
        /* unicode: 0000 - 007F -> utf8: 0xxxxxxx */
        *dst++ = (char)(ucode & 0x7F);
    } else if (ucode <= 0x07FF) {
        /* unicode: 0080 - 07FF -> utf8: 110xxxxx 10xxxxxx */
        *dst++ = (char)(((ucode >> 6) & 0x1F) | 0xC0);
        *dst++ = (char)((ucode & 0x3F) | 0x80);
    } else {
        /* unicode: 0800 - FFFF -> utf8: 1110xxxx 10xxxxxx 10xxxxxx */
        *dst++ = (char)(((ucode >> 12) & 0x0F) | 0xE0);
        *dst++ = (char)(((ucode >> 6) & 0x03F) | 0x80);
        *dst++ = (char)((ucode & 0x3F) | 0x80);
    }
    return dst;
}

static const char* parser_string(bvm *vm, const char *json)
{
if (*json == '"') {
Expand All @@ -169,7 +138,7 @@ static const char* parser_string(bvm *vm, const char *json)
case 'r': *dst++ = '\r'; break;
case 't': *dst++ = '\t'; break;
case 'u': { /* load unicode */
dst = load_unicode(dst, json);
dst = be_load_unicode(dst, json);
if (dst == NULL) {
be_free(vm, buf, len);
return NULL;
Expand Down
82 changes: 62 additions & 20 deletions lib/libesp32/berry/src/be_lexer.c
Original file line number Diff line number Diff line change
Expand Up @@ -203,6 +203,38 @@ static int read_oct(blexer *lexer, const char *src)
return c;
}

/*
 * Decode the four hexadecimal digits that follow a "\u" escape and append
 * the resulting code point to `dst` encoded as UTF-8.
 *
 * dst: output cursor; between 1 and 3 bytes are written (no terminator).
 * src: points at the first of the four hex digits (upper or lower case).
 *
 * Returns the advanced output cursor (one past the last byte written), or
 * NULL when any of the four characters is not a hex digit. Scanning stops
 * at the first invalid character, so a string terminator is never skipped.
 * Each escape is encoded on its own; surrogate pairs are not combined.
 */
char* be_load_unicode(char *dst, const char *src)
{
    int ucode = 0, i = 4;
    while (i--) {
        int ch = *src++;
        if (ch >= '0' && ch <= '9') {
            ucode = (ucode << 4) | (ch - '0');
        } else if (ch >= 'A' && ch <= 'F') {
            ucode = (ucode << 4) | (ch - 'A' + 0x0A);
        } else if (ch >= 'a' && ch <= 'f') {
            ucode = (ucode << 4) | (ch - 'a' + 0x0A);
        } else {
            return NULL;
        }
    }
    /* convert unicode to utf8; the range bounds are inclusive so that
     * U+007F and U+07FF get their shortest (only valid) encoding */
    if (ucode <= 0x007F) {
        /* unicode: 0000 - 007F -> utf8: 0xxxxxxx */
        *dst++ = (char)(ucode & 0x7F);
    } else if (ucode <= 0x07FF) {
        /* unicode: 0080 - 07FF -> utf8: 110xxxxx 10xxxxxx */
        *dst++ = (char)(((ucode >> 6) & 0x1F) | 0xC0);
        *dst++ = (char)((ucode & 0x3F) | 0x80);
    } else {
        /* unicode: 0800 - FFFF -> utf8: 1110xxxx 10xxxxxx 10xxxxxx */
        *dst++ = (char)(((ucode >> 12) & 0x0F) | 0xE0);
        *dst++ = (char)(((ucode >> 6) & 0x03F) | 0x80);
        *dst++ = (char)((ucode & 0x3F) | 0x80);
    }
    return dst;
}

static void tr_string(blexer *lexer)
{
char *dst, *src, *end;
Expand All @@ -215,32 +247,42 @@ static void tr_string(blexer *lexer)
be_lexerror(lexer, "unfinished string");
break;
case '\\':
switch (*src) {
case 'a': c = '\a'; break;
case 'b': c = '\b'; break;
case 'f': c = '\f'; break;
case 'n': c = '\n'; break;
case 'r': c = '\r'; break;
case 't': c = '\t'; break;
case 'v': c = '\v'; break;
case '\\': c = '\\'; break;
case '\'': c = '\''; break;
case '"': c = '"'; break;
case '?': c = '?'; break;
case 'x': c = read_hex(lexer, ++src); ++src; break;
default:
c = read_oct(lexer, src);
if (c != EOS) {
src += 2;
if (*src != 'u') {
switch (*src) {
case 'a': c = '\a'; break;
case 'b': c = '\b'; break;
case 'f': c = '\f'; break;
case 'n': c = '\n'; break;
case 'r': c = '\r'; break;
case 't': c = '\t'; break;
case 'v': c = '\v'; break;
case '\\': c = '\\'; break;
case '\'': c = '\''; break;
case '"': c = '"'; break;
case '?': c = '?'; break;
case 'x': c = read_hex(lexer, ++src); ++src; break;
default:
c = read_oct(lexer, src);
if (c != EOS) {
src += 2;
}
break;
}
++src;
*dst++ = (char)c;
} else {
/* unicode encoding, ex "\uF054" is equivalent to "\xEF\x81\x94"*/
dst = be_load_unicode(dst, src + 1);
src += 5;
if (dst == NULL) {
be_lexerror(lexer, "incorrect '\\u' encoding");
}
break;
}
++src;
break;
default:
*dst++ = (char)c;
break;
}
*dst++ = (char)c;
}
lexer->buf.len = dst - lexbuf(lexer);
}
Expand Down
1 change: 1 addition & 0 deletions lib/libesp32/berry/src/be_lexer.h
Original file line number Diff line number Diff line change
Expand Up @@ -136,5 +136,6 @@ int be_lexer_scan_next(blexer *lexer);
bstring* be_lexer_newstr(blexer *lexer, const char *str);
const char *be_token2str(bvm *vm, btoken *token);
const char* be_tokentype2str(btokentype type);
char* be_load_unicode(char *dst, const char *src);

#endif
21 changes: 21 additions & 0 deletions lib/libesp32/berry/tests/lexer.be
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,27 @@ check(45.1e2, 4510)
check(45.e2, 4500)
check(45.e+2, 4500)

# unicode encoding from JSON
assert(bytes().fromstring("a").tohex() == "61")
assert(bytes().fromstring("\uF054").tohex() == "EF8194")
assert(bytes().fromstring("\uF054\uF055").tohex() == "EF8194EF8195")
assert(bytes().fromstring("a\uF054b").tohex() == "61EF819462")
# 1 byte
assert(bytes().fromstring("\u0061").tohex() == "61")
# 2 bytes
assert(bytes().fromstring("\u0088").tohex() == "C288")
assert(bytes().fromstring("\u0288").tohex() == "CA88")
# 3 bytes
assert(bytes().fromstring("\u1288").tohex() == "E18A88")

assert(bytes().fromstring("\uFFFF").tohex() == "EFBFBF")

# bad unicode encoding
test_source('"\\u"', "incorrect '\\u' encoding")
test_source('"\\u1"', "incorrect '\\u' encoding")
test_source('"\\u22"', "incorrect '\\u' encoding")
test_source('"\\u333"', "incorrect '\\u' encoding")

# Ensure pathologically long numbers don't crash the lexer (or cause a buffer overflow)
assert(000000000000000000000000000000000000E0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 == 0.0);

Expand Down
Loading