From d91742e73b108ba1a3423e27114961813416de1c Mon Sep 17 00:00:00 2001 From: Eduardo Sandalo Porto Date: Tue, 10 Sep 2024 12:07:48 -0300 Subject: [PATCH] Fix UTF-8 decoding on 4-byte characters (#712) --- src/fun/builtins.bend | 2 +- tests/golden_tests/io/utf8.bend | 9 +++++++++ tests/snapshots/io__utf8.bend.snap | 6 ++++++ 3 files changed, 16 insertions(+), 1 deletion(-) create mode 100644 tests/golden_tests/io/utf8.bend create mode 100644 tests/snapshots/io__utf8.bend.snap diff --git a/src/fun/builtins.bend b/src/fun/builtins.bend index cca33de5..2b11ae4e 100644 --- a/src/fun/builtins.bend +++ b/src/fun/builtins.bend @@ -498,7 +498,7 @@ Utf8/decode_character (List/Cons a (List/Cons b (List/Cons c (List/Cons d rest)) } else { if (== (& a 0xF8) 0xF0) { let r = (| (<< (& a Utf8/mask4) 18) (| (<< (& b Utf8/maskx) 12) (| (<< (& c Utf8/maskx) 6) (& d Utf8/maskx)))) - (r, []) + (r, rest) } else { (Utf8/REPLACEMENT_CHARACTER, rest) } diff --git a/tests/golden_tests/io/utf8.bend b/tests/golden_tests/io/utf8.bend new file mode 100644 index 00000000..b3e16168 --- /dev/null +++ b/tests/golden_tests/io/utf8.bend @@ -0,0 +1,9 @@ +to-and-back s = (String/decode_utf8 (String/encode_utf8 s)) + +v1 = (to-and-back "hi") +v2 = (to-and-back "(λf ((λx (f (x x))) (λx (f (x x)))))") +v3 = (to-and-back "🌟") +v4 = (to-and-back "Hello 🌎!") +v5 = (to-and-back "𓆈 𓆉 𓆊 𓆋 𓅯") + +main = [v1, v2, v3, v4, v5] diff --git a/tests/snapshots/io__utf8.bend.snap b/tests/snapshots/io__utf8.bend.snap new file mode 100644 index 00000000..f4ec8ef3 --- /dev/null +++ b/tests/snapshots/io__utf8.bend.snap @@ -0,0 +1,6 @@ +--- +source: tests/golden_tests.rs +input_file: tests/golden_tests/io/utf8.bend +--- +Strict mode: +["hi", "(λf ((λx (f (x x))) (λx (f (x x)))))", "🌟", "Hello 🌎!", "𓆈 𓆉 𓆊 𓆋 𓅯"]