Skip to content

Commit

Permalink
feat: add from_utf8
Browse files Browse the repository at this point in the history
  • Loading branch information
qazxcdswe123 committed Oct 23, 2024
1 parent 7040233 commit b239679
Show file tree
Hide file tree
Showing 3 changed files with 99 additions and 0 deletions.
2 changes: 2 additions & 0 deletions unicode/unicode.mbti
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
package moonbitlang/x/unicode

// Values
fn from_utf8(Bytes) -> String!Failure

fn to_utf8(String) -> Bytes!Failure

// Types and methods
Expand Down
86 changes: 86 additions & 0 deletions unicode/utf8.mbt
Original file line number Diff line number Diff line change
Expand Up @@ -65,3 +65,89 @@ pub fn to_utf8(str : String) -> Bytes!Failure {
let utf8 = bytes_from_array(acc)
utf8
}

/// Decodes UTF-8 encoded `utf8` into a `String`.
///
/// The input is consumed one encoded character at a time. Each decoded code
/// point becomes one UTF-16 code unit, or a surrogate pair when it lies above
/// U+FFFF. The code units are then serialized low byte first and converted
/// with `Bytes::to_string`.
///
/// Fails with `Failure` when:
/// - the input ends in the middle of a multi-byte sequence,
/// - a lead byte or a continuation byte has an unexpected bit pattern,
/// - a 2-byte or 3-byte sequence is an overlong encoding,
/// - a 3-byte sequence encodes a surrogate code point (U+D800..U+DFFF),
/// - a 4-byte sequence decodes to a value outside U+10000..U+10FFFF.
pub fn from_utf8(utf8 : Bytes) -> String!Failure {
  let total = utf8.length()
  // A UTF-8 input never yields more UTF-16 code units than it has bytes.
  let units : Array[Int] = Array::new(capacity=total)
  let mut pos = 0

  // Returns the byte under the cursor and moves the cursor forward by one.
  fn next_byte() -> Int!Failure {
    if pos >= total {
      fail!("Unexpected end of input")
    }
    let value = utf8[pos].to_int()
    pos += 1
    value
  }

  // True when `b` does not match the continuation pattern 10xxxxxx.
  fn not_continuation(b : Int) -> Bool {
    (b & 0xC0) != 0x80
  }

  while pos < total {
    let lead = next_byte!()
    let codepoint : Int = if (lead & 0x80) == 0x00 {
      // 0xxxxxxx: plain ASCII, the byte is the code point.
      lead
    } else if (lead & 0xE0) == 0xC0 {
      // 110xxxxx 10xxxxxx
      let c1 = next_byte!()
      if not_continuation(c1) {
        fail!("Invalid UTF-8 sequence")
      }
      let cp = ((lead & 0x1F) << 6) | (c1 & 0x3F)
      if cp < 0x80 {
        fail!("Overlong encoding")
      }
      cp
    } else if (lead & 0xF0) == 0xE0 {
      // 1110xxxx 10xxxxxx 10xxxxxx
      // Both trailing bytes are read before either one is validated.
      let c1 = next_byte!()
      let c2 = next_byte!()
      if not_continuation(c1) || not_continuation(c2) {
        fail!("Invalid UTF-8 sequence")
      }
      let cp = ((lead & 0x0F) << 12) | ((c1 & 0x3F) << 6) | (c2 & 0x3F)
      if cp < 0x800 {
        fail!("Overlong encoding")
      }
      if cp >= 0xD800 && cp <= 0xDFFF {
        fail!("Invalid codepoint (surrogate code point)")
      }
      cp
    } else if (lead & 0xF8) == 0xF0 {
      // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
      let c1 = next_byte!()
      let c2 = next_byte!()
      let c3 = next_byte!()
      if not_continuation(c1) || not_continuation(c2) || not_continuation(c3) {
        fail!("Invalid UTF-8 sequence")
      }
      let cp = ((lead & 0x07) << 18) |
        ((c1 & 0x3F) << 12) |
        ((c2 & 0x3F) << 6) |
        (c3 & 0x3F)
      // The lower bound rejects overlong forms, the upper bound rejects
      // values beyond the Unicode range.
      if cp < 0x10000 || cp > 0x10FFFF {
        fail!("Invalid codepoint")
      }
      cp
    } else {
      // A stray continuation byte or a lead byte of the form 11111xxx.
      fail!("Invalid UTF-8 sequence")
    }

    // Append the code point as UTF-16 code units.
    if codepoint <= 0xFFFF {
      units.push(codepoint)
    } else {
      // Split the 20-bit offset into a high and a low surrogate.
      let offset = codepoint - 0x10000
      units.push(0xD800 + (offset >> 10))
      units.push(0xDC00 + (offset & 0x3FF))
    }
  }

  // Every element of `units` is at most 0xFFFF, so it fits in two bytes.
  let encoded : Bytes = Bytes::new(units.length() * 2)
  for i = 0; i < units.length(); i = i + 1 {
    let unit = units[i]
    encoded[2 * i] = unit.to_byte()
    encoded[2 * i + 1] = (unit >> 8).to_byte()
  }
  Bytes::to_string(encoded)
}
11 changes: 11 additions & 0 deletions unicode/utf8_test.mbt
Original file line number Diff line number Diff line change
Expand Up @@ -12,3 +12,14 @@ test "to_utf8" {
,
)
}

// Snapshot tests for `@unicode.from_utf8` using fixed UTF-8 byte literals.
test "from utf8" {
// A single 3-byte sequence (E2 9C 85).
inspect!(@unicode.from_utf8!(b"\xe2\x9c\x85"), content="✅")
// Does not call `from_utf8`: shows that this string literal contains
// zero-width joiners, which the expected content spells as \u{200d}.
inspect!("👨‍👩‍👧‍👦", content="👨\u{200d}👩\u{200d}👧\u{200d}👦")
// The same text given as raw bytes: four 4-byte sequences (F0 9F 91 xx)
// separated by the 3-byte sequence E2 80 8D.
inspect!(
@unicode.from_utf8!(
b"\xf0\x9f\x91\xa8\xe2\x80\x8d\xf0\x9f\x91\xa9\xe2\x80\x8d\xf0\x9f\x91\xa7\xe2\x80\x8d\xf0\x9f\x91\xa6",
),
content="👨\u{200d}👩\u{200d}👧\u{200d}👦",
)
}

0 comments on commit b239679

Please sign in to comment.