feat: add from_utf8

moonbitlang · Oct 23, 2024 · b239679 · b239679
1 parent 7040233
commit b239679
Show file tree

Hide file tree

Showing 3 changed files with 99 additions and 0 deletions.
diff --git a/unicode/unicode.mbti b/unicode/unicode.mbti
@@ -1,6 +1,8 @@
 package moonbitlang/x/unicode
 
 // Values
+fn from_utf8(Bytes) -> String!Failure
+
 fn to_utf8(String) -> Bytes!Failure
 
 // Types and methods

diff --git a/unicode/utf8.mbt b/unicode/utf8.mbt
@@ -65,3 +65,89 @@ pub fn to_utf8(str : String) -> Bytes!Failure {
   let utf8 = bytes_from_array(acc)
   utf8
 }
+
+/// Decode UTF-8 encoded bytes into a String assembled from UTF-16 LE code units; fails on truncated, malformed, overlong, surrogate or out-of-range input.
+pub fn from_utf8(utf8 : Bytes) -> String!Failure {
+  let mut pos = 0 // index of the next unread byte in `utf8`
+  let acc : Array[Int] = Array::new(capacity=utf8.length()) // UTF-16 code units; a sequence never yields more units than it has bytes
+  fn read_byte() -> Int!Failure { // returns the byte at `pos` as an Int and advances `pos`
+    if pos >= utf8.length() { // a multi-byte sequence was cut off by the end of the input
+      fail!("Unexpected end of input")
+    }
+    let b = utf8[pos].to_int()
+    pos += 1
+    b
+  }
+
+  // every element of the array is guaranteed to be <= 0xFFFF, i.e. to fit in two bytes
+  fn array_to_bytes(arr : Array[Int]) -> Bytes {
+    let bytes : Bytes = Bytes::new(arr.length() * 2) // two bytes per code unit
+    for i, v in arr {
+      bytes[2 * i] = v.to_byte() // low byte first (little-endian)
+      bytes[2 * i + 1] = (v >> 8).to_byte() // then the high byte
+    }
+    bytes
+  }
+
+  while pos < utf8.length() { // each iteration decodes exactly one code point
+    let b1 = read_byte!() // lead byte: its high bits select the sequence length
+    let mut codepoint : Int = 0 // value decoded from the current sequence
+    if (b1 & 0x80) == 0x00 {
+      // 1-byte sequence: 0xxxxxxx
+      codepoint = b1
+    } else if (b1 & 0xE0) == 0xC0 {
+      // 2-byte sequence: 110xxxxx 10xxxxxx
+      let b2 = read_byte!()
+      if (b2 & 0xC0) != 0x80 { // continuation bytes must look like 10xxxxxx
+        fail!("Invalid UTF-8 sequence")
+      }
+      codepoint = ((b1 & 0x1F) << 6) | (b2 & 0x3F)
+      if codepoint < 0x80 { // value would have fit in a 1-byte sequence
+        fail!("Overlong encoding")
+      }
+    } else if (b1 & 0xF0) == 0xE0 {
+      // 3-byte sequence: 1110xxxx 10xxxxxx 10xxxxxx
+      let b2 = read_byte!() // both continuation bytes are read before either is validated,
+      let b3 = read_byte!() // so truncated input reports "Unexpected end of input" first
+      if (b2 & 0xC0) != 0x80 || (b3 & 0xC0) != 0x80 {
+        fail!("Invalid UTF-8 sequence")
+      }
+      codepoint = ((b1 & 0x0F) << 12) | ((b2 & 0x3F) << 6) | (b3 & 0x3F)
+      if codepoint < 0x800 { // value would have fit in a 2-byte sequence
+        fail!("Overlong encoding")
+      }
+      if codepoint >= 0xD800 && codepoint <= 0xDFFF { // this range is used for the surrogate halves emitted below
+        fail!("Invalid codepoint (surrogate code point)")
+      }
+    } else if (b1 & 0xF8) == 0xF0 {
+      // 4-byte sequence: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+      let b2 = read_byte!()
+      let b3 = read_byte!()
+      let b4 = read_byte!()
+      if (b2 & 0xC0) != 0x80 || (b3 & 0xC0) != 0x80 || (b4 & 0xC0) != 0x80 {
+        fail!("Invalid UTF-8 sequence")
+      }
+      codepoint = ((b1 & 0x07) << 18) |
+        ((b2 & 0x3F) << 12) |
+        ((b3 & 0x3F) << 6) |
+        (b4 & 0x3F)
+      if codepoint < 0x10000 || codepoint > 0x10FFFF { // below 0x10000 is overlong; above 0x10FFFF is rejected as out of range
+        fail!("Invalid codepoint")
+      }
+    } else { // lead byte is 10xxxxxx (a stray continuation byte) or 11111xxx
+      fail!("Invalid UTF-8 sequence")
+    }
+
+    // Convert the code point to one or two UTF-16 code units
+    if codepoint <= 0xFFFF { // fits in a single code unit
+      acc.push(codepoint)
+    } else { // needs a surrogate pair
+      let cp_prime = codepoint - 0x10000 // at most 0xFFFFF here, i.e. a 20-bit offset
+      let high_surrogate = 0xD800 + (cp_prime >> 10) // upper 10 bits
+      let low_surrogate = 0xDC00 + (cp_prime & 0x3FF) // lower 10 bits
+      acc.push(high_surrogate)
+      acc.push(low_surrogate)
+    }
+  }
+  Bytes::to_string(array_to_bytes(acc)) // pack the code units as little-endian byte pairs and convert them to a String
+}
diff --git a/unicode/utf8_test.mbt b/unicode/utf8_test.mbt
@@ -12,3 +12,14 @@ test "to_utf8" {
     ,
   )
 }
+
+test "from utf8" {
+  inspect!(@unicode.from_utf8!(b"\xe2\x9c\x85"), content="✅") // a single 3-byte sequence
+  inspect!("👨‍👩‍👧‍👦", content="👨\u{200d}👩\u{200d}👧\u{200d}👦") // checks the string literal itself; from_utf8 is not called here
+  inspect!(
+    @unicode.from_utf8!(
+      b"\xf0\x9f\x91\xa8\xe2\x80\x8d\xf0\x9f\x91\xa9\xe2\x80\x8d\xf0\x9f\x91\xa7\xe2\x80\x8d\xf0\x9f\x91\xa6", // four 4-byte sequences separated by the 3-byte sequence e2 80 8d (U+200D)
+    ),
+    content="👨\u{200d}👩\u{200d}👧\u{200d}👦", // same expected content as the literal check above
+  )
+}