Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Text encode decode #1645

Draft
wants to merge 2 commits into
base: master
Choose a base branch
from
Draft
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
162 changes: 39 additions & 123 deletions runtime/js/mlBytes.js
Original file line number Diff line number Diff line change
Expand Up @@ -82,115 +82,6 @@ function caml_subarray_to_jsbytes(a, i, len) {
return s;
}

//Provides: caml_utf8_of_utf16
function caml_utf8_of_utf16(s) {
  // Encodes the UTF-16 JavaScript string `s` as UTF-8 and returns the result
  // as a "byte string": a JS string in which every char code is one UTF-8
  // byte (0x00-0xff).
  //
  // Unpaired surrogates (a high surrogate not followed by a low one, or a
  // low surrogate on its own) are replaced by U+FFFD, i.e. the bytes
  // EF BF BD.
  //
  // `b` accumulates finished output and `t` is a small working chunk that is
  // moved into `b` once it grows past 1024 chars.
  for (var b = "", t = b, c, d, i = 0, l = s.length; i < l; i++) {
    c = s.charCodeAt(i);
    if (c < 0x80) {
      // Fast path: copy a whole run of ASCII code units with one slice.
      // On exit `j` is the index of the first non-ASCII unit (whose code is
      // then already in `c`) or `l` when the run reaches the end of `s`.
      for (var j = i + 1; j < l && (c = s.charCodeAt(j)) < 0x80; j++);
      if (j - i > 512) {
        // Result deliberately discarded; presumably nudges the engine to
        // flatten the concatenated string before appending — TODO confirm.
        t.slice(0, 1);
        b += t;
        t = "";
        b += s.slice(i, j);
      } else t += s.slice(i, j);
      if (j === l) break;
      i = j;
    }
    if (c < 0x800) {
      // Two-byte sequence: U+0080..U+07FF.
      t += String.fromCharCode(0xc0 | (c >> 6));
      t += String.fromCharCode(0x80 | (c & 0x3f));
    } else if (c < 0xd800 || c > 0xdfff) {
      // Three-byte sequence: BMP code units outside the surrogate range
      // U+D800..U+DFFF (both bounds inclusive).
      t += String.fromCharCode(
        0xe0 | (c >> 12),
        0x80 | ((c >> 6) & 0x3f),
        0x80 | (c & 0x3f),
      );
    } else if (
      c > 0xdbff ||
      i + 1 === l ||
      (d = s.charCodeAt(i + 1)) < 0xdc00 ||
      d > 0xdfff
    ) {
      // Unmatched surrogate: a low surrogate (> U+DBFF) with no preceding
      // high one, or a high surrogate not followed by a low surrogate.
      // Replaced by \ufffd (replacement character).
      t += "\xef\xbf\xbd";
    } else {
      // Valid surrogate pair: consume the low surrogate and combine both
      // halves into the code point. The constant folds together the
      // (0xd800 << 10) and 0xdc00 biases and the 0x10000 offset.
      i++;
      c = (c << 10) + d - 0x35fdc00;
      t += String.fromCharCode(
        0xf0 | (c >> 18),
        0x80 | ((c >> 12) & 0x3f),
        0x80 | ((c >> 6) & 0x3f),
        0x80 | (c & 0x3f),
      );
    }
    if (t.length > 1024) {
      // Move the working chunk into the accumulator (same discarded-slice
      // step as in the ASCII fast path).
      t.slice(0, 1);
      b += t;
      t = "";
    }
  }
  return b + t;
}

//Provides: caml_utf16_of_utf8
function caml_utf16_of_utf8(s) {
  // Decodes the "byte string" `s` (a JS string whose char codes are UTF-8
  // bytes) into a regular UTF-16 JavaScript string.
  //
  // Invalid input (stray continuation bytes, truncated or overlong
  // sequences, encoded surrogates, values above U+10FFFF) yields one U+FFFD
  // for the offending lead byte; decoding then resumes at the next byte.
  //
  // `b` accumulates finished output and `t` is a small working chunk that is
  // moved into `b` once it grows past 1024 chars.
  for (var b = "", t = "", c, c1, c2, v, i = 0, l = s.length; i < l; i++) {
    c1 = s.charCodeAt(i);
    if (c1 < 0x80) {
      // Fast path: copy a whole run of ASCII bytes with one slice. On exit
      // `j` is the index of the first non-ASCII byte (already in `c1`) or
      // `l` when the run reaches the end of `s`.
      for (var j = i + 1; j < l && (c1 = s.charCodeAt(j)) < 0x80; j++);
      if (j - i > 512) {
        // Result deliberately discarded; presumably nudges the engine to
        // flatten the concatenated string before appending — TODO confirm.
        t.slice(0, 1);
        b += t;
        t = "";
        b += s.slice(i, j);
      } else t += s.slice(i, j);
      if (j === l) break;
      i = j;
    }
    // While v < 4 it is an error marker: the number of positions `i` has
    // advanced past the lead byte, used below to rewind. Once a sequence
    // decodes successfully v holds the code point (always >= 0x80).
    v = 1;
    if (++i < l && ((c2 = s.charCodeAt(i)) & -64) === 128) {
      // `(x & -64) === 128` tests for a continuation byte 10xxxxxx.
      c = c2 + (c1 << 6);
      if (c1 < 0xe0) {
        // Two-byte sequence; the constant removes the 0xc0 / 0x80 tag bits.
        // A result below 0x80 is overlong, or the lead was a stray
        // continuation byte.
        v = c - 0x3080;
        if (v < 0x80) v = 1;
      } else {
        v = 2;
        if (++i < l && ((c2 = s.charCodeAt(i)) & -64) === 128) {
          c = c2 + (c << 6);
          if (c1 < 0xf0) {
            // Three-byte sequence. Reject overlong forms and the surrogate
            // range U+D800..U+DFFF; U+D7FF itself is a valid scalar value.
            v = c - 0xe2080;
            if (v < 0x800 || (v >= 0xd800 && v < 0xe000)) v = 2;
          } else {
            v = 3;
            if (
              ++i < l &&
              ((c2 = s.charCodeAt(i)) & -64) === 128 &&
              c1 < 0xf5
            ) {
              // Four-byte sequence. Reject overlong forms and anything
              // above U+10FFFF.
              v = c2 - 0x3c82080 + (c << 6);
              if (v < 0x10000 || v > 0x10ffff) v = 3;
            }
          }
        }
      }
    }
    if (v < 4) {
      // Invalid sequence: rewind to the lead byte so the loop's i++ resumes
      // right after it, and emit a single replacement character.
      i -= v;
      t += "\ufffd";
    } else if (v > 0xffff)
      // Supplementary plane: emit a surrogate pair
      // (0xd7c0 === 0xd800 - (0x10000 >> 10)).
      t += String.fromCharCode(0xd7c0 + (v >> 10), 0xdc00 + (v & 0x3ff));
    else t += String.fromCharCode(v);
    if (t.length > 1024) {
      // Move the working chunk into the accumulator (same discarded-slice
      // step as in the ASCII fast path).
      t.slice(0, 1);
      b += t;
      t = "";
    }
  }
  return b + t;
}

//Provides: jsoo_is_ascii
function jsoo_is_ascii(s) {
// The regular expression gets better at around this point for all browsers
Expand Down Expand Up @@ -387,17 +278,28 @@ function caml_bytes_set(s, i, c) {
return caml_bytes_unsafe_set(s, i, c);
}

// Single TextEncoder instance shared by the string conversion functions in
// this file, which call its encode() method on JavaScript strings to obtain
// a Uint8Array of UTF-8 bytes.
//Provides: jsoo_text_encoder
var jsoo_text_encoder = new TextEncoder();

// Single TextDecoder instance, constructed with default arguments, shared by
// the functions in this file that call its decode() method on byte arrays.
// NOTE(review): per the Encoding Standard a default TextDecoder is UTF-8,
// non-fatal, and strips a leading byte order mark, whereas
// caml_utf16_of_utf8 keeps a leading U+FEFF — confirm callers do not depend
// on the BOM being preserved.
//Provides: jsoo_text_decoder
var jsoo_text_decoder = new TextDecoder();

//Provides: caml_bytes_of_utf16_jsstring
//Requires: jsoo_is_ascii, caml_utf8_of_utf16, MlBytes
//Requires: MlBytes, jsoo_text_encoder
//Requires: jsoo_is_ascii
function caml_bytes_of_utf16_jsstring(s) {
var tag = 9 /* BYTES | ASCII */;
if (!jsoo_is_ascii(s))
(tag = 8) /* BYTES | NOT_ASCII */, (s = caml_utf8_of_utf16(s));
return new MlBytes(tag, s, s.length);
if (jsoo_is_ascii(s)) {
return new MlBytes(9, s, s.length);
} else {
var a = jsoo_text_encoder.encode(s);
return new MlBytes(4, a, a.length);
}
}

//Provides: MlBytes
//Requires: caml_convert_string_to_bytes, jsoo_is_ascii, caml_utf16_of_utf8
//Requires: caml_convert_string_to_bytes, jsoo_is_ascii
//Requires: caml_uint8_array_of_bytes
//Requires: jsoo_text_decoder
function MlBytes(tag, contents, length) {
this.t = tag;
this.c = contents;
Expand All @@ -420,9 +322,9 @@ MlBytes.prototype.toString = function () {
}
};
MlBytes.prototype.toUtf16 = function () {
var r = this.toString();
if (this.t === 9) return r;
return caml_utf16_of_utf8(r);
if (this.t === 9) return this.c;
Copy link
Preview

Copilot AI Dec 28, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The method toUtf16 should handle ASCII and non-ASCII cases consistently. The previous implementation used caml_utf16_of_utf8 for non-ASCII strings, which may handle surrogate pairs and invalid sequences differently than TextDecoder.

Suggested change
if (this.t === 9) return this.c;
return caml_utf16_of_utf8(r);

Copilot is powered by AI, so mistakes are possible. Review output carefully before use.

Positive Feedback
Negative Feedback

Provide additional feedback

Please help us improve GitHub Copilot by sharing more details about this comment.

Please select one or more of the options
var a = caml_uint8_array_of_bytes(this);
return jsoo_text_decoder.decode(a);
};
MlBytes.prototype.slice = function () {
var content = this.t === 4 ? this.c.slice() : this.c;
Expand Down Expand Up @@ -728,20 +630,35 @@ function caml_jsbytes_of_string(x) {
return x;
}

// Preallocated 1024-byte ArrayBuffer. caml_jsstring_of_string wraps it in a
// Uint8Array view as scratch storage for short strings instead of allocating
// a fresh array.
// NOTE(review): ArrayBuffer exposes `byteLength`, not `length`; reading
// `.length` yields undefined, so a size check written as
// `s.length <= jsoo_text_decoder_buff.length` is always false and the buffer
// is never reused — confirm and compare against `byteLength` instead.
//Provides: jsoo_text_decoder_buff
var jsoo_text_decoder_buff = new ArrayBuffer(1024);

//Provides: caml_jsstring_of_string const
//Requires: jsoo_is_ascii, caml_utf16_of_utf8
//Requires: jsoo_is_ascii
//Requires: jsoo_text_decoder
//Requires: jsoo_text_decoder_buff
//If: js-string
function caml_jsstring_of_string(s) {
if (jsoo_is_ascii(s)) return s;
return caml_utf16_of_utf8(s);
var a =
s.length <= jsoo_text_decoder_buff.length
? new Uint8Array(jsoo_text_decoder_buff, 0, s.length)
: new Uint8Array(s.length);
for (var i = 0; i < s.length; i++) {
a[i] = s.charCodeAt(i);
}
return jsoo_text_decoder.decode(a);
}

//Provides: caml_string_of_jsstring const
//Requires: jsoo_is_ascii, caml_utf8_of_utf16, caml_string_of_jsbytes
//Requires: caml_string_of_array
//Requires: jsoo_text_encoder
//Requires: jsoo_is_ascii, caml_string_of_jsbytes
//If: js-string
function caml_string_of_jsstring(s) {
if (jsoo_is_ascii(s)) return caml_string_of_jsbytes(s);
else return caml_string_of_jsbytes(caml_utf8_of_utf16(s));
var a = jsoo_text_encoder.encode(s);
return caml_string_of_array(a);
}

//Provides: caml_bytes_of_jsbytes const
Expand Down Expand Up @@ -861,7 +778,6 @@ function caml_ml_bytes_content(s) {
}

//Provides: caml_is_ml_string
//Requires: jsoo_is_ascii
//If: js-string
function caml_is_ml_string(s) {
// biome-ignore lint/suspicious/noControlCharactersInRegex: expected
Expand Down
Loading