-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathutf8.js
56 lines (53 loc) · 2.3 KB
/
utf8.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
/* Utf8 class: encode / decode between multi-byte Unicode characters and UTF-8 multiple */
/* single-byte character encoding (c) Chris Veness 2002-2010 */
/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
var Utf8 = {}; // Utf8 namespace
/**
* Encode multi-byte Unicode string into utf-8 multiple single-byte characters
* (BMP / basic multilingual plane only)
*
* Chars in range U+0080 - U+07FF are encoded in 2 chars, U+0800 - U+FFFF in 3 chars
*
* @param {String} strUni Unicode string to be encoded as UTF-8
* @returns {String} encoded string
*/
Utf8.encode = function(strUni) {
// use regular expressions & String.replace callback function for better efficiency
// than procedural approaches
var strUtf = strUni.replace(
/[\u0080-\u07ff]/g, // U+0080 - U+07FF => 2 bytes 110yyyyy, 10zzzzzz
function(c) {
var cc = c.charCodeAt(0);
return String.fromCharCode(0xc0 | cc>>6, 0x80 | cc&0x3f); }
);
strUtf = strUtf.replace(
/[\u0800-\uffff]/g, // U+0800 - U+FFFF => 3 bytes 1110xxxx, 10yyyyyy, 10zzzzzz
function(c) {
var cc = c.charCodeAt(0);
return String.fromCharCode(0xe0 | cc>>12, 0x80 | cc>>6&0x3F, 0x80 | cc&0x3f); }
);
return strUtf;
}
/**
* Decode utf-8 encoded string back into multi-byte Unicode characters
*
* @param {String} strUtf UTF-8 string to be decoded back to Unicode
* @returns {String} decoded string
*/
Utf8.decode = function(strUtf) {
// note: decode 3-byte chars first as decoded 2-byte strings could appear to be 3-byte char!
var strUni = strUtf.replace(
/[\u00e0-\u00ef][\u0080-\u00bf][\u0080-\u00bf]/g, // 3-byte chars
function(c) { // (note parentheses for precence)
var cc = ((c.charCodeAt(0)&0x0f)<<12) | ((c.charCodeAt(1)&0x3f)<<6) | ( c.charCodeAt(2)&0x3f);
return String.fromCharCode(cc); }
);
strUni = strUni.replace(
/[\u00c0-\u00df][\u0080-\u00bf]/g, // 2-byte chars
function(c) { // (note parentheses for precence)
var cc = (c.charCodeAt(0)&0x1f)<<6 | c.charCodeAt(1)&0x3f;
return String.fromCharCode(cc); }
);
return strUni;
}