forked from FGRibreau/node-unidecode
-
Notifications
You must be signed in to change notification settings - Fork 1
/
unidecode.js
121 lines (102 loc) · 3.96 KB
/
unidecode.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
/**
* Unidecode-plus takes full-range Unicode text and tries to represent it using only US-ASCII characters (i.e., the
* universally displayable characters between 0x00 and 0x7F). The representation is generally an attempt at
* transliteration -- i.e., conveying, in Roman letters, the pronunciation expressed by the text in some other writing
* system. Some of the transliterations go for matching the _shape_ of characters rather than their pronunciation, such
* as transliterating the Greek letter `ρ` (rho) as the ASCII `p`, even though it sounds more like an `r`. Various
* emojis are represented either as "ASCII art" or English text.
*
* The tables used (in data) are converted from the tables provided in the Perl library Text::Unidecode
* (http://search.cpan.org/dist/Text-Unidecode/lib/Text/Unidecode.pm) and are distributed under the Perl license.
*
* Whereas the original JavaScript and Perl versions of Unidecode only worked with the Unicode Basic Multilingual Plane
* (BMP, U+0 to U+FFFF), this version also handles transliteration of some characters beyond the BMP, like popular
* emojis.
*
* @author Kerry Shetline
*
* Based on Francois-Guillaume Ribreau's unidecode, which in turn was based on a port of unidecode for Perl.
*/
'use strict';
require('./polyfills');
// Cache of transliteration tables, keyed by table row (code point >> 8, or 0.5 for the German variant of row 0).
var tr = {};

// Global regex used to find every non-ASCII character that needs to be transliterated.
var codepoints;

try {
  // Can we use Unicode-aware regexes?
  // The pattern is built with the RegExp constructor on purpose: an unsupported flag on a regex *literal* is an
  // early (parse-time) SyntaxError, which no try/catch can intercept, so the whole file would fail to load.
  // The constructor throws at run time instead, which lets the fallback below take over.
  codepoints = new RegExp('[^\\x00-\\x7F]', 'gu');
}
catch (err) {
  // Nope! This mess will have to do.
  codepoints = /(?:[\x80-\uD7FF\uE000-\uFFFF]|[\uD800-\uDBFF][\uDC00-\uDFFF]|[\uD800-\uDBFF](?![\uDC00-\uDFFF])|(?:[^\uD800-\uDBFF]|^)[\uDC00-\uDFFF])/g;
}

// Option state for the call currently in progress. These are overwritten at the start of every unidecode() call
// and read by the replacement callback.
var german = false;
var deferredSmartSpacing = false;
var smartSpacing = false;
var skipRanges;
module.exports = function unidecode(str, options) {
german = options && options.german;
deferredSmartSpacing = options && options.deferredSmartSpacing;
smartSpacing = deferredSmartSpacing || (options && options.smartSpacing);
skipRanges = options && options.skipRanges;
if (german)
str = str.replace(/([AOU])\u0308/g, '$1E').replace(/([aou])\u0308/g, '$1e');
str = str.replace(codepoints, unidecode_internal_replace);
if (!smartSpacing || deferredSmartSpacing)
return str;
else
return resolveSpacing(str);
};
/**
 * Replacement callback for a single regex match: returns the transliteration of the character `ch`.
 *
 * Reads the module-level `skipRanges`, `german` and `smartSpacing` settings, and lazily loads (and caches in `tr`)
 * the 256-entry table for the character's row.
 *
 * @param {string} ch - The matched text; only its first code point is examined.
 * @returns {string} The replacement text, wrapped in \x80 / \x81 spacing markers when smart spacing applies.
 */
function unidecode_internal_replace(ch) {
  var codePoint = ch.codePointAt(0);

  // Characters inside a caller-supplied [low, high] range pass through unchanged.
  if (skipRanges) {
    for (var idx = 0; idx < skipRanges.length; ++idx) {
      var range = skipRanges[idx];

      if (range[0] <= codePoint && codePoint <= range[1]) {
        return ch;
      }
    }
  }

  var page = codePoint >> 8;
  var offset = codePoint & 0xFF;
  // This doesn't cover all emoji, just those currently defined.
  var isEmojiPage = (page === 0x1F4 || page === 0x1F6 || page === 0x1F9);

  // Pages 0x19-0x1D and 0xD8-0xF8 (the latter includes isolated high or low surrogates) produce no output.
  if ((page > 0x18 && page < 0x1E) || (page > 0xD7 && page < 0xF9)) {
    return '';
  }

  // Beyond the BMP only the emoji pages have tables.
  if (page > 0xFF && !isEmojiPage) {
    return '_';
  }

  // In German mode page 0 uses a separate table, stored under the key 0.5.
  var key = (page === 0 && german) ? 0.5 : page;

  if (!tr[key]) {
    try {
      var loaded = require('./data/x' + (key < 0x10 ? '0' : '') + key.toString(16));

      tr[key] = loaded;

      // I'm not sure why, but a fair number of the original data tables don't contain a full 256 items.
      if (loaded.length < 256) {
        var firstMissing = loaded.length;

        loaded.length = 256;
        loaded.fill('_', firstMissing);
      }
    }
    catch (err) {
      // No usable table for this page: every character in it maps to '_'.
      tr[key] = new Array(256).fill('_');
    }
  }

  var mapped = tr[key][offset];

  if (!smartSpacing) {
    return mapped;
  }

  // Em dash (U+2014) gets its own marked form so that resolveSpacing() can pad it.
  if (codePoint === 0x2014) {
    return '\x80--\x80';
  }

  // Placeholders and plain word-character replacements need no spacing markers.
  if (mapped === '[?]' || mapped === '_' || /^\w+$/.test(mapped)) {
    return mapped;
  }

  if (isEmojiPage) {
    return '\x80\x81' + mapped + '\x81\x80';
  }

  return '\x80' + mapped.trim() + '\x80';
}
/**
 * Turns the \x80 / \x81 spacing markers left by smart spacing into real spaces (or nothing), as appropriate.
 *
 * @param {string} str - Text that may contain \x80 and \x81 markers.
 * @returns {string} The text with all markers resolved.
 */
function resolveSpacing(str) {
  // Substitutions, applied strictly in this order.
  var steps = [
    // A marked "--" sitting directly between two word characters becomes " - ".
    [/(\w)(\x80--\x80)(\w)/g, function(whole, before, dashes, after) { return before + ' - ' + after; }],
    // Drop \x80 markers that are not followed by a word character.
    [/\x80(?!\w)/g, ""],
    // A doubled \x80, or a \x80 right after a word character, becomes a \x81 marker (keeping the word character).
    [/\x80\x80|(\w)\x80/g, "$1\x81"],
    // Any \x80 still left is dropped.
    [/\x80/g, ""],
    // \x81 markers at the very start or end of the text are dropped.
    [/^\x81+|\x81+$/g, ""],
    // Two \x81 markers around a space collapse into that one space.
    [/\x81 \x81/g, " "],
    // Each remaining run of \x81 (plus one optional preceding whitespace character) becomes a single space.
    [/\s?\x81+/g, " "]
  ];

  var result = str;

  for (var i = 0; i < steps.length; ++i) {
    result = result.replace(steps[i][0], steps[i][1]);
  }

  return result;
}
module.exports.resolveSpacing = resolveSpacing;