Skip to content

Commit

Permalink
remove control characters in the default nmt_* normalizers
Browse files Browse the repository at this point in the history
  • Loading branch information
taku910 committed Jan 10, 2019
1 parent a938a66 commit 18c337f
Show file tree
Hide file tree
Showing 5 changed files with 46,729 additions and 46,618 deletions.
34 changes: 33 additions & 1 deletion data/nmt_nfkc.tsv
Original file line number Diff line number Diff line change
@@ -1,7 +1,34 @@
1 #  =>
2 #  =>
3 #  =>
4 #  =>
5 #  =>
6 #  =>
7 #  =>
8 # =>
9 20 # =>
A 20 # =>
C 20 # =>
B # =>
C 20 # =>
D 20 # =>
E #  =>
F #  =>
10 #  =>
11 #  =>
12 #  =>
13 #  =>
14 #  =>
15 #  =>
16 #  =>
17 #  =>
18 #  =>
19 #  =>
1A #  =>
1B #  =>
1C #  =>
1D #  =>
1E #  =>
1F #  =>
3C 338 226E # ≮ => ≮
3D 338 2260 # ≠ => ≠
3E 338 226F # ≯ => ≯
Expand Down Expand Up @@ -591,6 +618,9 @@ D 20 # =>
7A 323 1E93 # ẓ => ẓ
7A 331 1E95 # ẕ => ẕ
7A 341 17A # ź => ź
7F #  =>
8F #  =>
9F # Ÿ =>
A0 20 #   =>
A8 20 308 # ¨ => ̈
AA 61 # ª => a
Expand Down Expand Up @@ -57232,6 +57262,8 @@ FB9 F90 FB5 # ྐྵ => ྐྵ
2009 20 #   =>
200A 20 #   =>
200B 20 # ​ =>
200C 20 # ‌ =>
200D 20 # ‍ =>
200E 20 # ‎ =>
200F 20 # ‏ =>
2011 2010 # ‑ => ‐
Expand Down
34 changes: 33 additions & 1 deletion data/nmt_nfkc_cf.tsv
Original file line number Diff line number Diff line change
@@ -1,7 +1,34 @@
1 #  =>
2 #  =>
3 #  =>
4 #  =>
5 #  =>
6 #  =>
7 #  =>
8 # =>
9 20 # =>
A 20 # =>
C 20 # =>
B # =>
C 20 # =>
D 20 # =>
E #  =>
F #  =>
10 #  =>
11 #  =>
12 #  =>
13 #  =>
14 #  =>
15 #  =>
16 #  =>
17 #  =>
18 #  =>
19 #  =>
1A #  =>
1B #  =>
1C #  =>
1D #  =>
1E #  =>
1F #  =>
3C 338 226E # ≮ => ≮
3D 338 2260 # ≠ => ≠
3E 338 226F # ≯ => ≯
Expand Down Expand Up @@ -617,6 +644,9 @@ D 20 # =>
7A 323 1E93 # ẓ => ẓ
7A 331 1E95 # ẕ => ẕ
7A 341 17A # ź => ź
7F #  =>
8F #  =>
9F # Ÿ =>
A0 20 #   =>
A8 20 308 # ¨ => ̈
AA 61 # ª => a
Expand Down Expand Up @@ -57949,6 +57979,8 @@ FB9 F90 FB5 # ྐྵ => ྐྵ
2009 20 #   =>
200A 20 #   =>
200B 20 # ​ =>
200C 20 # ‌ =>
200D 20 # ‍ =>
200E 20 # ‎ =>
200F 20 # ‏ =>
2011 2010 # ‑ => ‐
Expand Down
48 changes: 43 additions & 5 deletions src/builder.cc
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,6 @@ Builder::Chars Normalize(const Builder::CharsMap &chars_map,
normalized.push_back(src[i]);
++i;
} else {
CHECK(!it->second.empty());
std::copy(it->second.begin(), it->second.end(),
std::back_inserter(normalized));
i += it->first.size();
Expand Down Expand Up @@ -357,10 +356,10 @@ util::Status Builder::BuildNmtNFKCMap(CharsMap *chars_map) {
RETURN_IF_ERROR(Builder::BuildNFKCMap(&nfkc_map));

// Other code points considered as whitespace.
nfkc_map[{0x9}] = {0x20}; // TAB
nfkc_map[{0xA}] = {0x20}; // LINE FEED
nfkc_map[{0xC}] = {0x20}; // FORM FEED
nfkc_map[{0xD}] = {0x20}; // CARRIAGE RETURN
nfkc_map[{0x0009}] = {0x20}; // TAB
nfkc_map[{0x000A}] = {0x20}; // LINE FEED
nfkc_map[{0x000C}] = {0x20}; // FORM FEED
nfkc_map[{0x000D}] = {0x20}; // CARRIAGE RETURN
nfkc_map[{0x1680}] = {0x20}; // OGHAM SPACE MARK
nfkc_map[{0x200B}] = {0x20}; // ZERO WIDTH SPACE
nfkc_map[{0x200E}] = {0x20}; // LEFT-TO-RIGHT MARK
Expand All @@ -370,6 +369,42 @@ util::Status Builder::BuildNmtNFKCMap(CharsMap *chars_map) {
nfkc_map[{0x2581}] = {0x20}; // LOWER ONE EIGHT BLOCK
nfkc_map[{0xFEFF}] = {0x20}; // ZERO WIDTH NO-BREAK
nfkc_map[{0xFFFD}] = {0x20}; // REPLACEMENT CHARACTER
nfkc_map[{0x200C}] = {0x20}; // ZERO WIDTH NON-JOINER
nfkc_map[{0x200D}] = {0x20}; // ZERO WIDTH JOINER

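// ASCII control characters. Unlike TAB/LF/FF/CR above, which are mapped to a
// space, these are mapped to an empty sequence, i.e. removed from the text.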
nfkc_map[{0x0001}] = {};
nfkc_map[{0x0002}] = {};
nfkc_map[{0x0003}] = {};
nfkc_map[{0x0004}] = {};
nfkc_map[{0x0005}] = {};
nfkc_map[{0x0006}] = {};
nfkc_map[{0x0007}] = {};
nfkc_map[{0x0008}] = {};
nfkc_map[{0x000B}] = {};
nfkc_map[{0x000E}] = {};
nfkc_map[{0x000F}] = {};
nfkc_map[{0x0010}] = {};
nfkc_map[{0x0011}] = {};
nfkc_map[{0x0012}] = {};
nfkc_map[{0x0013}] = {};
nfkc_map[{0x0014}] = {};
nfkc_map[{0x0015}] = {};
nfkc_map[{0x0016}] = {};
nfkc_map[{0x0017}] = {};
nfkc_map[{0x0018}] = {};
nfkc_map[{0x0019}] = {};
nfkc_map[{0x001A}] = {};
nfkc_map[{0x001B}] = {};
nfkc_map[{0x001C}] = {};
nfkc_map[{0x001D}] = {};
nfkc_map[{0x001E}] = {};
nfkc_map[{0x001F}] = {};

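// DEL (U+007F) and the C1 controls U+008F and U+009F, mapped to an empty
// sequence. NOTE(review): this was labelled "<control-007F>..<control-009F>",
// but only these three code points are listed here, not the whole range;
// confirm whether U+0080..U+009E should be removed as well.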
nfkc_map[{0x007F}] = {};
nfkc_map[{0x008F}] = {};
nfkc_map[{0x009F}] = {};

// Do not normalize FULL_WIDTH TILDE, since FULL_WIDTH TILDE
// and HALF_WIDTH TILDE are used differently in Japanese.
Expand Down Expand Up @@ -497,6 +532,9 @@ util::Status Builder::SaveCharsMap(absl::string_view filename,
string_util::Join(trg, " ") + "\t# " +
string_util::UnicodeTextToUTF8(c.first) + " => " +
string_util::UnicodeTextToUTF8(c.second);
line = string_util::StringReplace(line, "\b", " ", true);
line = string_util::StringReplace(line, "\v", " ", true);
line = string_util::StringReplace(line, "\f", " ", true);
line = string_util::StringReplace(line, "\n", " ", true);
line = string_util::StringReplace(line, "\r", " ", true);
output->WriteLine(line);
Expand Down
Loading

0 comments on commit 18c337f

Please sign in to comment.