Skip to content

Commit

Permalink
More transliterator fixes; drop 46.0 support (#6005)
Browse files Browse the repository at this point in the history
  • Loading branch information
robertbastian authored Jan 15, 2025
1 parent 1bc501a commit f8ff826
Show file tree
Hide file tree
Showing 4 changed files with 50 additions and 61 deletions.
1 change: 0 additions & 1 deletion components/experimental/src/transliterate/compile/parse.rs
Original file line number Diff line number Diff line change
Expand Up @@ -888,7 +888,6 @@ where
fn parse_literal(&mut self) -> Result<String> {
let mut buf = String::new();
loop {
self.skip_whitespace();
let c = self.must_peek_char()?;
if c == Self::ESCAPE {
self.parse_escaped_char_into_buf(&mut buf)?;
Expand Down

Large diffs are not rendered by default.

22 changes: 12 additions & 10 deletions provider/data/experimental/fingerprints.csv
Original file line number Diff line number Diff line change
Expand Up @@ -52664,8 +52664,8 @@ relativetime/short/year@1, zh-HK, 141B, 49B, 9438b480e161ff9c
relativetime/short/year@1, zh-Hant, 141B, 49B, 73fde796e1da3679
relativetime/short/year@1, zh-MO, -> zh-HK
relativetime/short/year@1, zu, 207B, 115B, 6ce1954fc2a56d7c
transliterator/rules@1, <lookup>, 4730B, 408 identifiers
transliterator/rules@1, <total>, 2079274B, 1978031B, 405 unique payloads
transliterator/rules@1, <lookup>, 4752B, 410 identifiers
transliterator/rules@1, <total>, 2101676B, 1999938B, 407 unique payloads
transliterator/rules@1, und/am-brai-t-am-ethi, 8365B, 8117B, ac45af4b1d56faa6
transliterator/rules@1, und/am-ethi-t-am-brai, 7358B, 7109B, b4561ef2c2cd7557
transliterator/rules@1, und/am-ethi-t-am-ethi-m0-geminate, 252965B, 252717B, 10d9badde599e374
Expand Down Expand Up @@ -52727,9 +52727,11 @@ transliterator/rules@1, und/be-latn-t-be-m0-bgn, 1769B, 1519B, 8b49bd0d6c5d42cb
transliterator/rules@1, und/bg-latn-t-bg-m0-bgn, 6557B, 6308B, 34218327c67a1120
transliterator/rules@1, und/blt-fonipa-t-blt, 8194B, 7947B, a15f6f37950c29cb
transliterator/rules@1, und/byn-ethi-t-byn-latn-m0-tekieali, 7544B, 7295B, 9d31a95cd1822f30
transliterator/rules@1, und/byn-ethi-t-byn-latn-m0-xaleget, 9134B, 8887B, 8d6d4f1b8d633dc6
transliterator/rules@1, und/byn-latn-t-byn-ethi-m0-tekieali, 13220B, 12972B, be0beb759c7f8e5a
transliterator/rules@1, und/byn-latn-t-byn-ethi-m0-xaleget, 13076B, 12829B, de88e38ac29710a6
transliterator/rules@1, und/ch-fonipa-t-ch, 1328B, 1079B, 66fd9575d85dea7d
transliterator/rules@1, und/chr-fonipa-t-chr, 7856B, 7608B, 53e1b06226cf5f01
transliterator/rules@1, und/chr-fonipa-t-chr, 7886B, 7638B, 1ac7b8b87f20db51
transliterator/rules@1, und/chr-t-am, 334B, 83B, d4049ffa7cb5e963
transliterator/rules@1, und/chr-t-ch, 334B, 83B, e37b6814f6cf5628
transliterator/rules@1, und/chr-t-cs, 334B, 83B, 641eb18458d8181f
Expand Down Expand Up @@ -52828,8 +52830,8 @@ transliterator/rules@1, und/mn-latn-t-mn-m0-bgn, 1554B, 1304B, ea67f16eceebcdb5
transliterator/rules@1, und/mn-latn-t-mn-m0-mns, 5414B, 5165B, 7bf0118284ae0e3c
transliterator/rules@1, und/my-fonipa-t-my, 6095B, 5847B, f11ba302887ee208
transliterator/rules@1, und/my-latn-t-my, 4328B, 4079B, 7761d4ab4d4b1bee
transliterator/rules@1, und/my-t-my-d0-zawgyi, 7468B, 7221B, 1c4009966c93b1da
transliterator/rules@1, und/my-t-my-s0-zawgyi, 5120B, 4874B, 873e921d0118e7c8
transliterator/rules@1, und/my-t-my-d0-zawgyi, 7468B, 7220B, b0c13d26c05b4ed7
transliterator/rules@1, und/my-t-my-s0-zawgyi, 5120B, 4874B, f95dca63ea7ddf0f
transliterator/rules@1, und/nl-t-d0-title, 5987B, 5737B, 15104226dd1e5f4a
transliterator/rules@1, und/nv-fonipa-t-nv, 1488B, 1238B, f3739daa7487fd91
transliterator/rules@1, und/pl-fonipa-t-pl, 2111B, 1862B, 125b84678a2d0507
Expand All @@ -52852,14 +52854,14 @@ transliterator/rules@1, und/tlh-fonipa-t-tlh, 834B, 584B, b4ef3e66dddf2f4b
transliterator/rules@1, und/tr-t-d0-lower, -> und/az-t-d0-lower
transliterator/rules@1, und/tr-t-d0-title, -> und/az-t-d0-title
transliterator/rules@1, und/tr-t-d0-upper, -> und/az-t-d0-upper
transliterator/rules@1, und/ug-fonipa-t-ug, 1200B, 950B, b284a16197783826
transliterator/rules@1, und/ug-fonipa-t-ug, 1203B, 953B, b7361a264fad0148
transliterator/rules@1, und/uk-latn-t-uk-m0-bgn, 1849B, 1599B, 376d6ba05b05eee
transliterator/rules@1, und/und-arab-t-und-beng, 481B, 230B, 2ddea780cd71e0c3
transliterator/rules@1, und/und-arab-t-und-deva, 422B, 171B, 51d977bbe18d9db0
transliterator/rules@1, und/und-arab-t-und-gujr, 476B, 225B, b6b54630928f8f51
transliterator/rules@1, und/und-arab-t-und-guru, 488B, 237B, 82c8c301fd29d746
transliterator/rules@1, und/und-arab-t-und-knda, 469B, 218B, d7148f8209f4e9f0
transliterator/rules@1, und/und-arab-t-und-latn, 5073B, 4825B, 91ac73a76902bdf7
transliterator/rules@1, und/und-arab-t-und-latn, 5076B, 4828B, ad1445c8a47c59df
transliterator/rules@1, und/und-arab-t-und-mlym, 458B, 207B, bf2decf29aab6fc4
transliterator/rules@1, und/und-arab-t-und-orya, 473B, 222B, efa3a257716abce7
transliterator/rules@1, und/und-arab-t-und-taml, 479B, 228B, 43a42c7eb444530
Expand Down Expand Up @@ -52923,7 +52925,7 @@ transliterator/rules@1, und/und-guru-t-und-telu, 464B, 213B, fc9cd1e680bf615
transliterator/rules@1, und/und-hang-t-und-latn, 577B, 327B, dc2d40ceb0851723
transliterator/rules@1, und/und-hans-t-und-hant, 74506B, 74257B, 8717f9f7c3764678
transliterator/rules@1, und/und-hant-t-und-hans, 69259B, 69011B, 12a2d9f71d32b01a
transliterator/rules@1, und/und-hebr-t-und-latn, 9798B, 9550B, fe8a7f83c81415b8
transliterator/rules@1, und/und-hebr-t-und-latn, 9816B, 9568B, 390c7461fd88ecef
transliterator/rules@1, und/und-hira-t-und-kana, 4531B, 4283B, ce4a1354a3e02c16
transliterator/rules@1, und/und-hira-t-und-latn, 635B, 386B, c5d654fd79f33c3b
transliterator/rules@1, und/und-jamo-t-und-latn, 4681B, 4432B, 2d9dc2120ba41039
Expand All @@ -52939,7 +52941,7 @@ transliterator/rules@1, und/und-knda-t-und-orya, 474B, 223B, 1c79d93e9318f49b
transliterator/rules@1, und/und-knda-t-und-taml, 480B, 229B, d3437579ee915de6
transliterator/rules@1, und/und-knda-t-und-telu, 463B, 212B, 6b6f444877aac802
transliterator/rules@1, und/und-latn-t-s0-ascii, 320B, 69B, 4036609bb3475dca
transliterator/rules@1, und/und-latn-t-und-arab, 4340B, 4092B, 6c64d1ee6bf5f0bb
transliterator/rules@1, und/und-latn-t-und-arab, 4460B, 4212B, 892a7ad89de65171
transliterator/rules@1, und/und-latn-t-und-armn, 1456B, 1206B, 8486a77c11fc8e1c
transliterator/rules@1, und/und-latn-t-und-beng, 480B, 229B, 474ce06d11edae98
transliterator/rules@1, und/und-latn-t-und-bopo, 20003B, 19754B, fa4ad6d920087cc9
Expand All @@ -52966,7 +52968,7 @@ transliterator/rules@1, und/und-latn-t-und-guru, 487B, 236B, 75fa84043f417eb9
transliterator/rules@1, und/und-latn-t-und-hang, 459B, 208B, 39b5d1250b5d7be9
transliterator/rules@1, und/und-latn-t-und-hani, 304627B, 304380B, aee0e4bade6f4d12
transliterator/rules@1, und/und-latn-t-und-hani-m0-prprname, 1613B, 1364B, 8d6f53afda23b817
transliterator/rules@1, und/und-latn-t-und-hebr, 2219B, 1970B, f61b527118667d4
transliterator/rules@1, und/und-latn-t-und-hebr, 2237B, 1988B, 4d2ec7508baa9d48
transliterator/rules@1, und/und-latn-t-und-hira, 562B, 313B, a2d1e8b1ffbe5756
transliterator/rules@1, und/und-latn-t-und-jamo, 380B, 129B, 5d489fb23fdb826d
transliterator/rules@1, und/und-latn-t-und-kana, 5249B, 5004B, 99c49df19bd734f4
Expand Down
40 changes: 4 additions & 36 deletions provider/source/src/transforms/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -44,11 +44,6 @@ impl CldrCache {
continue;
}

if transform == "byn-Ethi-t-byn-latn-m0-xaleget" {
// Doesn't parse (backreference error)
continue;
}

if transform == "Thai-Latin" {
// References an unknown transliterator (Any-BreakInternal)
continue;
Expand All @@ -66,37 +61,10 @@ impl CldrCache {
"cldr-transforms/transforms/{}",
metadata.rules_file
))?
// Declares a sequence of Unicode sets instead of a Unicode set
.replace(
"$initialPunct = [:Ps:][:Pi:];",
"$initialPunct = [[:Ps:][:Pi:]];",
)
// I'm not sure why this errors
.replace("ə̃ {ə̃}+ → ə̃;", "")
// This does not escape the $, so the = is interpreted as a variable name
.replace(r#""$="#, r#""\$="#)
// Any-ASCII does not exist and should probably the Latin-ASCII
.replace("Any-ASCII", "Latin-ASCII")
// Non-canonical property names
.replace("block=", "Block=")
.replace("script=", "Script=")
.replace("case-ignorable:", "Case_Ignorable:")
.replace("cased:", "Cased:")
.replace("ideographic:", "Ideographic:")
// Non-canonical property values
.replace("ccc=above", "ccc=Above")
.replace("ccc=below", "ccc=Below")
.replace("UppercaseLetter:", "Uppercase_Letter:")
.replace("nonspacing mark:", "Nonspacing_Mark:")
.replace("letter:", "Letter:")
.replace("ARABIC:", "Arabic:")
.replace("arabic:", "Arabic:")
.replace("bengali:", "Bengali:")
.replace("greek:", "Greek:")
.replace("han:", "Han:")
.replace("latin:", "Latin:")
.replace("thaana:", "Thaana:")
.replace("thai:", "Thai:");
// This attempts to group the decomposed character, but erroneously uses a context (chr-chr_FONIPA)
.replace("ə̃ {ə̃}+ → ə̃;", "ə̃ ə̃+ → ə̃;")
// Back references don't work in reverse (byn-Ethi-t-byn-latn-m0-xaleget)
.replace("$1 ↔", "$1 ←");

if matches!(
metadata.direction,
Expand Down

0 comments on commit f8ff826

Please sign in to comment.