Skip to content

Commit

Permalink
CLDR-17897 Make ConvertLanguageData Consistent (#4015)
Browse files Browse the repository at this point in the history
If we re-run ConvertLanguageData on unrelated data, it also changes the order and values of some other data. This commit fixes those inconsistencies so that the generated XML output matches expectations. The biggest change was updating values in `language_script.tsv` to demote script variations to secondary when they are not actually expected. Furthermore, I added explicit annotations to `country_language_population.tsv` when the writing system for a country was a variant.

Commands run:

 mvn package -DskipTests=true
 java -jar tools/cldr-code/target/cldr-code.jar ConvertLanguageData
 java -jar tools/cldr-code/target/cldr-code.jar GenerateLikelySubtags
  • Loading branch information
conradarcturus authored Sep 18, 2024
1 parent 90b4fa1 commit 8ac1a2f
Show file tree
Hide file tree
Showing 5 changed files with 58 additions and 27 deletions.
11 changes: 11 additions & 0 deletions common/supplemental/likelySubtags.xml
Original file line number Diff line number Diff line change
Expand Up @@ -261,6 +261,8 @@ not be patched by hand, as any changes made in that fashion may be lost.
<likelySubtag from="ha_CM" to="ha_Arab_CM"/> <!--Hausa‧?‧Cameroon ➡ Hausa‧Arabic‧Cameroon-->
<likelySubtag from="ha_SD" to="ha_Arab_SD"/> <!--Hausa‧?‧Sudan ➡ Hausa‧Arabic‧Sudan-->
<likelySubtag from="hak" to="hak_Hans_CN"/> <!--Hakka Chinese‧?‧? ➡ Hakka Chinese‧Simplified‧China-->
<likelySubtag from="hak_TW" to="hak_Hant_TW"/> <!--Hakka Chinese‧?‧Taiwan ➡ Hakka Chinese‧Traditional‧Taiwan-->
<likelySubtag from="hak_Hant" to="hak_Hant_TW"/> <!--Hakka Chinese‧Traditional‧? ➡ Hakka Chinese‧Traditional‧Taiwan-->
<likelySubtag from="haw" to="haw_Latn_US"/> <!--Hawaiian‧?‧? ➡ Hawaiian‧Latin‧United States-->
<likelySubtag from="haz" to="haz_Arab_AF"/> <!--Hazaragi‧?‧? ➡ Hazaragi‧Arabic‧Afghanistan-->
<likelySubtag from="he" to="he_Hebr_IL"/> <!--Hebrew‧?‧? ➡ Hebrew‧Hebrew‧Israel-->
Expand Down Expand Up @@ -434,6 +436,8 @@ not be patched by hand, as any changes made in that fashion may be lost.
<likelySubtag from="lwl" to="lwl_Thai_TH"/> <!--Eastern Lawa‧?‧? ➡ Eastern Lawa‧Thai‧Thailand-->
<likelySubtag from="lzh" to="lzh_Hans_CN"/> <!--Literary Chinese‧?‧? ➡ Literary Chinese‧Simplified‧China-->
<likelySubtag from="lzz" to="lzz_Latn_TR"/> <!--Laz‧?‧? ➡ Laz‧Latin‧Türkiye-->
<likelySubtag from="lzz_GE" to="lzz_Geor_GE"/> <!--Laz‧?‧Georgia ➡ Laz‧Georgian‧Georgia-->
<likelySubtag from="lzz_Geor" to="lzz_Geor_GE"/> <!--Laz‧Georgian‧? ➡ Laz‧Georgian‧Georgia-->
<likelySubtag from="mad" to="mad_Latn_ID"/> <!--Madurese‧?‧? ➡ Madurese‧Latin‧Indonesia-->
<likelySubtag from="maf" to="maf_Latn_CM"/> <!--Mafa‧?‧? ➡ Mafa‧Latin‧Cameroon-->
<likelySubtag from="mag" to="mag_Deva_IN"/> <!--Magahi‧?‧? ➡ Magahi‧Devanagari‧India-->
Expand Down Expand Up @@ -498,6 +502,8 @@ not be patched by hand, as any changes made in that fashion may be lost.
<likelySubtag from="mzn" to="mzn_Arab_IR"/> <!--Mazanderani‧?‧? ➡ Mazanderani‧Arabic‧Iran-->
<likelySubtag from="na" to="na_Latn_NR"/> <!--Nauru‧?‧? ➡ Nauru‧Latin‧Nauru-->
<likelySubtag from="nan" to="nan_Hans_CN"/> <!--Min Nan Chinese‧?‧? ➡ Min Nan Chinese‧Simplified‧China-->
<likelySubtag from="nan_TW" to="nan_Hant_TW"/> <!--Min Nan Chinese‧?‧Taiwan ➡ Min Nan Chinese‧Traditional‧Taiwan-->
<likelySubtag from="nan_Hant" to="nan_Hant_TW"/> <!--Min Nan Chinese‧Traditional‧? ➡ Min Nan Chinese‧Traditional‧Taiwan-->
<likelySubtag from="nap" to="nap_Latn_IT"/> <!--Neapolitan‧?‧? ➡ Neapolitan‧Latin‧Italy-->
<likelySubtag from="naq" to="naq_Latn_NA"/> <!--Nama‧?‧? ➡ Nama‧Latin‧Namibia-->
<likelySubtag from="nb" to="nb_Latn_NO"/> <!--Norwegian Bokmål‧?‧? ➡ Norwegian Bokmål‧Latin‧Norway-->
Expand Down Expand Up @@ -567,6 +573,10 @@ not be patched by hand, as any changes made in that fashion may be lost.
<likelySubtag from="pl" to="pl_Latn_PL"/> <!--Polish‧?‧? ➡ Polish‧Latin‧Poland-->
<likelySubtag from="pms" to="pms_Latn_IT"/> <!--Piedmontese‧?‧? ➡ Piedmontese‧Latin‧Italy-->
<likelySubtag from="pnt" to="pnt_Grek_GR"/> <!--Pontic‧?‧? ➡ Pontic‧Greek‧Greece-->
<likelySubtag from="pnt_RU" to="pnt_Cyrl_RU"/> <!--Pontic‧?‧Russia ➡ Pontic‧Cyrillic‧Russia-->
<likelySubtag from="pnt_TR" to="pnt_Latn_TR"/> <!--Pontic‧?‧Türkiye ➡ Pontic‧Latin‧Türkiye-->
<likelySubtag from="pnt_Cyrl" to="pnt_Cyrl_RU"/> <!--Pontic‧Cyrillic‧? ➡ Pontic‧Cyrillic‧Russia-->
<likelySubtag from="pnt_Latn" to="pnt_Latn_TR"/> <!--Pontic‧Latin‧? ➡ Pontic‧Latin‧Türkiye-->
<likelySubtag from="pon" to="pon_Latn_FM"/> <!--Pohnpeian‧?‧? ➡ Pohnpeian‧Latin‧Micronesia-->
<likelySubtag from="pqm" to="pqm_Latn_CA"/> <!--Maliseet-Passamaquoddy‧?‧? ➡ Maliseet-Passamaquoddy‧Latin‧Canada-->
<likelySubtag from="pra" to="pra_Khar_PK"/> <!--Prakrit languages‧?‧? ➡ Prakrit languages‧Kharoshthi‧Pakistan-->
Expand Down Expand Up @@ -1036,6 +1046,7 @@ not be patched by hand, as any changes made in that fashion may be lost.
<likelySubtag from="und_Ahom" to="aho_Ahom_IN"/> <!--?‧Ahom‧? ➡ Ahom‧Ahom‧India-->
<likelySubtag from="und_Arab" to="ar_Arab_EG"/> <!--?‧Arabic‧? ➡ Arabic‧Arabic‧Egypt-->
<likelySubtag from="und_Arab_AF" to="fa_Arab_AF"/> <!--?‧Arabic‧Afghanistan ➡ Persian‧Arabic‧Afghanistan-->
<likelySubtag from="und_Arab_AZ" to="az_Arab_AZ"/> <!--?‧Arabic‧Azerbaijan ➡ Azerbaijani‧Arabic‧Azerbaijan-->
<likelySubtag from="und_Arab_BN" to="ms_Arab_BN"/> <!--?‧Arabic‧Brunei ➡ Malay‧Arabic‧Brunei-->
<likelySubtag from="und_Arab_CC" to="ms_Arab_CC"/> <!--?‧Arabic‧Cocos (Keeling) Islands ➡ Malay‧Arabic‧Cocos (Keeling) Islands-->
<likelySubtag from="und_Arab_CN" to="ug_Arab_CN"/> <!--?‧Arabic‧China ➡ Uyghur‧Arabic‧China-->
Expand Down
35 changes: 24 additions & 11 deletions common/supplemental/supplementalData.xml
Original file line number Diff line number Diff line change
Expand Up @@ -1647,7 +1647,7 @@ XXX Code for transations where no currency is involved
<language type="ha" scripts="Arab Latn"/>
<language type="ha" territories="NE NG" alt="secondary"/>
<language type="hai" scripts="Latn"/>
<language type="hak" scripts="Hans"/>
<language type="hak" scripts="Hans Hant" territories="TW"/>
<language type="hak" territories="CN" alt="secondary"/>
<language type="haw" scripts="Latn"/>
<language type="haw" territories="US" alt="secondary"/>
Expand Down Expand Up @@ -1890,7 +1890,8 @@ XXX Code for transations where no currency is involved
<language type="lv" scripts="Latn" territories="LV"/>
<language type="lwl" scripts="Thai"/>
<language type="lzh" scripts="Hans" alt="secondary"/>
<language type="lzz" scripts="Latn Geor"/>
<language type="lzz" scripts="Latn"/>
<language type="lzz" scripts="Geor" alt="secondary"/>
<language type="mad" scripts="Latn"/>
<language type="mad" territories="ID" alt="secondary"/>
<language type="maf" scripts="Latn"/>
Expand Down Expand Up @@ -1979,7 +1980,7 @@ XXX Code for transations where no currency is involved
<language type="mzn" scripts="Arab"/>
<language type="mzn" territories="IR" alt="secondary"/>
<language type="na" scripts="Latn" territories="NR"/>
<language type="nan" scripts="Hans"/>
<language type="nan" scripts="Hans Hant" territories="TW"/>
<language type="nan" territories="CN" alt="secondary"/>
<language type="nap" scripts="Latn"/>
<language type="naq" scripts="Latn"/>
Expand Down Expand Up @@ -2070,14 +2071,15 @@ XXX Code for transations where no currency is involved
<language type="peo" scripts="Xpeo" alt="secondary"/>
<language type="pfl" scripts="Latn"/>
<language type="phn" scripts="Phnx" alt="secondary"/>
<language type="pi" scripts="Deva Sinh Thai" alt="secondary"/>
<language type="pi" scripts="Deva Mymr Sinh Thai" alt="secondary"/>
<language type="pis" scripts="Latn"/>
<language type="pis" territories="SB" alt="secondary"/>
<language type="pko" scripts="Latn"/>
<language type="pl" scripts="Latn" territories="PL"/>
<language type="pl" territories="GB" alt="secondary"/>
<language type="pms" scripts="Latn"/>
<language type="pnt" scripts="Grek Cyrl Latn"/>
<language type="pnt" scripts="Grek"/>
<language type="pnt" scripts="Cyrl Latn" alt="secondary"/>
<language type="pon" scripts="Latn"/>
<language type="pon" territories="FM" alt="secondary"/>
<language type="pqm" scripts="Latn"/>
Expand Down Expand Up @@ -2277,11 +2279,12 @@ XXX Code for transations where no currency is involved
<language type="tk" scripts="Arab Cyrl Latn" territories="TM"/>
<language type="tk" territories="AF IR" alt="secondary"/>
<language type="tkl" scripts="Latn" territories="TK"/>
<language type="tkr" scripts="Latn Cyrl"/>
<language type="tkr" scripts="Latn"/>
<language type="tkr" scripts="Cyrl" alt="secondary"/>
<language type="tkt" scripts="Deva"/>
<language type="tli" scripts="Latn"/>
<language type="tly" scripts="Latn Arab Cyrl"/>
<language type="tly" territories="AZ" alt="secondary"/>
<language type="tly" scripts="Latn"/>
<language type="tly" scripts="Arab Cyrl" territories="AZ" alt="secondary"/>
<language type="tmh" scripts="Latn"/>
<language type="tmh" territories="NE" alt="secondary"/>
<language type="tn" scripts="Latn" territories="BW"/>
Expand Down Expand Up @@ -2309,8 +2312,8 @@ XXX Code for transations where no currency is involved
<language type="ttj" scripts="Latn"/>
<language type="tts" scripts="Thai"/>
<language type="tts" territories="TH" alt="secondary"/>
<language type="ttt" scripts="Latn Cyrl"/>
<language type="ttt" scripts="Arab" alt="secondary"/>
<language type="ttt" scripts="Latn"/>
<language type="ttt" scripts="Arab Cyrl" alt="secondary"/>
<language type="tum" scripts="Latn"/>
<language type="tum" territories="MW" alt="secondary"/>
<language type="tvl" scripts="Latn" territories="TV"/>
Expand Down Expand Up @@ -3072,6 +3075,7 @@ XXX Code for transations where no currency is involved
<languagePopulation type="ab" populationPercent="2.2" officialStatus="official_regional"/> <!--Abkhazian-->
<languagePopulation type="os" populationPercent="2.2" officialStatus="official_regional"/> <!--Ossetic-->
<languagePopulation type="ku" populationPercent="0.89"/> <!--Kurdish-->
<languagePopulation type="lzz_Geor" populationPercent="0.002" references="R1334"/> <!--Laz (Georgian)-->
</territory>
<territory type="GF" gdp="1551000000" literacyPercent="83" population="199509"> <!--French Guiana-->
<languagePopulation type="fr" populationPercent="77" officialStatus="official" references="R1019"/> <!--French-->
Expand Down Expand Up @@ -3999,6 +4003,7 @@ XXX Code for transations where no currency is involved
<languagePopulation type="krl" populationPercent="0.082"/> <!--Karelian-->
<languagePopulation type="lbe" populationPercent="0.078" officialStatus="official_regional"/> <!--Lak-->
<languagePopulation type="koi" populationPercent="0.045" officialStatus="official_regional"/> <!--Komi-Permyak-->
<languagePopulation type="pnt_Cyrl" populationPercent="0.04" references="R1335"/> <!--Pontic (Cyrillic)-->
<languagePopulation type="mrj" populationPercent="0.021"/> <!--Western Mari-->
<languagePopulation type="alt" populationPercent="0.014"/> <!--Southern Altai-->
<languagePopulation type="fi" populationPercent="0.012"/> <!--Finnish-->
Expand Down Expand Up @@ -4226,7 +4231,6 @@ XXX Code for transations where no currency is involved
<languagePopulation type="ku" populationPercent="5.5"/> <!--Kurdish-->
<languagePopulation type="apc" populationPercent="5.2" references="R1173"/> <!--Levantine Arabic-->
<languagePopulation type="zza" populationPercent="1.4"/> <!--Zaza-->
<languagePopulation type="kaa" populationPercent="0.1" references="R1199"/> <!--Kara-Kalpak-->
<languagePopulation type="kbd" populationPercent="0.77"/> <!--Kabardian-->
<languagePopulation type="az" populationPercent="0.74"/> <!--Azerbaijani-->
<languagePopulation type="az_Arab" populationPercent="0.65"/> <!--Azerbaijani (Arabic)-->
Expand All @@ -4235,11 +4239,13 @@ XXX Code for transations where no currency is involved
<languagePopulation type="bg" populationPercent="0.42"/> <!--Bulgarian-->
<languagePopulation type="ady" populationPercent="0.39"/> <!--Adyghe-->
<languagePopulation type="kiu" populationPercent="0.19"/> <!--Kirmanjki-->
<languagePopulation type="kaa" populationPercent="0.1" references="R1199"/> <!--Kara-Kalpak-->
<languagePopulation type="hy" populationPercent="0.056"/> <!--Armenian-->
<languagePopulation type="ka" populationPercent="0.056"/> <!--Georgian-->
<languagePopulation type="sr_Latn" writingPercent="5" populationPercent="0.028" references="R1017"/> <!--Serbian (Latin)-->
<languagePopulation type="lzz" populationPercent="0.028"/> <!--Laz-->
<languagePopulation type="sq" populationPercent="0.021"/> <!--Albanian-->
<languagePopulation type="pnt_Latn" populationPercent="0.0061" references="R1336"/> <!--Pontic (Latin)-->
<languagePopulation type="ab" populationPercent="0.0048" references="R1079"/> <!--Abkhazian-->
<languagePopulation type="el" populationPercent="0.0048"/> <!--Greek-->
<languagePopulation type="tru" populationPercent="0.0036"/> <!--Turoyo-->
Expand All @@ -4257,6 +4263,8 @@ XXX Code for transations where no currency is involved
</territory>
<territory type="TW" gdp="1143000000000" literacyPercent="96.1" population="23595300"> <!--Taiwan-->
<languagePopulation type="zh_Hant" populationPercent="95" officialStatus="official"/> <!--Chinese (Traditional)-->
<languagePopulation type="nan_Hant" populationPercent="57" officialStatus="official" references="R1219"/> <!--Min Nan Chinese (Traditional)-->
<languagePopulation type="hak_Hant" populationPercent="11" officialStatus="official" references="R1333"/> <!--Hakka Chinese (Traditional)-->
<languagePopulation type="trv" populationPercent="0.02"/> <!--Taroko-->
</territory>
<territory type="TZ" gdp="234100000000" literacyPercent="67.8" population="67462100"> <!--Tanzania-->
Expand Down Expand Up @@ -5692,6 +5700,7 @@ XXX Code for transations where no currency is involved
<reference type="R1216">This is base pop for &quot;fub&quot; lang code; ff shows as a macrolanguage</reference>
<reference type="R1217" uri="http://www.ethnologue.com/language/bkm">[missing]</reference>
<reference type="R1218" uri="http://en.wikipedia.org/wiki/Vietnamese_language">(could be higher if 2nd lang included; no data yet)</reference>
<reference type="R1219" uri="https://en.wikipedia.org/wiki/Taiwanese_Hokkien">[missing]</reference>
<reference type="R1220" uri="http://www.ethnologue.com/18/language/knf/">[missing]</reference>
<reference type="R1221" uri="https://www.cia.gov/library/publications/the-world-factbook/geos/cc.html">[missing]</reference>
<reference type="R1222" uri="http://www.ethnologue.com/show_language.asp?code=dsb">pop 7k. Figure is questionable writing pop artificially set to 5% see also http://en.wikipedia.org/wiki/Lower_Sorbian</reference>
Expand Down Expand Up @@ -5805,5 +5814,9 @@ XXX Code for transations where no currency is involved
<reference type="R1330" uri="https://en.wikipedia.org/wiki/Languages_of_the_United_Kingdom">Analyzed from 2011 UK census and other sources</reference>
<reference type="R1331" uri="https://en.wikipedia.org/wiki/Languages_of_Canada">In total 86.2% of Canadians have working knowledge of English while 29.8% have a working knowledge of French.</reference>
<reference type="R1332" uri="https://statisticsmaldives.gov.mv/statistical-release-iii-education">2014 Maldives: 98% literacy in Divehi, 75% in English</reference>
<reference type="R1333" uri="https://en.wikipedia.org/wiki/Taiwanese_Hakka">[missing]</reference>
<reference type="R1334" uri="https://en.wikipedia.org/wiki/Laz_people#cite_note-ethnologue-1">[missing]</reference>
<reference type="R1335" uri="https://en.wikipedia.org/wiki/Greeks_in_Russia_and_Ukraine#cite_ref-15">Greek population in Russia -- most ancestrally used Pontic Greek -- modern usage almost certainly has dropped off but we don't have clear statistics on current usage.</reference>
<reference type="R1336" uri="https://joshuaproject.net/people_groups/14444/TU">[missing]</reference>
</references>
</supplementalData>
Original file line number Diff line number Diff line change
Expand Up @@ -392,6 +392,10 @@ public static void main(String[] args) throws IOException {
{"mro", "mro_Mroo_BD"},
{"mro_BD", "mro_Mroo_BD"},
{"ms_Arab", "ms_Arab_MY"},
{"nan", "nan_Hans_CN"},
{"nan_Hant", "nan_Hant_TW"},
{"nan_Hans", "nan_Hans_CN"},
{"nan_TW", "nan_Hant_TW"},
{"pap", "pap_Latn_CW"},
{"pap_Latn", "pap_Latn_CW"},
{
Expand Down Expand Up @@ -469,14 +473,9 @@ public static void main(String[] args) throws IOException {
// {"cr", "cr_Cans_CA"},
// {"hif", "hif_Latn_FJ"},
// {"gon", "gon_Telu_IN"},
// {"lzz", "lzz_Latn_TR"},
// {"lif", "lif_Deva_NP"},
// {"unx", "unx_Beng_IN"},
// {"unr", "unr_Beng_IN"},
// {"ttt", "ttt_Latn_AZ"},
// {"pnt", "pnt_Grek_GR"},
// {"tly", "tly_Latn_AZ"},
// {"tkr", "tkr_Latn_AZ"},
// {"bsq", "bsq_Bass_LR"},
// {"ccp", "ccp_Cakm_BD"},
// {"blt", "blt_Tavt_VN"},
Expand Down Expand Up @@ -505,6 +504,7 @@ public static void main(String[] args) throws IOException {

// additions for missing values from LikelySubtagsText
{"und_Arab_AF", "fa_Arab_AF"},
{"und_Arab_AZ", "az_Arab_AZ"},
{"und_Cyrl_BG", "bg_Cyrl_BG"},
{"und_Tibt_BT", "dz_Tibt_BT"},
{"und_Cyrl_BY", "be_Cyrl_BY"},
Expand All @@ -518,6 +518,7 @@ public static void main(String[] args) throws IOException {
{"und_Cyrl_RS", "sr_Cyrl_RS"},
{"und_Cyrl_TJ", "tg_Cyrl_TJ"},
{"und_Cyrl_UA", "uk_Cyrl_UA"},
{"und_Hans_TW", "zh_Hans_TW"},
{"arc_Hatr", "arc_Hatr_IQ"},
{"hnj_Hmng", "hnj_Hmng_LA"},
{"bap_Krai", "bap_Krai_IN"},
Expand Down
Loading

0 comments on commit 8ac1a2f

Please sign in to comment.