Skip to content

Commit

Permalink
upgrade icu4j to 74.2 (#13239)
Browse files Browse the repository at this point in the history
* fix regeneration, upgrade icu jar, fully regenerate sources, tests pass
* Upgrade RBBI grammar to match 74.2 (add instructions on how to do this)
* Make use of Script_Extensions property in tokenization
* document and test nfkc_scf form
* update tokenizer for improved text in UAX#24 5.2
* use Indic syllabic category for Myanmar tokenizer instead of relying on Gc
  • Loading branch information
rmuir committed Apr 4, 2024
1 parent fad856d commit 56d4578
Show file tree
Hide file tree
Showing 30 changed files with 24,379 additions and 25,819 deletions.
2 changes: 1 addition & 1 deletion build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ configurations {

dependencies {
// Use a newer groovy that doesn't have illegal reflective accesses.
groovy "org.codehaus.groovy:groovy-all:3.0.12"
groovy "org.codehaus.groovy:groovy-all:3.0.21"
}

apply from: file('buildSrc/scriptDepVersions.gradle')
Expand Down
5 changes: 5 additions & 0 deletions lucene/CHANGES.txt
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,11 @@ Improvements
implementation is the ConcurrentMergeScheduler and the Lucene99HnswVectorsFormat will use it if no other
executor is provided. (Ben Trent)

* GITHUB#13239: Upgrade icu4j to version 74.2. (Robert Muir)

* GITHUB#13202: Early terminate graph and exact searches of AbstractKnnVectorQuery to follow timeout set from
IndexSearcher#setTimeout(QueryTimeout). (Kaival Parikh)

Optimizations
---------------------

Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
{
"lucene/analysis/common/src/java/org/apache/lucene/analysis/email/ASCIITLD.jflex": "346eb94735de07f845269e8f042ed0a25f09bc7f",
"lucene/analysis/common/src/test/org/apache/lucene/analysis/email/TLDs.txt": "334997e985b7bc21a4ae1089d9f7b8a3efde56ad",
"lucene/analysis/common/src/java/org/apache/lucene/analysis/email/ASCIITLD.jflex": "eddfb929c5664b2081a20314ac1a8363c462f27c",
"lucene/analysis/common/src/test/org/apache/lucene/analysis/email/TLDs.txt": "7c5785fda5692269e4a8022d2d08119ace54b54e",
"property:tldZones": "https://data.iana.org/TLD/tlds-alpha-by-domain.txt"
}
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"gradle/generation/jflex/skeleton.disable.buffer.expansion.txt": "6e43a3a64a9b5eb82ec5b4bc21f95aff5a2a061e",
"lucene/analysis/common/src/java/org/apache/lucene/analysis/email/ASCIITLD.jflex": "346eb94735de07f845269e8f042ed0a25f09bc7f",
"lucene/analysis/common/src/java/org/apache/lucene/analysis/email/UAX29URLEmailTokenizerImpl.java": "971a80479af08c54fd2fab9a75bb2321904cb3ef",
"lucene/analysis/common/src/java/org/apache/lucene/analysis/email/ASCIITLD.jflex": "eddfb929c5664b2081a20314ac1a8363c462f27c",
"lucene/analysis/common/src/java/org/apache/lucene/analysis/email/UAX29URLEmailTokenizerImpl.java": "ad17a3b0bc254749685ac34c3071bb0881f7b185",
"lucene/analysis/common/src/java/org/apache/lucene/analysis/email/UAX29URLEmailTokenizerImpl.jflex": "8d7cd1a935443deda6cad73b91f1a45c1c714535"
}
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
{
"lucene/analysis/common/src/java/org/apache/lucene/analysis/util/UnicodeProps.java": "0c95238312bae4a7234e1aa539c09b20a8a4dc7f",
"property:icuConfig": "com.ibm.icu:icu4j:70.1"
"lucene/analysis/common/src/java/org/apache/lucene/analysis/util/UnicodeProps.java": "723c3a253ba1470031da676c93461194e2f1f491",
"property:icuConfig": "com.ibm.icu:icu4j:74.2"
}
Original file line number Diff line number Diff line change
Expand Up @@ -15,14 +15,13 @@
* limitations under the License.
*/
// Generated from IANA TLD Database <https://data.iana.org/TLD/tlds-alpha-by-domain.txt>
// file version from 2022 Sep 25, Sun 07:07:02 Coordinated Universal Time
// file version from 2024 Mar 28, Thu 07:07:01 Coordinated Universal Time
// generated by org.apache.lucene.analysis.standard.GenerateJflexTLDMacros

// LUCENE-8278: None of the TLDs in {ASCIITLD} is a 1-character-shorter prefix of another TLD
ASCIITLD = "." (
[aA][aA][aA]
| [aA][aA][rR][pP]
| [aA][bB][aA][rR][tT][hH]
| [aA][bB][bB]
| [aA][bB][bB][oO][tT][tT]
| [aA][bB][bB][vV][iI][eE]
Expand All @@ -35,7 +34,6 @@ ASCIITLD = "." (
| [aA][cC][cC][oO][uU][nN][tT][aA][nN][tT][sS]
| [aA][cC][oO]
| [aA][cC][tT][oO][rR]
| [aA][dD][aA][cC]
| [aA][dD][sS]
| [aA][dD][uU][lL][tT]
| [aA][eE][gG]
Expand All @@ -52,7 +50,6 @@ ASCIITLD = "." (
| [aA][iI][rR][tT][eE][lL]
| [aA][kK][dD][nN]
| [aA][lL]
| [aA][lL][fF][aA][rR][oO][mM][eE][oO]
| [aA][lL][iI][bB][aA][bB][aA]
| [aA][lL][iI][pP][aA][yY]
| [aA][lL][lL][fF][iI][nN][aA][nN][zZ]
Expand Down Expand Up @@ -98,15 +95,13 @@ ASCIITLD = "." (
| [aA][uU][sS][pP][oO][sS][tT]
| [aA][uU][tT][hH][oO][rR]
| [aA][uU][tT][oO][sS]
| [aA][vV][iI][aA][nN][cC][aA]
| [aA][wW][sS]
| [aA][xX][aA]
| [aA][zZ]
| [aA][zZ][uU][rR][eE]
| [bB][aA][bB][yY]
| [bB][aA][iI][dD][uU]
| [bB][aA][nN][aA][mM][eE][xX]
| [bB][aA][nN][aA][nN][aA][rR][eE][pP][uU][bB][lL][iI][cC]
| [bB][aA][nN][dD]
| [bB][aA][nN][kK]
| [bB][aA][rR]
Expand Down Expand Up @@ -176,7 +171,6 @@ ASCIITLD = "." (
| [bB][rR][uU][sS][sS][eE][lL][sS]
| [bB][sS]
| [bB][tT]
| [bB][uU][gG][aA][tT][tT][iI]
| [bB][uU][iI][lL][dD]
| [bB][uU][iI][lL][dD][eE][rR][sS]
| [bB][uU][sS][iI][nN][eE][sS][sS]
Expand All @@ -192,7 +186,6 @@ ASCIITLD = "." (
| [cC][aA][lL][vV][iI][nN][kK][lL][eE][iI][nN]
| [cC][aA][mM][eE][rR][aA]
| [cC][aA][mM][pP]
| [cC][aA][nN][cC][eE][rR][rR][eE][sS][eE][aA][rR][cC][hH]
| [cC][aA][nN][oO][nN]
| [cC][aA][pP][eE][tT][oO][wW][nN]
| [cC][aA][pP][iI][tT][aA][lL]
Expand All @@ -212,7 +205,6 @@ ASCIITLD = "." (
| [cC][bB][aA]
| [cC][bB][nN]
| [cC][bB][rR][eE]
| [cC][bB][sS]
| [cC][cC]
| [cC][dD]
| [cC][eE][nN][tT][eE][rR]
Expand All @@ -239,7 +231,6 @@ ASCIITLD = "." (
| [cC][iI][tT][aA][dD][eE][lL]
| [cC][iI][tT][iI][cC]
| [cC][iI][tT][yY]
| [cC][iI][tT][yY][eE][aA][tT][sS]
| [cC][kK]
| [cC][lL]
| [cC][lL][aA][iI][mM][sS]
Expand All @@ -259,7 +250,6 @@ ASCIITLD = "." (
| [cC][oO][lL][lL][eE][gG][eE]
| [cC][oO][lL][oO][gG][nN][eE]
| [cC][oO][mM]
| [cC][oO][mM][cC][aA][sS][tT]
| [cC][oO][mM][mM][bB][aA][nN][kK]
| [cC][oO][mM][mM][uU][nN][iI][tT][yY]
| [cC][oO][mM][pP][aA][nN][yY]
Expand All @@ -272,7 +262,6 @@ ASCIITLD = "." (
| [cC][oO][nN][tT][aA][cC][tT]
| [cC][oO][nN][tT][rR][aA][cC][tT][oO][rR][sS]
| [cC][oO][oO][kK][iI][nN][gG]
| [cC][oO][oO][kK][iI][nN][gG][cC][hH][aA][nN][nN][eE][lL]
| [cC][oO][oO][lL]
| [cC][oO][oO][pP]
| [cC][oO][rR][sS][iI][cC][aA]
Expand Down Expand Up @@ -370,7 +359,6 @@ ASCIITLD = "." (
| [eE][sS][qQ]
| [eE][sS][tT][aA][tT][eE]
| [eE][tT]
| [eE][tT][iI][sS][aA][lL][aA][tT]
| [eE][uU][rR][oO][vV][iI][sS][iI][oO][nN]
| [eE][uU][sS]
| [eE][vV][eE][nN][tT][sS]
Expand All @@ -393,7 +381,6 @@ ASCIITLD = "." (
| [fF][eE][eE][dD][bB][aA][cC][kK]
| [fF][eE][rR][rR][aA][rR][iI]
| [fF][eE][rR][rR][eE][rR][oO]
| [fF][iI][aA][tT]
| [fF][iI][dD][eE][lL][iI][tT][yY]
| [fF][iI][dD][oO]
| [fF][iI][lL][mM]
Expand All @@ -417,7 +404,6 @@ ASCIITLD = "." (
| [fF][lL][yY]
| [fF][mM]
| [fF][oO][oO][dD]
| [fF][oO][oO][dD][nN][eE][tT][wW][oO][rR][kK]
| [fF][oO][oO][tT][bB][aA][lL][lL]
| [fF][oO][rR][dD]
| [fF][oO][rR][eE][xX]
Expand All @@ -429,7 +415,6 @@ ASCIITLD = "." (
| [fF][rR][eE][sS][eE][nN][iI][uU][sS]
| [fF][rR][lL]
| [fF][rR][oO][gG][aA][nN][sS]
| [fF][rR][oO][nN][tT][dD][oO][oO][rR]
| [fF][rR][oO][nN][tT][iI][eE][rR]
| [fF][tT][rR]
| [fF][uU][jJ][iI][tT][sS][uU]
Expand Down Expand Up @@ -492,7 +477,6 @@ ASCIITLD = "." (
| [gG][sS]
| [gG][tT]
| [gG][uU]
| [gG][uU][aA][rR][dD][iI][aA][nN]
| [gG][uU][cC][cC][iI]
| [gG][uU][gG][eE]
| [gG][uU][iI][dD][eE]
Expand All @@ -513,7 +497,6 @@ ASCIITLD = "." (
| [hH][eE][lL][sS][iI][nN][kK][iI]
| [hH][eE][rR][eE]
| [hH][eE][rR][mM][eE][sS]
| [hH][gG][tT][vV]
| [hH][iI][pP][hH][oO][pP]
| [hH][iI][sS][aA][mM][iI][tT][sS][uU]
| [hH][iI][tT][aA][cC][hH][iI]
Expand All @@ -534,7 +517,6 @@ ASCIITLD = "." (
| [hH][oO][sS][tT]
| [hH][oO][sS][tT][iI][nN][gG]
| [hH][oO][tT]
| [hH][oO][tT][eE][lL][eE][sS]
| [hH][oO][tT][eE][lL][sS]
| [hH][oO][tT][mM][aA][iI][lL]
| [hH][oO][uU][sS][eE]
Expand Down Expand Up @@ -616,7 +598,6 @@ ASCIITLD = "." (
| [kK][iI][aA]
| [kK][iI][dD][sS]
| [kK][iI][mM]
| [kK][iI][nN][dD][eE][rR]
| [kK][iI][nN][dD][lL][eE]
| [kK][iI][tT][cC][hH][eE][nN]
| [kK][iI][wW][iI]
Expand All @@ -638,7 +619,6 @@ ASCIITLD = "." (
| [lL][aA][mM][bB][oO][rR][gG][hH][iI][nN][iI]
| [lL][aA][mM][eE][rR]
| [lL][aA][nN][cC][aA][sS][tT][eE][rR]
| [lL][aA][nN][cC][iI][aA]
| [lL][aA][nN][dD]
| [lL][aA][nN][dD][rR][oO][vV][eE][rR]
| [lL][aA][nN][xX][eE][sS][sS]
Expand Down Expand Up @@ -669,7 +649,6 @@ ASCIITLD = "." (
| [lL][iI][mM][iI][tT][eE][dD]
| [lL][iI][mM][oO]
| [lL][iI][nN][cC][oO][lL][nN]
| [lL][iI][nN][dD][eE]
| [lL][iI][nN][kK]
| [lL][iI][pP][sS][yY]
| [lL][iI][vV][eE]
Expand All @@ -680,7 +659,6 @@ ASCIITLD = "." (
| [lL][oO][aA][nN][sS]
| [lL][oO][cC][kK][eE][rR]
| [lL][oO][cC][uU][sS]
| [lL][oO][fF][tT]
| [lL][oO][lL]
| [lL][oO][nN][dD][oO][nN]
| [lL][oO][tT][tT][eE]
Expand All @@ -697,7 +675,6 @@ ASCIITLD = "." (
| [lL][uU][xX][uU][rR][yY]
| [lL][vV]
| [lL][yY]
| [mM][aA][cC][yY][sS]
| [mM][aA][dD][rR][iI][dD]
| [mM][aA][iI][fF]
| [mM][aA][iI][sS][oO][nN]
Expand All @@ -710,7 +687,6 @@ ASCIITLD = "." (
| [mM][aA][rR][kK][eE][tT][sS]
| [mM][aA][rR][rR][iI][oO][tT][tT]
| [mM][aA][rR][sS][hH][aA][lL][lL][sS]
| [mM][aA][sS][eE][rR][aA][tT][iI]
| [mM][aA][tT][tT][eE][lL]
| [mM][bB][aA]
| [mM][cC]
Expand Down Expand Up @@ -763,7 +739,6 @@ ASCIITLD = "." (
| [mM][uU]
| [mM][uU][sS][eE][uU][mM]
| [mM][uU][sS][iI][cC]
| [mM][uU][tT][uU][aA][lL]
| [mM][vV]
| [mM][wW]
| [mM][xX]
Expand Down Expand Up @@ -798,7 +773,6 @@ ASCIITLD = "." (
| [nN][iI][sS][sS][aA][yY]
| [nN][lL]
| [nN][oO][kK][iI][aA]
| [nN][oO][rR][tT][hH][wW][eE][sS][tT][eE][rR][nN][mM][uU][tT][uU][aA][lL]
| [nN][oO][rR][tT][oO][nN]
| [nN][oO][wW]
| [nN][oO][wW][rR][uU][zZ]
Expand All @@ -816,7 +790,6 @@ ASCIITLD = "." (
| [oO][kK][iI][nN][aA][wW][aA]
| [oO][lL][aA][yY][aA][nN]
| [oO][lL][aA][yY][aA][nN][gG][rR][oO][uU][pP]
| [oO][lL][dD][nN][aA][vV][yY]
| [oO][lL][lL][oO]
| [oO][mM]
| [oO][mM][eE][gG][aA]
Expand All @@ -842,7 +815,6 @@ ASCIITLD = "." (
| [pP][aA][rR][tT][nN][eE][rR][sS]
| [pP][aA][rR][tT][sS]
| [pP][aA][rR][tT][yY]
| [pP][aA][sS][sS][aA][gG][eE][nN][sS]
| [pP][aA][yY]
| [pP][cC][cC][wW]
| [pP][eE][tT]
Expand Down Expand Up @@ -931,7 +903,6 @@ ASCIITLD = "." (
| [rR][iI][oO]
| [rR][iI][pP]
| [rR][oO]
| [rR][oO][cC][hH][eE][rR]
| [rR][oO][cC][kK][sS]
| [rR][oO][dD][eE][oO]
| [rR][oO][gG][eE][rR][sS]
Expand Down Expand Up @@ -961,7 +932,6 @@ ASCIITLD = "." (
| [sS][aA][xX][oO]
| [sS][bB][iI]
| [sS][bB][sS]
| [sS][cC][aA]
| [sS][cC][bB]
| [sS][cC][hH][aA][eE][fF][fF][lL][eE][rR]
| [sS][cC][hH][mM][iI][dD][tT]
Expand All @@ -980,7 +950,6 @@ ASCIITLD = "." (
| [sS][eE][lL][eE][cC][tT]
| [sS][eE][nN][eE][rR]
| [sS][eE][rR][vV][iI][cC][eE][sS]
| [sS][eE][sS]
| [sS][eE][vV][eE][nN]
| [sS][eE][wW]
| [sS][eE][xX][yY]
Expand All @@ -998,7 +967,6 @@ ASCIITLD = "." (
| [sS][hH][oO][pP][pP][iI][nN][gG]
| [sS][hH][oO][uU][jJ][iI]
| [sS][hH][oO][wW]
| [sS][hH][oO][wW][tT][iI][mM][eE]
| [sS][iI]
| [sS][iI][lL][kK]
| [sS][iI][nN][aA]
Expand Down Expand Up @@ -1087,7 +1055,6 @@ ASCIITLD = "." (
| [tT][iI][aA][aA]
| [tT][iI][cC][kK][eE][tT][sS]
| [tT][iI][eE][nN][dD][aA]
| [tT][iI][fF][fF][aA][nN][yY]
| [tT][iI][pP][sS]
| [tT][iI][rR][eE][sS]
| [tT][iI][rR][oO][lL]
Expand All @@ -1114,7 +1081,6 @@ ASCIITLD = "." (
| [tT][rR][aA][dD][iI][nN][gG]
| [tT][rR][aA][iI][nN][iI][nN][gG]
| [tT][rR][aA][vV][eE][lL]
| [tT][rR][aA][vV][eE][lL][cC][hH][aA][nN][nN][eE][lL]
| [tT][rR][aA][vV][eE][lL][eE][rR][sS]
| [tT][rR][aA][vV][eE][lL][eE][rR][sS][iI][nN][sS][uU][rR][aA][nN][cC][eE]
| [tT][rR][uU][sS][tT]
Expand Down Expand Up @@ -1166,14 +1132,12 @@ ASCIITLD = "." (
| [vV][lL][aA][aA][nN][dD][eE][rR][eE][nN]
| [vV][nN]
| [vV][oO][dD][kK][aA]
| [vV][oO][lL][kK][sS][wW][aA][gG][eE][nN]
| [vV][oO][lL][vV][oO]
| [vV][oO][tT][eE]
| [vV][oO][tT][iI][nN][gG]
| [vV][oO][tT][oO]
| [vV][oO][yY][aA][gG][eE]
| [vV][uU]
| [vV][uU][eE][lL][oO][sS]
| [wW][aA][lL][eE][sS]
| [wW][aA][lL][mM][aA][rR][tT]
| [wW][aA][lL][tT][eE][rR]
Expand Down Expand Up @@ -1209,7 +1173,6 @@ ASCIITLD = "." (
| [wW][tT][fF]
| [xX][bB][oO][xX]
| [xX][eE][rR][oO][xX]
| [xX][fF][iI][nN][iI][tT][yY]
| [xX][iI][hH][uU][aA][nN]
| [xX][iI][nN]
| [xX][nN]--11[bB]4[cC]3[dD]
Expand Down Expand Up @@ -1289,7 +1252,6 @@ ASCIITLD = "." (
| [xX][nN]--[jJ]1[aA][mM][hH]
| [xX][nN]--[jJ]6[wW]193[gG]
| [xX][nN]--[jJ][lL][qQ]480[nN]2[rR][gG]
| [xX][nN]--[jJ][lL][qQ]61[uU]9[wW]7[bB]
| [xX][nN]--[jJ][vV][rR]189[mM]
| [xX][nN]--[kK][cC][rR][xX]77[dD]1[xX]4[aA]
| [xX][nN]--[kK][pP][rR][wW]13[dD]
Expand All @@ -1301,7 +1263,6 @@ ASCIITLD = "." (
| [xX][nN]--[mM][gG][bB][aA]3[aA]3[eE][jJ][tT]
| [xX][nN]--[mM][gG][bB][aA]3[aA]4[fF]16[aA]
| [xX][nN]--[mM][gG][bB][aA]7[cC]0[bB][bB][nN]0[aA]
| [xX][nN]--[mM][gG][bB][aA][aA][kK][cC]7[dD][vV][fF]
| [xX][nN]--[mM][gG][bB][aA][aA][mM]7[aA]8[hH]
| [xX][nN]--[mM][gG][bB][aA][bB]2[bB][dD]
| [xX][nN]--[mM][gG][bB][aA][hH]1[aA]3[hH][jJ][kK][rR][dD]
Expand Down
Loading

0 comments on commit 56d4578

Please sign in to comment.