From 44057d0c0d24083e85da772c0d566c28214d7a79 Mon Sep 17 00:00:00 2001 From: Saeid Darvish Date: Wed, 21 Feb 2024 21:55:31 +0100 Subject: [PATCH 1/7] create persian.sbl --- algorithms/persian.sbl | 117 +++++++++++++++++++++++++++++++++++++++++ python/modules.txt | 62 ++++++++++++++++++++++ 2 files changed, 179 insertions(+) create mode 100644 algorithms/persian.sbl create mode 100644 python/modules.txt diff --git a/algorithms/persian.sbl b/algorithms/persian.sbl new file mode 100644 index 00000000..ed537997 --- /dev/null +++ b/algorithms/persian.sbl @@ -0,0 +1,117 @@ +/* + * Persian Stemming Algorithm + * Author: https://saeiddrv.com +*/ + +stringdef alef '{U+0627}' +stringdef be '{U+0628}' +stringdef pe '{U+067E}' +stringdef te '{U+062A}' +stringdef se '{U+0633}' +stringdef jim '{U+062C}' +stringdef che '{U+0686}' +stringdef he '{U+0647}' +stringdef khe '{U+062E}' +stringdef dal '{U+062F}' +stringdef zal '{U+0630}' +stringdef re '{U+0631}' +stringdef ze '{U+0632}' +stringdef zhe '{U+0698}' +stringdef sin '{U+0633}' +stringdef shin '{U+0634}' +stringdef sad '{U+0635}' +stringdef zad '{U+0636}' +stringdef ta '{U+0637}' +stringdef za '{U+0638}' +stringdef ain '{U+0639}' +stringdef ghain '{U+063A}' +stringdef fe '{U+0641}' +stringdef ghaf '{U+0642}' +stringdef kaf '{U+06A9}' +stringdef gaf '{U+06AF}' +stringdef lam '{U+0644}' +stringdef mim '{U+0645}' +stringdef nun '{U+0646}' +stringdef vav '{U+0648}' +stringdef heh '{U+0647}' +stringdef ye '{U+06CC}' + + +routines ( + Normalize + Prefix + Suffix_Noun + Suffix_Verb + Suffix_Adjective + Post_Normalize +) + + +externals ( stem ) + + +groupings () + + +define Normalize as ( + do repeat ( + [substring] among ( + '{U+06A9}' '{U+0643}' ( <- '{kaf}' ) // Normalize Kaf + '{U+06AF}' '{U+06A9}' ( <- '{gaf}' ) // Normalize Gaf + '{U+06CC}' '{U+064A}' ( <- '{ye}' ) // Normalize Ye + '{U+0647}' '{U+0629}' ( <- '{he}' ) // Normalize Heh + '{U+0627}' '{U+0622}' '{U+0623}' '{U+0625}' ( <- '{alef}' ) // Normalize Alef + '{U+0648}' 
'{U+0624}' ( <- '{vav}' ) // Normalize Waw and Waw with Hamza above + '{U+06CC}' '{U+0626}' ( <- '{ye}' ) // Normalize Ye and Ye with Hamza above + ) + ) +) + + +define Prefix as ( + [substring] among ( + '{be}{alef}{ze}' (delete) // baaz + '{be}{ye}' (delete) // bii + '{be}{re}' (delete) // bar + '{pe}{ye}{shin}' (delete) // pish + '{pe}{re}' (delete) // por + '{pe}{sin}' (delete) // pas + '{dal}{re}' (delete) // dar + '{ze}{ye}{re}' (delete) // ziir + '{sin}{re}' (delete) // sar + '{fe}{re}{alef}' (delete) // faraa + '{he}{mim}' (delete) // ham + '{nun}{alef}' (delete) // naa + ) +) + + +define Suffix_Noun as ( + [substring] among ( + '{he}{alef}' (delete) // haa + '{alef}{nun}' (delete) // aan + '{alef}{te}' (delete) // aat + ) +) + +define Suffix_Verb as ( + [substring] among ( + // Past tense + '{ye}{mim}' '{ye}{ye}' '{ye}{ye}{dal}' '{nun}{dal}' (delete) + // Present tense + '{mim}' '{ye}' '{dal}' '{ye}{mim}' '{ye}{dal}' '{nun}{dal}' (delete) + ) +) + +define Suffix_Adjective as ( + [substring] among ( + '{te}{re}' (delete) // tar (comparative) + '{te}{re}{ye}{nun}' (delete) // tarin (superlative) + ) +) + +define stem as ( + do Normalize + do Prefix + do Suffix_Noun or Suffix_Verb or Suffix_Adjective +) diff --git a/python/modules.txt b/python/modules.txt new file mode 100644 index 00000000..91df6bae --- /dev/null +++ b/python/modules.txt @@ -0,0 +1,62 @@ +# This file contains a list of stemmers to include in the distribution. +# The format is a set of space separated lines - on each line: +# First item is name of stemmer. +# Second item is comma separated list of character sets. +# Third item is comma separated list of names to refer to the stemmer by. +# +# Lines starting with a #, or blank lines, are ignored. + +# List all the main algorithms for each language, in UTF-8, and also with +# the most commonly used encoding. 
+ +arabic UTF_8 arabic,ar,ara +armenian UTF_8 armenian,hy,hye,arm +basque UTF_8,ISO_8859_1 basque,eu,eus,baq +catalan UTF_8,ISO_8859_1 catalan,ca,cat +danish UTF_8,ISO_8859_1 danish,da,dan +dutch UTF_8,ISO_8859_1 dutch,nl,dut,nld +english UTF_8,ISO_8859_1 english,en,eng +estonian UTF_8 estonian,et,est +finnish UTF_8,ISO_8859_1 finnish,fi,fin +french UTF_8,ISO_8859_1 french,fr,fre,fra +german UTF_8,ISO_8859_1 german,de,ger,deu +greek UTF_8 greek,el,gre,ell +hindi UTF_8 hindi,hi,hin +hungarian UTF_8,ISO_8859_2 hungarian,hu,hun +indonesian UTF_8,ISO_8859_1 indonesian,id,ind +irish UTF_8,ISO_8859_1 irish,ga,gle +italian UTF_8,ISO_8859_1 italian,it,ita +lithuanian UTF_8 lithuanian,lt,lit +nepali UTF_8 nepali,ne,nep +norwegian UTF_8,ISO_8859_1 norwegian,no,nor +persian UTF_8 persian,fa,fas,pers +portuguese UTF_8,ISO_8859_1 portuguese,pt,por +romanian UTF_8 romanian,ro,rum,ron +russian UTF_8,KOI8_R russian,ru,rus +serbian UTF_8 serbian,sr,srp +spanish UTF_8,ISO_8859_1 spanish,es,esl,spa +swedish UTF_8,ISO_8859_1 swedish,sv,swe +tamil UTF_8 tamil,ta,tam +turkish UTF_8 turkish,tr,tur +yiddish UTF_8 yiddish,yi,yid + +# Also include the traditional porter algorithm for english. +# The porter algorithm is included in the libstemmer distribution to assist +# with backwards compatibility, but for new systems the english algorithm +# should be used in preference. +porter UTF_8,ISO_8859_1 porter english + +# Some other stemmers in the snowball project are not included in the standard +# distribution. To compile a libstemmer with them in, add them to this list, +# and regenerate the distribution. (You will need a full source checkout for +# this.) They are included in the snowball website as curiosities, but are not +# intended for general use, and use of them is is not fully supported. These +# algorithms are: +# +# kraaij_pohlmann - This is a different dutch stemmer. 
+#kraaij_pohlmann UTF_8,ISO_8859_1 kraaij_pohlmann dutch +# +# lovins - This is an english stemmer, but fairly outdated, and +# only really applicable to a restricted type of input text +# (keywords in academic publications). +#lovins UTF_8,ISO_8859_1 lovins english From f8cfec95710afc79c46264077f7a436b341a89bf Mon Sep 17 00:00:00 2001 From: Saeid Darvish Date: Mon, 26 Feb 2024 20:06:42 +0100 Subject: [PATCH 2/7] fix errors --- algorithms/persian.sbl | 12 +++++------- libstemmer/modules.txt | 1 + python/modules.txt | 1 - 3 files changed, 6 insertions(+), 8 deletions(-) diff --git a/algorithms/persian.sbl b/algorithms/persian.sbl index ed537997..2b001c53 100644 --- a/algorithms/persian.sbl +++ b/algorithms/persian.sbl @@ -56,13 +56,11 @@ groupings () define Normalize as ( do repeat ( [substring] among ( - '{U+06A9}' '{U+0643}' ( <- '{kaf}' ) // Normalize Kaf - '{U+06AF}' '{U+06A9}' ( <- '{gaf}' ) // Normalize Gaf - '{U+06CC}' '{U+064A}' ( <- '{ye}' ) // Normalize Ye - '{U+0647}' '{U+0629}' ( <- '{he}' ) // Normalize Heh - '{U+0627}' '{U+0622}' '{U+0623}' '{U+0625}' ( <- '{alef}' ) // Normalize Alef - '{U+0648}' '{U+0624}' ( <- '{vav}' ) // Normalize Waw and Waw with Hamza above - '{U+06CC}' '{U+0626}' ( <- '{ye}' ) // Normalize Ye and Ye with Hamza above + '{U+0643}' ( <- '{kaf}' ) // Normalize Kaf + '{U+0626}' '{U+064A}' ( <- '{ye}' ) // Normalize Ye + '{U+0629}' ( <- '{he}' ) // Normalize Heh + '{U+0622}' '{U+0623}' '{U+0625}' ( <- '{alef}' ) // Normalize Alef + '{U+0624}' ( <- '{vav}' ) // Normalize Waw and Waw with Hamza above ) ) ) diff --git a/libstemmer/modules.txt b/libstemmer/modules.txt index cd36a219..91df6bae 100644 --- a/libstemmer/modules.txt +++ b/libstemmer/modules.txt @@ -29,6 +29,7 @@ italian UTF_8,ISO_8859_1 italian,it,ita lithuanian UTF_8 lithuanian,lt,lit nepali UTF_8 nepali,ne,nep norwegian UTF_8,ISO_8859_1 norwegian,no,nor +persian UTF_8 persian,fa,fas,pers portuguese UTF_8,ISO_8859_1 portuguese,pt,por romanian UTF_8 romanian,ro,rum,ron 
russian UTF_8,KOI8_R russian,ru,rus diff --git a/python/modules.txt b/python/modules.txt index 91df6bae..cd36a219 100644 --- a/python/modules.txt +++ b/python/modules.txt @@ -29,7 +29,6 @@ italian UTF_8,ISO_8859_1 italian,it,ita lithuanian UTF_8 lithuanian,lt,lit nepali UTF_8 nepali,ne,nep norwegian UTF_8,ISO_8859_1 norwegian,no,nor -persian UTF_8 persian,fa,fas,pers portuguese UTF_8,ISO_8859_1 portuguese,pt,por romanian UTF_8 romanian,ro,rum,ron russian UTF_8,KOI8_R russian,ru,rus From 0248279fc3b8fc2aed3c3a56ac33cc077ff4adcb Mon Sep 17 00:00:00 2001 From: Saeid Darvish Date: Mon, 26 Feb 2024 23:45:48 +0100 Subject: [PATCH 3/7] add exception section --- algorithms/persian.sbl | 90 ++++++++++++++++++++++++++++++------------ 1 file changed, 64 insertions(+), 26 deletions(-) diff --git a/algorithms/persian.sbl b/algorithms/persian.sbl index 2b001c53..63cbeaf3 100644 --- a/algorithms/persian.sbl +++ b/algorithms/persian.sbl @@ -3,6 +3,7 @@ * Author: https://saeiddrv.com */ +stringdef aa '{U+0622}' stringdef alef '{U+0627}' stringdef be '{U+0628}' stringdef pe '{U+067E}' @@ -44,22 +45,20 @@ routines ( Suffix_Verb Suffix_Adjective Post_Normalize + Exception ) externals ( stem ) -groupings () - - define Normalize as ( do repeat ( [substring] among ( '{U+0643}' ( <- '{kaf}' ) // Normalize Kaf '{U+0626}' '{U+064A}' ( <- '{ye}' ) // Normalize Ye '{U+0629}' ( <- '{he}' ) // Normalize Heh - '{U+0622}' '{U+0623}' '{U+0625}' ( <- '{alef}' ) // Normalize Alef + '{U+0623}' '{U+0625}' ( <- '{alef}' ) // Normalize Alef '{U+0624}' ( <- '{vav}' ) // Normalize Waw and Waw with Hamza above ) ) @@ -68,36 +67,42 @@ define Normalize as ( define Prefix as ( [substring] among ( - '{be}{alef}{ze}' (delete) // baaz - '{be}{ye}' (delete) // bii - '{be}{re}' (delete) // bar - '{pe}{ye}{shin}' (delete) // pish - '{pe}{re}' (delete) // por - '{pe}{sin}' (delete) // pas - '{dal}{re}' (delete) // dar - '{ze}{ye}{re}' (delete) // ziir - '{sin}{re}' (delete) // sar - '{fe}{re}{alef}' (delete) // 
faraa - '{he}{mim}' (delete) // ham - '{nun}{alef}' (delete) // naa + '{be}{ye}{shin}' (delete) // bish + '{be}{ye}' ($(len > 3)delete) // bii + '{pe}{ye}{shin}' (delete) // pish + '{pe}{sin}' (delete) // pas + '{ze}{ye}{re}' (delete) // ziir + '{he}{mim}' (delete) // ham + '{nun}{alef}' (delete) // naa + '{mim}{ye}' (delete) // mii ) ) define Suffix_Noun as ( [substring] among ( - '{he}{alef}' (delete) // haa - '{alef}{nun}' (delete) // aan - '{alef}{te}' (delete) // aat + '{gaf}{alef}{heh}' (delete) // gaah + '{he}{alef}{ye}' (delete) // haaye + '{he}{alef}' (delete) // haa + '{alef}{te}' (delete) // aat + '{sin}{te}{alef}{nun}' (delete) // setan + '{ye}{te}' (delete) // yat + + '{gaf}{alef}{nun}' ( <- '{he}' ) // gaan -> h ) ) define Suffix_Verb as ( [substring] among ( - // Past tense - '{ye}{mim}' '{ye}{ye}' '{ye}{ye}{dal}' '{nun}{dal}' (delete) - // Present tense - '{mim}' '{ye}' '{dal}' '{ye}{mim}' '{ye}{dal}' '{nun}{dal}' (delete) + '{ye}{mim}' (delete) + '{ye}{ye}' (delete) + '{ye}{ye}{dal}' (delete) + '{nun}{dal}' (delete) + '{mim}' (delete) + '{ye}' (delete) + '{ye}{mim}' (delete) + '{ye}{dal}' (delete) + '{nun}{dal}' (delete) ) ) @@ -108,8 +113,41 @@ define Suffix_Adjective as ( ) ) +define Exception as ( + [substring] among ( + '{sin}{ye}' + '{dal}{re}' + '{alef}{ye}{nun}' + '{alef}{ye}' + '{be}{re}' + '{nun}{alef}{mim}' + '{pe}{sin}' + '{alef}{sin}{te}{alef}{nun}' + '{be}{re}{alef}{ye}' + '{pe}{ye}{vav}{nun}{dal}' + '{mim}{ye}{lam}{alef}{dal}{ye}' + '{mim}{lam}{ye}' + '{che}{nun}{dal}' + '{be}{re}{khe}{ye}' + '{he}{mim}{ye}{nun}' + '{he}{mim}{alef}{nun}' + '{he}{mim}{he}' + '{mim}{te}{re}' + '{te}{ye}{mim}' + '{sin}{lam}{alef}{mim}' + '{alef}{sin}{lam}{alef}{mim}' + '{ye}{ain}{nun}{ye}' + '{aa}{lam}{be}{vav}{mim}' + ) +) + define stem as ( - do Normalize - do Prefix - do Suffix_Noun or Suffix_Verb or Suffix_Adjective + Exception + or ( + do Normalize + do Prefix + do Suffix_Noun + do Suffix_Adjective + do Suffix_Verb + ) ) From 
90c083cc14e34a1f2df0c19d7b09a8ef1ee58f71 Mon Sep 17 00:00:00 2001 From: Saeid Darvish Date: Tue, 27 Feb 2024 08:48:14 +0100 Subject: [PATCH 4/7] fix exception section --- algorithms/persian.sbl | 61 ++++++++++++++++++++---------------------- 1 file changed, 29 insertions(+), 32 deletions(-) diff --git a/algorithms/persian.sbl b/algorithms/persian.sbl index 63cbeaf3..91c11f9c 100644 --- a/algorithms/persian.sbl +++ b/algorithms/persian.sbl @@ -39,13 +39,12 @@ stringdef ye '{U+06CC}' routines ( + Exception Normalize Prefix Suffix_Noun - Suffix_Verb Suffix_Adjective - Post_Normalize - Exception + Suffix_Verb ) @@ -55,11 +54,11 @@ externals ( stem ) define Normalize as ( do repeat ( [substring] among ( - '{U+0643}' ( <- '{kaf}' ) // Normalize Kaf + '{U+0643}' ( <- '{kaf}' ) // Normalize Kaf '{U+0626}' '{U+064A}' ( <- '{ye}' ) // Normalize Ye - '{U+0629}' ( <- '{he}' ) // Normalize Heh + '{U+0629}' ( <- '{he}' ) // Normalize Heh '{U+0623}' '{U+0625}' ( <- '{alef}' ) // Normalize Alef - '{U+0624}' ( <- '{vav}' ) // Normalize Waw and Waw with Hamza above + '{U+0624}' ( <- '{vav}' ) // Normalize Waw and Waw with Hamza above ) ) ) @@ -97,10 +96,8 @@ define Suffix_Verb as ( '{ye}{mim}' (delete) '{ye}{ye}' (delete) '{ye}{ye}{dal}' (delete) - '{nun}{dal}' (delete) '{mim}' (delete) '{ye}' (delete) - '{ye}{mim}' (delete) '{ye}{dal}' (delete) '{nun}{dal}' (delete) ) @@ -115,34 +112,34 @@ define Suffix_Adjective as ( define Exception as ( [substring] among ( - '{sin}{ye}' - '{dal}{re}' - '{alef}{ye}{nun}' - '{alef}{ye}' - '{be}{re}' - '{nun}{alef}{mim}' - '{pe}{sin}' - '{alef}{sin}{te}{alef}{nun}' - '{be}{re}{alef}{ye}' - '{pe}{ye}{vav}{nun}{dal}' - '{mim}{ye}{lam}{alef}{dal}{ye}' - '{mim}{lam}{ye}' - '{che}{nun}{dal}' - '{be}{re}{khe}{ye}' - '{he}{mim}{ye}{nun}' - '{he}{mim}{alef}{nun}' - '{he}{mim}{he}' - '{mim}{te}{re}' - '{te}{ye}{mim}' - '{sin}{lam}{alef}{mim}' - '{alef}{sin}{lam}{alef}{mim}' - '{ye}{ain}{nun}{ye}' - '{aa}{lam}{be}{vav}{mim}' + '{sin}{ye}' () + '{dal}{re}' () 
+ '{alef}{ye}{nun}' () + '{alef}{ye}' () + '{be}{re}' () + '{nun}{alef}{mim}' () + '{pe}{sin}' () + '{alef}{sin}{te}{alef}{nun}' () + '{be}{re}{alef}{ye}' () + '{pe}{ye}{vav}{nun}{dal}' () + '{mim}{ye}{lam}{alef}{dal}{ye}' () + '{mim}{lam}{ye}' () + '{che}{nun}{dal}' () + '{be}{re}{khe}{ye}' () + '{he}{mim}{ye}{nun}' () + '{he}{mim}{alef}{nun}' () + '{he}{mim}{he}' () + '{mim}{te}{re}' () + '{te}{ye}{mim}' () + '{sin}{lam}{alef}{mim}' () + '{alef}{sin}{lam}{alef}{mim}' () + '{ye}{ain}{nun}{ye}' () + '{aa}{lam}{be}{vav}{mim}' () ) ) define stem as ( - Exception + ( Exception ) or ( do Normalize do Prefix From 40f59b31332249359fe93cda4644075f11f62b4c Mon Sep 17 00:00:00 2001 From: Saeid Darvish Date: Tue, 27 Feb 2024 10:39:59 +0100 Subject: [PATCH 5/7] add Suffix_Normalize --- algorithms/persian.sbl | 66 ++++++++++++++++++++++++++++-------------- 1 file changed, 44 insertions(+), 22 deletions(-) diff --git a/algorithms/persian.sbl b/algorithms/persian.sbl index 91c11f9c..a2a36cae 100644 --- a/algorithms/persian.sbl +++ b/algorithms/persian.sbl @@ -3,6 +3,8 @@ * Author: https://saeiddrv.com */ +stringescapes { } + stringdef aa '{U+0622}' stringdef alef '{U+0627}' stringdef be '{U+0628}' @@ -45,12 +47,16 @@ routines ( Suffix_Noun Suffix_Adjective Suffix_Verb + Suffix_Normalize ) externals ( stem ) +groupings ( ) + + define Normalize as ( do repeat ( [substring] among ( @@ -59,6 +65,9 @@ define Normalize as ( '{U+0629}' ( <- '{he}' ) // Normalize Heh '{U+0623}' '{U+0625}' ( <- '{alef}' ) // Normalize Alef '{U+0624}' ( <- '{vav}' ) // Normalize Waw and Waw with Hamza above + + + '{gaf}{alef}{nun}' ( <- '{he}' ) // gaan -> he ) ) ) @@ -67,46 +76,58 @@ define Normalize as ( define Prefix as ( [substring] among ( '{be}{ye}{shin}' (delete) // bish - '{be}{ye}' ($(len > 3)delete) // bii + '{be}{ye}' ($(len > 3) delete) // bii '{pe}{ye}{shin}' (delete) // pish '{pe}{sin}' (delete) // pas '{ze}{ye}{re}' (delete) // ziir '{he}{mim}' (delete) // ham '{nun}{alef}' (delete) // naa 
- '{mim}{ye}' (delete) // mii + '{mim}{ye}' ($(len > 3) delete) // mii ) ) define Suffix_Noun as ( - [substring] among ( - '{gaf}{alef}{heh}' (delete) // gaah - '{he}{alef}{ye}' (delete) // haaye - '{he}{alef}' (delete) // haa - '{alef}{te}' (delete) // aat - '{sin}{te}{alef}{nun}' (delete) // setan - '{ye}{te}' (delete) // yat - - '{gaf}{alef}{nun}' ( <- '{he}' ) // gaan -> h + do backwards ( + [substring] among ( + '{gaf}{alef}{heh}' (delete) // gaah + '{he}{alef}{ye}' (delete) // haaye + '{he}{alef}' (delete) // haa + '{alef}{te}' (delete) // aat + '{sin}{te}{alef}{nun}' (delete) // setan + '{ye}{te}' (delete) // yat + ) ) ) define Suffix_Verb as ( - [substring] among ( - '{ye}{mim}' (delete) - '{ye}{ye}' (delete) - '{ye}{ye}{dal}' (delete) - '{mim}' (delete) - '{ye}' (delete) - '{ye}{dal}' (delete) - '{nun}{dal}' (delete) + do backwards ( + [substring] among ( + '{ye}{mim}' ($(len > 3) delete) + '{ye}{ye}' (delete) + '{ye}{ye}{dal}' (delete) + '{mim}' ($(len > 2) delete) + '{ye}' ($(len > 2) delete) + '{ye}{dal}' ($(len > 3) delete) + '{nun}{dal}' ($(len > 3) delete) + ) ) ) define Suffix_Adjective as ( - [substring] among ( - '{te}{re}' (delete) // tar (comparative) - '{te}{re}{ye}{nun}' (delete) // tarin (superlative) + do backwards ( + [substring] among ( + '{te}{re}' (delete) // tar (comparative) + '{te}{re}{ye}{nun}' (delete) // tarin (superlative) + ) + ) +) + +define Suffix_Normalize as ( + do backwards ( + [substring] among ( + '{gaf}{alef}{nun}' ( <- '{he}' ) // gaan -> he + ) ) ) @@ -146,5 +167,6 @@ define stem as ( do Suffix_Noun do Suffix_Adjective do Suffix_Verb + do Suffix_Normalize ) ) From bfc5974ecc691b7816502823644353acb8a46dde Mon Sep 17 00:00:00 2001 From: Saeid Darvish Date: Tue, 27 Feb 2024 11:17:52 +0100 Subject: [PATCH 6/7] fix modules.txt --- python/modules.txt | 61 ---------------------------------------------- 1 file changed, 61 deletions(-) delete mode 100644 python/modules.txt diff --git a/python/modules.txt b/python/modules.txt 
deleted file mode 100644 index cd36a219..00000000 --- a/python/modules.txt +++ /dev/null @@ -1,61 +0,0 @@ -# This file contains a list of stemmers to include in the distribution. -# The format is a set of space separated lines - on each line: -# First item is name of stemmer. -# Second item is comma separated list of character sets. -# Third item is comma separated list of names to refer to the stemmer by. -# -# Lines starting with a #, or blank lines, are ignored. - -# List all the main algorithms for each language, in UTF-8, and also with -# the most commonly used encoding. - -arabic UTF_8 arabic,ar,ara -armenian UTF_8 armenian,hy,hye,arm -basque UTF_8,ISO_8859_1 basque,eu,eus,baq -catalan UTF_8,ISO_8859_1 catalan,ca,cat -danish UTF_8,ISO_8859_1 danish,da,dan -dutch UTF_8,ISO_8859_1 dutch,nl,dut,nld -english UTF_8,ISO_8859_1 english,en,eng -estonian UTF_8 estonian,et,est -finnish UTF_8,ISO_8859_1 finnish,fi,fin -french UTF_8,ISO_8859_1 french,fr,fre,fra -german UTF_8,ISO_8859_1 german,de,ger,deu -greek UTF_8 greek,el,gre,ell -hindi UTF_8 hindi,hi,hin -hungarian UTF_8,ISO_8859_2 hungarian,hu,hun -indonesian UTF_8,ISO_8859_1 indonesian,id,ind -irish UTF_8,ISO_8859_1 irish,ga,gle -italian UTF_8,ISO_8859_1 italian,it,ita -lithuanian UTF_8 lithuanian,lt,lit -nepali UTF_8 nepali,ne,nep -norwegian UTF_8,ISO_8859_1 norwegian,no,nor -portuguese UTF_8,ISO_8859_1 portuguese,pt,por -romanian UTF_8 romanian,ro,rum,ron -russian UTF_8,KOI8_R russian,ru,rus -serbian UTF_8 serbian,sr,srp -spanish UTF_8,ISO_8859_1 spanish,es,esl,spa -swedish UTF_8,ISO_8859_1 swedish,sv,swe -tamil UTF_8 tamil,ta,tam -turkish UTF_8 turkish,tr,tur -yiddish UTF_8 yiddish,yi,yid - -# Also include the traditional porter algorithm for english. -# The porter algorithm is included in the libstemmer distribution to assist -# with backwards compatibility, but for new systems the english algorithm -# should be used in preference. 
-porter UTF_8,ISO_8859_1 porter english - -# Some other stemmers in the snowball project are not included in the standard -# distribution. To compile a libstemmer with them in, add them to this list, -# and regenerate the distribution. (You will need a full source checkout for -# this.) They are included in the snowball website as curiosities, but are not -# intended for general use, and use of them is is not fully supported. These -# algorithms are: -# -# kraaij_pohlmann - This is a different dutch stemmer. -#kraaij_pohlmann UTF_8,ISO_8859_1 kraaij_pohlmann dutch -# -# lovins - This is an english stemmer, but fairly outdated, and -# only really applicable to a restricted type of input text -# (keywords in academic publications). -#lovins UTF_8,ISO_8859_1 lovins english From 27975e60c4445a4fbebe2e978974d67ec560ab3d Mon Sep 17 00:00:00 2001 From: Saeid Darvish Date: Thu, 4 Apr 2024 19:52:54 +0200 Subject: [PATCH 7/7] define arabic characters --- algorithms/persian.sbl | 65 ++++++++++++++++++++++++------------------ 1 file changed, 37 insertions(+), 28 deletions(-) diff --git a/algorithms/persian.sbl b/algorithms/persian.sbl index a2a36cae..8b0c921b 100644 --- a/algorithms/persian.sbl +++ b/algorithms/persian.sbl @@ -40,6 +40,15 @@ stringdef heh '{U+0647}' stringdef ye '{U+06CC}' +stringdef ar_kaf '{U+0643}' +stringdef ar_ye '{U+064A}' +stringdef ar_ye_with_hamza_above '{U+0626}' +stringdef ar_he_marbuta '{U+0629}' +stringdef ar_alef_with_hamza_below '{U+0625}' +stringdef ar_alef_with_hamza_above '{U+0623}' +stringdef ar_vav_with_hamza_above '{U+0624}' + + routines ( Exception Normalize @@ -60,11 +69,11 @@ groupings ( ) define Normalize as ( do repeat ( [substring] among ( - '{U+0643}' ( <- '{kaf}' ) // Normalize Kaf - '{U+0626}' '{U+064A}' ( <- '{ye}' ) // Normalize Ye - '{U+0629}' ( <- '{he}' ) // Normalize Heh - '{U+0623}' '{U+0625}' ( <- '{alef}' ) // Normalize Alef - '{U+0624}' ( <- '{vav}' ) // Normalize Waw and Waw with Hamza above + '{ar_kaf}' ( <- '{kaf}' ) 
+            '{ar_ye_with_hamza_above}' '{ar_ye}' ( <- '{ye}' )
+            '{ar_he_marbuta}' ( <- '{he}' )
+            '{ar_alef_with_hamza_above}' '{ar_alef_with_hamza_below}' ( <- '{alef}' )
+            '{ar_vav_with_hamza_above}' ( <- '{vav}' )


             '{gaf}{alef}{nun}' ( <- '{he}' ) // gaan -> he
@@ -133,28 +142,30 @@ define Suffix_Normalize as (

 define Exception as (
+    // Whole-word exception list: stem tries this routine first and skips all affix stripping when it succeeds.
     [substring] among (
-        '{sin}{ye}' ()
-        '{dal}{re}' ()
-        '{alef}{ye}{nun}' ()
-        '{alef}{ye}' ()
-        '{be}{re}' ()
-        '{nun}{alef}{mim}' ()
-        '{pe}{sin}' ()
-        '{alef}{sin}{te}{alef}{nun}' ()
-        '{be}{re}{alef}{ye}' ()
-        '{pe}{ye}{vav}{nun}{dal}' ()
-        '{mim}{ye}{lam}{alef}{dal}{ye}' ()
-        '{mim}{lam}{ye}' ()
-        '{che}{nun}{dal}' ()
-        '{be}{re}{khe}{ye}' ()
-        '{he}{mim}{ye}{nun}' ()
-        '{he}{mim}{alef}{nun}' ()
-        '{he}{mim}{he}' ()
-        '{mim}{te}{re}' ()
-        '{te}{ye}{mim}' ()
-        '{sin}{lam}{alef}{mim}' ()
-        '{alef}{sin}{lam}{alef}{mim}' ()
-        '{ye}{ain}{nun}{ye}' ()
-        '{aa}{lam}{be}{vav}{mim}' ()
+        '{sin}{ye}'
+        '{dal}{re}'
+        '{alef}{ye}{nun}'
+        '{alef}{ye}'
+        '{be}{re}'
+        '{nun}{alef}{mim}'
+        '{pe}{sin}'
+        '{alef}{sin}{te}{alef}{nun}'
+        '{be}{re}{alef}{ye}'
+        '{pe}{ye}{vav}{nun}{dal}'
+        '{mim}{ye}{lam}{alef}{dal}{ye}'
+        '{mim}{lam}{ye}'
+        '{che}{nun}{dal}'
+        '{be}{re}{khe}{ye}'
+        '{he}{mim}{ye}{nun}'
+        '{he}{mim}{alef}{nun}'
+        '{he}{mim}{he}'
+        '{mim}{te}{re}'
+        '{te}{ye}{mim}'
+        '{sin}{lam}{alef}{mim}'
+        '{alef}{sin}{lam}{alef}{mim}'
+        '{ye}{ain}{nun}{ye}'
+        '{aa}{lam}{be}{vav}{mim}'
     )
+    atlimit // the matched entry must span the whole word, not merely start it
 )