From 44057d0c0d24083e85da772c0d566c28214d7a79 Mon Sep 17 00:00:00 2001
From: Saeid Darvish <saeid.drgh@gmail.com>
Date: Wed, 21 Feb 2024 21:55:31 +0100
Subject: [PATCH 1/7] Add Persian stemmer (persian.sbl) and python/modules.txt

---
 algorithms/persian.sbl | 117 +++++++++++++++++++++++++++++++++++++++++
 python/modules.txt     |  62 ++++++++++++++++++++++
 2 files changed, 179 insertions(+)
 create mode 100644 algorithms/persian.sbl
 create mode 100644 python/modules.txt

diff --git a/algorithms/persian.sbl b/algorithms/persian.sbl
new file mode 100644
index 00000000..ed537997
--- /dev/null
+++ b/algorithms/persian.sbl
@@ -0,0 +1,117 @@
+/*
+ * Persian Stemming Algorithm
+ * Author: https://saeiddrv.com
+*/
+
+stringdef alef    '{U+0627}'
+stringdef be      '{U+0628}'
+stringdef pe      '{U+067E}'
+stringdef te      '{U+062A}'
+stringdef se      '{U+062B}'  // ARABIC LETTER THEH; U+0633 is SEEN, which 'sin' already defines
+stringdef jim     '{U+062C}'
+stringdef che     '{U+0686}'
+stringdef he      '{U+0647}'  // same code point as 'heh' (U+0647); the haa/ham rules written with {he} rely on this value
+stringdef khe     '{U+062E}'
+stringdef dal     '{U+062F}'
+stringdef zal     '{U+0630}'
+stringdef re      '{U+0631}'
+stringdef ze      '{U+0632}'
+stringdef zhe     '{U+0698}'
+stringdef sin     '{U+0633}'
+stringdef shin    '{U+0634}'
+stringdef sad     '{U+0635}'
+stringdef zad     '{U+0636}'
+stringdef ta      '{U+0637}'
+stringdef za      '{U+0638}'
+stringdef ain     '{U+0639}'
+stringdef ghain   '{U+063A}'
+stringdef fe      '{U+0641}'
+stringdef ghaf    '{U+0642}'
+stringdef kaf     '{U+06A9}'
+stringdef gaf     '{U+06AF}'
+stringdef lam     '{U+0644}'
+stringdef mim     '{U+0645}'
+stringdef nun     '{U+0646}'
+stringdef vav     '{U+0648}'
+stringdef heh     '{U+0647}'
+stringdef ye      '{U+06CC}'
+
+
+routines (
+    Normalize
+    Prefix
+    Suffix_Noun
+    Suffix_Verb
+    Suffix_Adjective
+    Post_Normalize
+)
+
+
+externals ( stem )
+
+
+groupings ()
+
+
+define Normalize as (
+    do repeat (
+        [substring] among (
+            '{U+06A9}' '{U+0643}' ( <- '{kaf}' ) // Normalize Kaf
+            '{U+06AF}' '{U+06A9}' ( <- '{gaf}' ) // Normalize Gaf
+            '{U+06CC}' '{U+064A}' ( <- '{ye}'  )  // Normalize Ye
+            '{U+0647}' '{U+0629}' ( <- '{he}'  )  // Normalize Heh
+            '{U+0627}' '{U+0622}' '{U+0623}' '{U+0625}' ( <- '{alef}' ) // Normalize Alef
+            '{U+0648}' '{U+0624}' ( <- '{vav}' )  // Normalize Waw and Waw with Hamza above
+            '{U+06CC}' '{U+0626}' ( <- '{ye}' )   // Normalize Ye and Ye with Hamza above
+        )
+    )
+)
+
+
+define Prefix as (
+    [substring] among (
+        '{be}{alef}{ze}' (delete)  // baaz
+        '{be}{ye}' (delete)        // bii
+        '{be}{re}' (delete)        // bar
+        '{pe}{ye}{shin}' (delete)  // pish
+        '{pe}{re}' (delete)        // por
+        '{pe}{sin}' (delete)       // pas
+        '{dal}{re}' (delete)       // dar
+        '{ze}{ye}{re}' (delete)    // ziir
+        '{sin}{re}' (delete)       // sar
+        '{fe}{re}{alef}' (delete)  // faraa
+        '{he}{mim}' (delete)       // ham
+        '{nun}{alef}' (delete)     // naa
+    )
+)
+
+
+define Suffix_Noun as (
+    [substring] among (
+        '{he}{alef}' (delete)  // haa
+        '{alef}{nun}' (delete) // aan
+        '{alef}{te}' (delete)  // aat
+    )
+)
+
+define Suffix_Verb as (
+    [substring] among (
+        // Past tense
+        '{ye}{mim}' '{ye}{ye}' '{ye}{ye}{dal}' '{nun}{dal}' (delete)
+        // Present tense
+        '{mim}' '{ye}' '{dal}' '{ye}{mim}' '{ye}{dal}' '{nun}{dal}' (delete)
+    )
+)
+
+define Suffix_Adjective as (
+    [substring] among (
+        '{te}{re}' (delete) // tar (comparative)
+        '{te}{re}{ye}{nun}' (delete)  // tarin (superlative)
+    )
+)
+
+define stem as (
+    do Normalize
+    do Prefix
+    do Suffix_Noun or Suffix_Verb or Suffix_Adjective
+)
diff --git a/python/modules.txt b/python/modules.txt
new file mode 100644
index 00000000..91df6bae
--- /dev/null
+++ b/python/modules.txt
@@ -0,0 +1,62 @@
+# This file contains a list of stemmers to include in the distribution.
+# The format is a set of space separated lines - on each line:
+#  First item is name of stemmer.
+#  Second item is comma separated list of character sets.
+#  Third item is comma separated list of names to refer to the stemmer by.
+#
+# Lines starting with a #, or blank lines, are ignored.
+
+# List all the main algorithms for each language, in UTF-8, and also with
+# the most commonly used encoding.
+
+arabic          UTF_8                   arabic,ar,ara
+armenian        UTF_8                   armenian,hy,hye,arm
+basque          UTF_8,ISO_8859_1        basque,eu,eus,baq
+catalan         UTF_8,ISO_8859_1        catalan,ca,cat
+danish          UTF_8,ISO_8859_1        danish,da,dan
+dutch           UTF_8,ISO_8859_1        dutch,nl,dut,nld
+english         UTF_8,ISO_8859_1        english,en,eng
+estonian        UTF_8                   estonian,et,est
+finnish         UTF_8,ISO_8859_1        finnish,fi,fin
+french          UTF_8,ISO_8859_1        french,fr,fre,fra
+german          UTF_8,ISO_8859_1        german,de,ger,deu
+greek           UTF_8                   greek,el,gre,ell
+hindi           UTF_8                   hindi,hi,hin
+hungarian       UTF_8,ISO_8859_2        hungarian,hu,hun
+indonesian      UTF_8,ISO_8859_1        indonesian,id,ind
+irish           UTF_8,ISO_8859_1        irish,ga,gle
+italian         UTF_8,ISO_8859_1        italian,it,ita
+lithuanian      UTF_8                   lithuanian,lt,lit
+nepali          UTF_8                   nepali,ne,nep
+norwegian       UTF_8,ISO_8859_1        norwegian,no,nor
+persian         UTF_8                   persian,fa,fas,pers
+portuguese      UTF_8,ISO_8859_1        portuguese,pt,por
+romanian        UTF_8                   romanian,ro,rum,ron
+russian         UTF_8,KOI8_R            russian,ru,rus
+serbian         UTF_8                   serbian,sr,srp
+spanish         UTF_8,ISO_8859_1        spanish,es,esl,spa
+swedish         UTF_8,ISO_8859_1        swedish,sv,swe
+tamil           UTF_8                   tamil,ta,tam
+turkish         UTF_8                   turkish,tr,tur
+yiddish         UTF_8                   yiddish,yi,yid
+
+# Also include the traditional porter algorithm for english.
+# The porter algorithm is included in the libstemmer distribution to assist
+# with backwards compatibility, but for new systems the english algorithm
+# should be used in preference.
+porter          UTF_8,ISO_8859_1        porter			english
+
+# Some other stemmers in the snowball project are not included in the standard
+# distribution. To compile a libstemmer with them in, add them to this list,
+# and regenerate the distribution. (You will need a full source checkout for
+# this.) They are included in the snowball website as curiosities, but are not
+# intended for general use, and use of them is is not fully supported.  These
+# algorithms are:
+#
+# kraaij_pohlmann  - This is a different dutch stemmer.
+#kraaij_pohlmann  UTF_8,ISO_8859_1        kraaij_pohlmann	dutch
+#
+# lovins           - This is an english stemmer, but fairly outdated, and
+#                    only really applicable to a restricted type of input text
+#                    (keywords in academic publications).
+#lovins           UTF_8,ISO_8859_1        lovins		english

From f8cfec95710afc79c46264077f7a436b341a89bf Mon Sep 17 00:00:00 2001
From: Saeid Darvish <saeid.drgh@gmail.com>
Date: Mon, 26 Feb 2024 20:06:42 +0100
Subject: [PATCH 2/7] persian: dedupe Normalize table, register in libstemmer

---
 algorithms/persian.sbl | 12 +++++-------
 libstemmer/modules.txt |  1 +
 python/modules.txt     |  1 -
 3 files changed, 6 insertions(+), 8 deletions(-)

diff --git a/algorithms/persian.sbl b/algorithms/persian.sbl
index ed537997..2b001c53 100644
--- a/algorithms/persian.sbl
+++ b/algorithms/persian.sbl
@@ -56,13 +56,11 @@ groupings ()
 define Normalize as (
     do repeat (
         [substring] among (
-            '{U+06A9}' '{U+0643}' ( <- '{kaf}' ) // Normalize Kaf
-            '{U+06AF}' '{U+06A9}' ( <- '{gaf}' ) // Normalize Gaf
-            '{U+06CC}' '{U+064A}' ( <- '{ye}'  )  // Normalize Ye
-            '{U+0647}' '{U+0629}' ( <- '{he}'  )  // Normalize Heh
-            '{U+0627}' '{U+0622}' '{U+0623}' '{U+0625}' ( <- '{alef}' ) // Normalize Alef
-            '{U+0648}' '{U+0624}' ( <- '{vav}' )  // Normalize Waw and Waw with Hamza above
-            '{U+06CC}' '{U+0626}' ( <- '{ye}' )   // Normalize Ye and Ye with Hamza above
+            '{U+0643}' ( <- '{kaf}' ) // Normalize Kaf
+            '{U+0626}' '{U+064A}' ( <- '{ye}'  )  // Normalize Ye
+            '{U+0629}' ( <- '{he}'  )  // Normalize Heh
+            '{U+0622}' '{U+0623}' '{U+0625}' ( <- '{alef}' ) // Normalize Alef
+            '{U+0624}' ( <- '{vav}' )  // Normalize Waw and Waw with Hamza above
         )
     )
 )
diff --git a/libstemmer/modules.txt b/libstemmer/modules.txt
index cd36a219..91df6bae 100644
--- a/libstemmer/modules.txt
+++ b/libstemmer/modules.txt
@@ -29,6 +29,7 @@ italian         UTF_8,ISO_8859_1        italian,it,ita
 lithuanian      UTF_8                   lithuanian,lt,lit
 nepali          UTF_8                   nepali,ne,nep
 norwegian       UTF_8,ISO_8859_1        norwegian,no,nor
+persian         UTF_8                   persian,fa,fas,pers
 portuguese      UTF_8,ISO_8859_1        portuguese,pt,por
 romanian        UTF_8                   romanian,ro,rum,ron
 russian         UTF_8,KOI8_R            russian,ru,rus
diff --git a/python/modules.txt b/python/modules.txt
index 91df6bae..cd36a219 100644
--- a/python/modules.txt
+++ b/python/modules.txt
@@ -29,7 +29,6 @@ italian         UTF_8,ISO_8859_1        italian,it,ita
 lithuanian      UTF_8                   lithuanian,lt,lit
 nepali          UTF_8                   nepali,ne,nep
 norwegian       UTF_8,ISO_8859_1        norwegian,no,nor
-persian         UTF_8                   persian,fa,fas,pers
 portuguese      UTF_8,ISO_8859_1        portuguese,pt,por
 romanian        UTF_8                   romanian,ro,rum,ron
 russian         UTF_8,KOI8_R            russian,ru,rus

From 0248279fc3b8fc2aed3c3a56ac33cc077ff4adcb Mon Sep 17 00:00:00 2001
From: Saeid Darvish <saeid.drgh@gmail.com>
Date: Mon, 26 Feb 2024 23:45:48 +0100
Subject: [PATCH 3/7] add exception section

---
 algorithms/persian.sbl | 90 ++++++++++++++++++++++++++++++------------
 1 file changed, 64 insertions(+), 26 deletions(-)

diff --git a/algorithms/persian.sbl b/algorithms/persian.sbl
index 2b001c53..63cbeaf3 100644
--- a/algorithms/persian.sbl
+++ b/algorithms/persian.sbl
@@ -3,6 +3,7 @@
  * Author: https://saeiddrv.com
 */
 
+stringdef aa      '{U+0622}'
 stringdef alef    '{U+0627}'
 stringdef be      '{U+0628}'
 stringdef pe      '{U+067E}'
@@ -44,22 +45,20 @@ routines (
     Suffix_Verb
     Suffix_Adjective
     Post_Normalize
+    Exception
 )
 
 
 externals ( stem )
 
 
-groupings ()
-
-
 define Normalize as (
     do repeat (
         [substring] among (
             '{U+0643}' ( <- '{kaf}' ) // Normalize Kaf
             '{U+0626}' '{U+064A}' ( <- '{ye}'  )  // Normalize Ye
             '{U+0629}' ( <- '{he}'  )  // Normalize Heh
-            '{U+0622}' '{U+0623}' '{U+0625}' ( <- '{alef}' ) // Normalize Alef
+            '{U+0623}' '{U+0625}' ( <- '{alef}' ) // Normalize Alef
             '{U+0624}' ( <- '{vav}' )  // Normalize Waw and Waw with Hamza above
         )
     )
@@ -68,36 +67,42 @@ define Normalize as (
 
 define Prefix as (
     [substring] among (
-        '{be}{alef}{ze}' (delete)  // baaz
-        '{be}{ye}' (delete)        // bii
-        '{be}{re}' (delete)        // bar
-        '{pe}{ye}{shin}' (delete)  // pish
-        '{pe}{re}' (delete)        // por
-        '{pe}{sin}' (delete)       // pas
-        '{dal}{re}' (delete)       // dar
-        '{ze}{ye}{re}' (delete)    // ziir
-        '{sin}{re}' (delete)       // sar
-        '{fe}{re}{alef}' (delete)  // faraa
-        '{he}{mim}' (delete)       // ham
-        '{nun}{alef}' (delete)     // naa
+        '{be}{ye}{shin}' (delete)       // bish
+        '{be}{ye}' ($(len > 3)delete)   // bii
+        '{pe}{ye}{shin}' (delete)       // pish
+        '{pe}{sin}' (delete)            // pas
+        '{ze}{ye}{re}' (delete)         // ziir
+        '{he}{mim}' (delete)            // ham
+        '{nun}{alef}' (delete)          // naa
+        '{mim}{ye}' (delete)            // mii
     )
 )
 
 
 define Suffix_Noun as (
     [substring] among (
-        '{he}{alef}' (delete)  // haa
-        '{alef}{nun}' (delete) // aan
-        '{alef}{te}' (delete)  // aat
+        '{gaf}{alef}{heh}' (delete)       // gaah
+        '{he}{alef}{ye}' (delete)         // haaye
+        '{he}{alef}' (delete)             // haa
+        '{alef}{te}' (delete)             // aat
+        '{sin}{te}{alef}{nun}' (delete)   // setan
+        '{ye}{te}' (delete)               // yat
+
+        '{gaf}{alef}{nun}' ( <- '{he}'  ) // gaan -> h
     )
 )
 
 define Suffix_Verb as (
     [substring] among (
-        // Past tense
-        '{ye}{mim}' '{ye}{ye}' '{ye}{ye}{dal}' '{nun}{dal}' (delete)
-        // Present tense
-        '{mim}' '{ye}' '{dal}' '{ye}{mim}' '{ye}{dal}' '{nun}{dal}' (delete)
+        '{ye}{mim}' (delete)
+        '{ye}{ye}' (delete)
+        '{ye}{ye}{dal}' (delete)
+        '{nun}{dal}' (delete)
+        '{mim}' (delete)
+        '{ye}' (delete)
+        '{ye}{mim}' (delete)
+        '{ye}{dal}' (delete)
+        '{nun}{dal}' (delete)
     )
 )
 
@@ -108,8 +113,41 @@ define Suffix_Adjective as (
     )
 )
 
+define Exception as (
+    [substring] among (
+        '{sin}{ye}'
+        '{dal}{re}'
+        '{alef}{ye}{nun}'
+        '{alef}{ye}'
+        '{be}{re}'
+        '{nun}{alef}{mim}'
+        '{pe}{sin}'
+        '{alef}{sin}{te}{alef}{nun}'
+        '{be}{re}{alef}{ye}'
+        '{pe}{ye}{vav}{nun}{dal}'
+        '{mim}{ye}{lam}{alef}{dal}{ye}'
+        '{mim}{lam}{ye}'
+        '{che}{nun}{dal}'
+        '{be}{re}{khe}{ye}'
+        '{he}{mim}{ye}{nun}'
+        '{he}{mim}{alef}{nun}'
+        '{he}{mim}{he}'
+        '{mim}{te}{re}'
+        '{te}{ye}{mim}'
+        '{sin}{lam}{alef}{mim}'
+        '{alef}{sin}{lam}{alef}{mim}'
+        '{ye}{ain}{nun}{ye}'
+        '{aa}{lam}{be}{vav}{mim}'
+    )
+)
+
 define stem as (
-    do Normalize
-    do Prefix
-    do Suffix_Noun or Suffix_Verb or Suffix_Adjective
+    Exception
+    or (
+        do Normalize
+        do Prefix
+        do Suffix_Noun
+        do Suffix_Adjective
+        do Suffix_Verb
+    )
 )

From 90c083cc14e34a1f2df0c19d7b09a8ef1ee58f71 Mon Sep 17 00:00:00 2001
From: Saeid Darvish <saeid.drgh@gmail.com>
Date: Tue, 27 Feb 2024 08:48:14 +0100
Subject: [PATCH 4/7] fix exception section

---
 algorithms/persian.sbl | 61 ++++++++++++++++++++----------------------
 1 file changed, 29 insertions(+), 32 deletions(-)

diff --git a/algorithms/persian.sbl b/algorithms/persian.sbl
index 63cbeaf3..91c11f9c 100644
--- a/algorithms/persian.sbl
+++ b/algorithms/persian.sbl
@@ -39,13 +39,12 @@ stringdef ye      '{U+06CC}'
 
 
 routines (
+    Exception
     Normalize
     Prefix
     Suffix_Noun
-    Suffix_Verb
     Suffix_Adjective
-    Post_Normalize
-    Exception
+    Suffix_Verb
 )
 
 
@@ -55,11 +54,11 @@ externals ( stem )
 define Normalize as (
     do repeat (
         [substring] among (
-            '{U+0643}' ( <- '{kaf}' ) // Normalize Kaf
+            '{U+0643}' ( <- '{kaf}' )             // Normalize Kaf
             '{U+0626}' '{U+064A}' ( <- '{ye}'  )  // Normalize Ye
-            '{U+0629}' ( <- '{he}'  )  // Normalize Heh
+            '{U+0629}' ( <- '{he}'  )             // Normalize Heh
             '{U+0623}' '{U+0625}' ( <- '{alef}' ) // Normalize Alef
-            '{U+0624}' ( <- '{vav}' )  // Normalize Waw and Waw with Hamza above
+            '{U+0624}' ( <- '{vav}' )             // Normalize Waw and Waw with Hamza above
         )
     )
 )
@@ -97,10 +96,8 @@ define Suffix_Verb as (
         '{ye}{mim}' (delete)
         '{ye}{ye}' (delete)
         '{ye}{ye}{dal}' (delete)
-        '{nun}{dal}' (delete)
         '{mim}' (delete)
         '{ye}' (delete)
-        '{ye}{mim}' (delete)
         '{ye}{dal}' (delete)
         '{nun}{dal}' (delete)
     )
@@ -115,34 +112,34 @@ define Suffix_Adjective as (
 
 define Exception as (
     [substring] among (
-        '{sin}{ye}'
-        '{dal}{re}'
-        '{alef}{ye}{nun}'
-        '{alef}{ye}'
-        '{be}{re}'
-        '{nun}{alef}{mim}'
-        '{pe}{sin}'
-        '{alef}{sin}{te}{alef}{nun}'
-        '{be}{re}{alef}{ye}'
-        '{pe}{ye}{vav}{nun}{dal}'
-        '{mim}{ye}{lam}{alef}{dal}{ye}'
-        '{mim}{lam}{ye}'
-        '{che}{nun}{dal}'
-        '{be}{re}{khe}{ye}'
-        '{he}{mim}{ye}{nun}'
-        '{he}{mim}{alef}{nun}'
-        '{he}{mim}{he}'
-        '{mim}{te}{re}'
-        '{te}{ye}{mim}'
-        '{sin}{lam}{alef}{mim}'
-        '{alef}{sin}{lam}{alef}{mim}'
-        '{ye}{ain}{nun}{ye}'
-        '{aa}{lam}{be}{vav}{mim}'
+        '{sin}{ye}' ()
+        '{dal}{re}' ()
+        '{alef}{ye}{nun}' ()
+        '{alef}{ye}' ()
+        '{be}{re}' ()
+        '{nun}{alef}{mim}' ()
+        '{pe}{sin}' ()
+        '{alef}{sin}{te}{alef}{nun}' ()
+        '{be}{re}{alef}{ye}' ()
+        '{pe}{ye}{vav}{nun}{dal}' ()
+        '{mim}{ye}{lam}{alef}{dal}{ye}' ()
+        '{mim}{lam}{ye}' ()
+        '{che}{nun}{dal}' ()
+        '{be}{re}{khe}{ye}' ()
+        '{he}{mim}{ye}{nun}' ()
+        '{he}{mim}{alef}{nun}' ()
+        '{he}{mim}{he}' ()
+        '{mim}{te}{re}' ()
+        '{te}{ye}{mim}' ()
+        '{sin}{lam}{alef}{mim}' ()
+        '{alef}{sin}{lam}{alef}{mim}' ()
+        '{ye}{ain}{nun}{ye}' ()
+        '{aa}{lam}{be}{vav}{mim}' ()
     )
 )
 
 define stem as (
-    Exception
+    ( Exception )
     or (
         do Normalize
         do Prefix

From 40f59b31332249359fe93cda4644075f11f62b4c Mon Sep 17 00:00:00 2001
From: Saeid Darvish <saeid.drgh@gmail.com>
Date: Tue, 27 Feb 2024 10:39:59 +0100
Subject: [PATCH 5/7] add Suffix_Normalize

---
 algorithms/persian.sbl | 66 ++++++++++++++++++++++++++++--------------
 1 file changed, 44 insertions(+), 22 deletions(-)

diff --git a/algorithms/persian.sbl b/algorithms/persian.sbl
index 91c11f9c..a2a36cae 100644
--- a/algorithms/persian.sbl
+++ b/algorithms/persian.sbl
@@ -3,6 +3,8 @@
  * Author: https://saeiddrv.com
 */
 
+stringescapes { }
+
 stringdef aa      '{U+0622}'
 stringdef alef    '{U+0627}'
 stringdef be      '{U+0628}'
@@ -45,12 +47,16 @@ routines (
     Suffix_Noun
     Suffix_Adjective
     Suffix_Verb
+    Suffix_Normalize
 )
 
 
 externals ( stem )
 
 
+groupings (  )
+
+
 define Normalize as (
     do repeat (
         [substring] among (
@@ -59,6 +65,9 @@ define Normalize as (
             '{U+0629}' ( <- '{he}'  )             // Normalize Heh
             '{U+0623}' '{U+0625}' ( <- '{alef}' ) // Normalize Alef
             '{U+0624}' ( <- '{vav}' )             // Normalize Waw and Waw with Hamza above
+
+
+            '{gaf}{alef}{nun}' ( <- '{he}'  )     // gaan -> he
         )
     )
 )
@@ -67,46 +76,58 @@ define Normalize as (
 define Prefix as (
     [substring] among (
         '{be}{ye}{shin}' (delete)       // bish
-        '{be}{ye}' ($(len > 3)delete)   // bii
+        '{be}{ye}' ($(len > 3) delete)  // bii
         '{pe}{ye}{shin}' (delete)       // pish
         '{pe}{sin}' (delete)            // pas
         '{ze}{ye}{re}' (delete)         // ziir
         '{he}{mim}' (delete)            // ham
         '{nun}{alef}' (delete)          // naa
-        '{mim}{ye}' (delete)            // mii
+        '{mim}{ye}' ($(len > 3) delete) // mii
     )
 )
 
 
 define Suffix_Noun as (
-    [substring] among (
-        '{gaf}{alef}{heh}' (delete)       // gaah
-        '{he}{alef}{ye}' (delete)         // haaye
-        '{he}{alef}' (delete)             // haa
-        '{alef}{te}' (delete)             // aat
-        '{sin}{te}{alef}{nun}' (delete)   // setan
-        '{ye}{te}' (delete)               // yat
-
-        '{gaf}{alef}{nun}' ( <- '{he}'  ) // gaan -> h
+    do backwards (
+        [substring] among (
+            '{gaf}{alef}{heh}' (delete)       // gaah
+            '{he}{alef}{ye}' (delete)         // haaye
+            '{he}{alef}' (delete)             // haa
+            '{alef}{te}' (delete)             // aat
+            '{sin}{te}{alef}{nun}' (delete)   // setan
+            '{ye}{te}' (delete)               // yat
+        )
     )
 )
 
 define Suffix_Verb as (
-    [substring] among (
-        '{ye}{mim}' (delete)
-        '{ye}{ye}' (delete)
-        '{ye}{ye}{dal}' (delete)
-        '{mim}' (delete)
-        '{ye}' (delete)
-        '{ye}{dal}' (delete)
-        '{nun}{dal}' (delete)
+    do backwards (
+        [substring] among (
+            '{ye}{mim}' ($(len > 3) delete)
+            '{ye}{ye}' (delete)
+            '{ye}{ye}{dal}' (delete)
+            '{mim}' ($(len > 2) delete)
+            '{ye}' ($(len > 2) delete)
+            '{ye}{dal}' ($(len > 3) delete)
+            '{nun}{dal}' ($(len > 3) delete)
+        )
     )
 )
 
 define Suffix_Adjective as (
-    [substring] among (
-        '{te}{re}' (delete) // tar (comparative)
-        '{te}{re}{ye}{nun}' (delete)  // tarin (superlative)
+    do backwards (
+        [substring] among (
+            '{te}{re}' (delete)           // tar (comparative)
+            '{te}{re}{ye}{nun}' (delete)  // tarin (superlative)
+        )
+    )
+)
+
+define Suffix_Normalize as (
+    do backwards (
+        [substring] among (
+            '{gaf}{alef}{nun}' ( <- '{he}'  )     // gaan -> he
+        )
     )
 )
 
@@ -146,5 +167,6 @@ define stem as (
         do Suffix_Noun
         do Suffix_Adjective
         do Suffix_Verb
+        do Suffix_Normalize
     )
 )

From bfc5974ecc691b7816502823644353acb8a46dde Mon Sep 17 00:00:00 2001
From: Saeid Darvish <saeid.drgh@gmail.com>
Date: Tue, 27 Feb 2024 11:17:52 +0100
Subject: [PATCH 6/7] Remove python/modules.txt

---
 python/modules.txt | 61 ----------------------------------------------
 1 file changed, 61 deletions(-)
 delete mode 100644 python/modules.txt

diff --git a/python/modules.txt b/python/modules.txt
deleted file mode 100644
index cd36a219..00000000
--- a/python/modules.txt
+++ /dev/null
@@ -1,61 +0,0 @@
-# This file contains a list of stemmers to include in the distribution.
-# The format is a set of space separated lines - on each line:
-#  First item is name of stemmer.
-#  Second item is comma separated list of character sets.
-#  Third item is comma separated list of names to refer to the stemmer by.
-#
-# Lines starting with a #, or blank lines, are ignored.
-
-# List all the main algorithms for each language, in UTF-8, and also with
-# the most commonly used encoding.
-
-arabic          UTF_8                   arabic,ar,ara
-armenian        UTF_8                   armenian,hy,hye,arm
-basque          UTF_8,ISO_8859_1        basque,eu,eus,baq
-catalan         UTF_8,ISO_8859_1        catalan,ca,cat
-danish          UTF_8,ISO_8859_1        danish,da,dan
-dutch           UTF_8,ISO_8859_1        dutch,nl,dut,nld
-english         UTF_8,ISO_8859_1        english,en,eng
-estonian        UTF_8                   estonian,et,est
-finnish         UTF_8,ISO_8859_1        finnish,fi,fin
-french          UTF_8,ISO_8859_1        french,fr,fre,fra
-german          UTF_8,ISO_8859_1        german,de,ger,deu
-greek           UTF_8                   greek,el,gre,ell
-hindi           UTF_8                   hindi,hi,hin
-hungarian       UTF_8,ISO_8859_2        hungarian,hu,hun
-indonesian      UTF_8,ISO_8859_1        indonesian,id,ind
-irish           UTF_8,ISO_8859_1        irish,ga,gle
-italian         UTF_8,ISO_8859_1        italian,it,ita
-lithuanian      UTF_8                   lithuanian,lt,lit
-nepali          UTF_8                   nepali,ne,nep
-norwegian       UTF_8,ISO_8859_1        norwegian,no,nor
-portuguese      UTF_8,ISO_8859_1        portuguese,pt,por
-romanian        UTF_8                   romanian,ro,rum,ron
-russian         UTF_8,KOI8_R            russian,ru,rus
-serbian         UTF_8                   serbian,sr,srp
-spanish         UTF_8,ISO_8859_1        spanish,es,esl,spa
-swedish         UTF_8,ISO_8859_1        swedish,sv,swe
-tamil           UTF_8                   tamil,ta,tam
-turkish         UTF_8                   turkish,tr,tur
-yiddish         UTF_8                   yiddish,yi,yid
-
-# Also include the traditional porter algorithm for english.
-# The porter algorithm is included in the libstemmer distribution to assist
-# with backwards compatibility, but for new systems the english algorithm
-# should be used in preference.
-porter          UTF_8,ISO_8859_1        porter			english
-
-# Some other stemmers in the snowball project are not included in the standard
-# distribution. To compile a libstemmer with them in, add them to this list,
-# and regenerate the distribution. (You will need a full source checkout for
-# this.) They are included in the snowball website as curiosities, but are not
-# intended for general use, and use of them is is not fully supported.  These
-# algorithms are:
-#
-# kraaij_pohlmann  - This is a different dutch stemmer.
-#kraaij_pohlmann  UTF_8,ISO_8859_1        kraaij_pohlmann	dutch
-#
-# lovins           - This is an english stemmer, but fairly outdated, and
-#                    only really applicable to a restricted type of input text
-#                    (keywords in academic publications).
-#lovins           UTF_8,ISO_8859_1        lovins		english

From 27975e60c4445a4fbebe2e978974d67ec560ab3d Mon Sep 17 00:00:00 2001
From: Saeid Darvish <saeid.drgh@gmail.com>
Date: Thu, 4 Apr 2024 19:52:54 +0200
Subject: [PATCH 7/7] define arabic characters

---
 algorithms/persian.sbl | 65 ++++++++++++++++++++++++------------------
 1 file changed, 37 insertions(+), 28 deletions(-)

diff --git a/algorithms/persian.sbl b/algorithms/persian.sbl
index a2a36cae..8b0c921b 100644
--- a/algorithms/persian.sbl
+++ b/algorithms/persian.sbl
@@ -40,6 +40,15 @@ stringdef heh     '{U+0647}'
 stringdef ye      '{U+06CC}'
 
 
+stringdef ar_kaf                     '{U+0643}'
+stringdef ar_ye                      '{U+064A}'
+stringdef ar_ye_with_hamza_above     '{U+0626}'
+stringdef ar_he_marbuta              '{U+0629}'
+stringdef ar_alef_with_hamza_below   '{U+0625}'
+stringdef ar_alef_with_hamza_above   '{U+0623}'
+stringdef ar_vav_with_hamza_above    '{U+0624}'
+
+
 routines (
     Exception
     Normalize
@@ -60,11 +69,11 @@ groupings (  )
 define Normalize as (
     do repeat (
         [substring] among (
-            '{U+0643}' ( <- '{kaf}' )             // Normalize Kaf
-            '{U+0626}' '{U+064A}' ( <- '{ye}'  )  // Normalize Ye
-            '{U+0629}' ( <- '{he}'  )             // Normalize Heh
-            '{U+0623}' '{U+0625}' ( <- '{alef}' ) // Normalize Alef
-            '{U+0624}' ( <- '{vav}' )             // Normalize Waw and Waw with Hamza above
+            '{ar_kaf}' ( <- '{kaf}' )  // U+0643 -> U+06A9. NOTE(review): the enclosing repeat has no "or next", so it stops at the first character not listed here - confirm that normalizing only a leading run is intended
+            '{ar_ye_with_hamza_above}' '{ar_ye}' ( <- '{ye}'  )  // U+0626, U+064A -> U+06CC
+            '{ar_he_marbuta}' ( <- '{he}'  )  // U+0629 -> U+0647
+            '{ar_alef_with_hamza_above}' '{ar_alef_with_hamza_below}' ( <- '{alef}' )  // U+0623, U+0625 -> U+0627
+            '{ar_vav_with_hamza_above}' ( <- '{vav}' )  // U+0624 -> U+0648
 
 
             '{gaf}{alef}{nun}' ( <- '{he}'  )     // gaan -> he
@@ -133,29 +142,29 @@ define Suffix_Normalize as (
 
 define Exception as (
     [substring] among (
-        '{sin}{ye}' ()
-        '{dal}{re}' ()
-        '{alef}{ye}{nun}' ()
-        '{alef}{ye}' ()
-        '{be}{re}' ()
-        '{nun}{alef}{mim}' ()
-        '{pe}{sin}' ()
-        '{alef}{sin}{te}{alef}{nun}' ()
-        '{be}{re}{alef}{ye}' ()
-        '{pe}{ye}{vav}{nun}{dal}' ()
-        '{mim}{ye}{lam}{alef}{dal}{ye}' ()
-        '{mim}{lam}{ye}' ()
-        '{che}{nun}{dal}' ()
-        '{be}{re}{khe}{ye}' ()
-        '{he}{mim}{ye}{nun}' ()
-        '{he}{mim}{alef}{nun}' ()
-        '{he}{mim}{he}' ()
-        '{mim}{te}{re}' ()
-        '{te}{ye}{mim}' ()
-        '{sin}{lam}{alef}{mim}' ()
-        '{alef}{sin}{lam}{alef}{mim}' ()
-        '{ye}{ain}{nun}{ye}' ()
-        '{aa}{lam}{be}{vav}{mim}' ()
+        '{sin}{ye}'
+        '{dal}{re}'
+        '{alef}{ye}{nun}'
+        '{alef}{ye}'
+        '{be}{re}'
+        '{nun}{alef}{mim}'
+        '{pe}{sin}'
+        '{alef}{sin}{te}{alef}{nun}'
+        '{be}{re}{alef}{ye}'
+        '{pe}{ye}{vav}{nun}{dal}'
+        '{mim}{ye}{lam}{alef}{dal}{ye}'
+        '{mim}{lam}{ye}'
+        '{che}{nun}{dal}'
+        '{be}{re}{khe}{ye}'
+        '{he}{mim}{ye}{nun}'
+        '{he}{mim}{alef}{nun}'
+        '{he}{mim}{he}'
+        '{mim}{te}{re}'
+        '{te}{ye}{mim}'
+        '{sin}{lam}{alef}{mim}'
+        '{alef}{sin}{lam}{alef}{mim}'
+        '{ye}{ain}{nun}{ye}'
+        '{aa}{lam}{be}{vav}{mim}'
     )
 )