Skip to content

Commit

Permalink
Merge pull request #2208 from mprasil/add-slovak-language-support
Browse files Browse the repository at this point in the history
Add support for Slovak language
  • Loading branch information
eikek authored Dec 19, 2023
2 parents 2896ab2 + dcafd7b commit b02a5c2
Show file tree
Hide file tree
Showing 9 changed files with 119 additions and 1 deletion.
1 change: 1 addition & 0 deletions docker/dockerfiles/joex.dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ RUN apk update && \
tesseract-ocr-data-pol \
tesseract-ocr-data-est \
tesseract-ocr-data-ukr \
tesseract-ocr-data-slk \
unpaper \
weasyprint \
libreoffice \
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,7 @@ object DateFind {
case Language.Estonian => dmy
case Language.Khmer => dmy
case Language.Ukrainian => dmy.or(ymd)
case Language.Slovak => dmy.or(ymd)
}
p.read(parts) match {
case Result.Success(sds, _) =>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,8 @@ object MonthName {
ukrainian
case Language.Khmer =>
khmer
case Language.Slovak =>
slovak
}

private val numbers = List(
Expand Down Expand Up @@ -426,4 +428,19 @@ object MonthName {
List("листопада", "лист", "лис"),
List("грудня", "груд", "гру")
)

// Slovak month names, one inner list per month (January first, December last).
// Each inner list holds the spellings accepted for that month: an ASCII
// three-letter abbreviation, the base form, and the inflected form that
// follows a day number (as in "15. júna 2019").
private val slovak = List(
List("jan", "január", "januára"),
List("feb", "február", "februára"),
List("mar", "marec", "marca"),
List("apr", "apríl", "apríla"),
List("maj", "máj", "mája"),
List("jun", "jún", "júna"),
List("jul", "júl", "júla"),
List("aug", "august", "augusta"),
List("sep", "september", "septembra"),
List("okt", "október", "októbra"),
List("nov", "november", "novembra"),
List("dec", "december", "decembra")
)
}
Original file line number Diff line number Diff line change
Expand Up @@ -365,4 +365,57 @@ class DateFindTest extends FunSuite {
)
)
}

test("find slovak dates") {
  // All cases below read their input as Slovak text.
  def slovakDates(text: String) =
    DateFind.findDates(text, Language.Slovak).toVector

  // Inflected month name in the middle of a sentence.
  assertEquals(
    slovakDates("Do funkcie bola inaugurovaná 15. júna 2019 pred Národnou radou SR"),
    Vector(
      NerDateLabel(
        LocalDate.of(2019, 6, 15),
        NerLabel("15. júna 2019", NerTag.Date, 29, 42)
      )
    )
  )

  // Abbreviated month name.
  assertEquals(
    slovakDates("Dátum narodenia: 14. feb 2015"),
    Vector(
      NerDateLabel(
        LocalDate.of(2015, 2, 14),
        NerLabel("14. feb 2015", NerTag.Date, 17, 29)
      )
    )
  )

  // Purely numeric day.month.year.
  assertEquals(
    slovakDates("19.11.2021"),
    Vector(
      NerDateLabel(
        LocalDate.of(2021, 11, 19),
        NerLabel("19.11.2021", NerTag.Date, 0, 10)
      )
    )
  )

  // Purely numeric year.month.day.
  assertEquals(
    slovakDates("Dátum: 2022.11.05"),
    Vector(
      NerDateLabel(
        LocalDate.of(2022, 11, 5),
        NerLabel("2022.11.05", NerTag.Date, 7, 17)
      )
    )
  )
}
}
6 changes: 6 additions & 0 deletions modules/common/src/main/scala/docspell/common/Language.scala
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,11 @@ object Language {
val iso3 = "ukr"
}

// Slovak language entry with its two- and three-letter codes.
// NOTE(review): "svk" is the ISO 3166 country code for Slovakia; the
// ISO 639-3 language code for Slovak is "slk", which is also the suffix of
// the OCR data package installed in the joex image (tesseract-ocr-data-slk).
// If `iso3` is what selects the OCR language data, "svk" will not resolve.
// The web client's language table uses the same "svk" string, so the value
// would have to change on both sides together — confirm before fixing.
case object Slovak extends Language {
val iso2 = "sk"
val iso3 = "svk"
}

val all: List[Language] =
List(
German,
Expand All @@ -172,6 +177,7 @@ object Language {
Polish,
Estonian,
Ukrainian,
Slovak,
Khmer
)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -207,5 +207,6 @@ object FtsRepository extends DoobieMeta {
case Language.Estonian => "simple"
case Language.Ukrainian => "simple"
case Language.Khmer => "simple"
case Language.Slovak => "simple"
}
}
24 changes: 23 additions & 1 deletion modules/fts-solr/src/main/scala/docspell/ftssolr/SolrSetup.scala
Original file line number Diff line number Diff line change
Expand Up @@ -183,7 +183,18 @@ object SolrSetup {
"Add Khmer",
addContentField(Language.Khmer)
),
SolrMigration.reIndexAll(34, "Re-Index after adding Khmer")
SolrMigration.reIndexAll(34, "Re-Index after adding Khmer"),
SolrMigration[F](
35,
"Add new field type for slovak content",
addFieldType(AddFieldType.textSvk)
),
SolrMigration[F](
36,
"Add Slovak",
addContentField(Language.Slovak)
),
SolrMigration.reIndexAll(37, "Re-Index after adding Slovak")
)

def addFolderField: F[Unit] =
Expand Down Expand Up @@ -368,6 +379,17 @@ object SolrSetup {
)
)

// Field type "text_sk" for Slovak content: standard tokenizer with a single
// lower-casing filter.
val textSvk = {
  val tokenizer = Tokenizer("solr.StandardTokenizerFactory", Map.empty)
  val filters = List(Filter("solr.LowerCaseFilterFactory", Map.empty))
  AddFieldType("text_sk", "solr.TextField", Analyzer(tokenizer, filters))
}

/** A filter used by an [[Analyzer]]: the filter class name (e.g.
  * `solr.LowerCaseFilterFactory`) and its attributes.
  */
final case class Filter(`class`: String, attr: Map[String, String])

/** The tokenizer used by an [[Analyzer]]: the tokenizer class name (e.g.
  * `solr.StandardTokenizerFactory`) and its attributes.
  */
final case class Tokenizer(`class`: String, attr: Map[String, String])

/** Analyzer of a field type: one tokenizer plus a list of filters. */
final case class Analyzer(tokenizer: Tokenizer, filter: List[Filter])
Expand Down
8 changes: 8 additions & 0 deletions modules/webapp/src/main/elm/Data/Language.elm
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ type Language
| Estonian
| Ukrainian
| Khmer
| Slovak


fromString : String -> Maybe Language
Expand Down Expand Up @@ -110,6 +111,9 @@ fromString str =
else if str == "khm" || str == "kh" || str == "khmer" then
Just Khmer

else if str == "svk" || str == "sk" || str == "slovak" then
Just Slovak

else
Nothing

Expand Down Expand Up @@ -186,6 +190,9 @@ toIso3 lang =
Khmer ->
"khm"

Slovak ->
"svk"


all : List Language
all =
Expand All @@ -212,4 +219,5 @@ all =
, Estonian
, Ukrainian
, Khmer
, Slovak
]
9 changes: 9 additions & 0 deletions modules/webapp/src/main/elm/Messages/Data/Language.elm
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,9 @@ gb lang =
Khmer ->
"Khmer"

Slovak ->
"Slovak"


de : Language -> String
de lang =
Expand Down Expand Up @@ -159,6 +162,9 @@ de lang =
Khmer ->
"Khmer"

Slovak ->
"Slowakisch"


fr : Language -> String
fr lang =
Expand Down Expand Up @@ -231,3 +237,6 @@ fr lang =

Khmer ->
"Khmer"

Slovak ->
    -- Name of the language ("Slovaque"), not of the country ("Slovaquie").
    "Slovaque"

0 comments on commit b02a5c2

Please sign in to comment.