-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathPirinen-2023-nodalida.html
1203 lines (1174 loc) · 88.6 KB
/
Pirinen-2023-nodalida.html
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
<!DOCTYPE html><html lang="en">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
<title>GiellaLT — a stable infrastructure for Nordic minority languages and beyond</title>
<!--Generated on Fri May 26 16:53:06 2023 by LaTeXML (version 0.8.6) http://dlmf.nist.gov/LaTeXML/.-->
<!--Document created on May 26, 2023.-->
<link rel="stylesheet" href="../latexml/LaTeXML.css" type="text/css">
<link rel="stylesheet" href="../latexml/ltx-article.css" type="text/css">
<meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">
</head>
<body>
<div class="ltx_page_main">
<div class="ltx_page_content">
<article class="ltx_document ltx_authors_1line">
<h1 class="ltx_title ltx_title_document">GiellaLT — a stable infrastructure for Nordic minority languages and
beyond</h1>
<div class="ltx_authors">
<span class="ltx_creator ltx_role_author">
<span class="ltx_personname">Flammie A Pirinen
</span></span>
<span class="ltx_author_before"> </span><span class="ltx_creator ltx_role_author">
<span class="ltx_personname">Sjur N. Moshagen
</span></span>
<span class="ltx_author_before"> </span><span class="ltx_creator ltx_role_author">
<span class="ltx_personname">Katri Hiovain-Asikainen
<br class="ltx_break">Divvun, Department of Language and Culture
<br class="ltx_break">UiT Norgga árktalaš universitehta
<br class="ltx_break">Tromsø, Norway
<br class="ltx_break"><a href="[email protected]" title="" class="ltx_ref ltx_url ltx_font_typewriter">[email protected]</a>
<br class="ltx_break"><a href="[email protected]" title="" class="ltx_ref ltx_url ltx_font_typewriter">[email protected]</a>
<br class="ltx_break"><a href="[email protected]" title="" class="ltx_ref ltx_url ltx_font_typewriter">[email protected]</a>
</span></span>
</div>
<div class="ltx_dates">(May 26, 2023)</div>
<div class="ltx_abstract">
<h6 class="ltx_title ltx_title_abstract">Abstract</h6>
<p class="ltx_p">Long term language technology infrastructures are critical for continued
maintenance of language technology based software that is used to support
the use of languages in the digital world. In the Nordic area we have
languages ranging from well-resourced national majority languages like
Norwegian, Swedish and Finnish to minoritised, unresourced and
indigenous languages like the Sámi languages. We present an infrastructure
that has been built over more than 20 years and that supports building language
technology and tools for most of the Nordic languages as well as many of the
languages all over the world, with focus on Sámi and other indigenous,
minoritised and unresourced languages. We show that one common
infrastructure can be used to build tools from keyboards and spell-checkers
to machine translators, grammar checkers and text-to-speech as well as
automatic speech recognition.</p>
</div>
<section id="S1" class="ltx_section">
<h2 class="ltx_title ltx_title_section">
<span class="ltx_tag ltx_tag_section">1 </span>Introduction</h2>
<div id="S1.p1" class="ltx_para">
<p class="ltx_p">Language technology infrastructures are needed for long-term maintenance of
linguistic data and NLP applications derived from it. Specifically in a Nordic
context, we have a selection of languages with very different requirements, and
all differ from those that are commonly assumed in other NLP contexts, e.g.
English and a handful of the most resourced languages in the world. The languages in
the Nordic area range from decently resourced Indo-European languages (Norwegian
bokmål, Swedish, Danish and Icelandic) to moderately resourced Uralic languages
(Finnish, Estonian) to all low and unresourced, minoritised languages (Sámi
languages, all other Uralic languages, Faroese, Greenlandic). We have an
infrastructure that supports all of these languages, with a focus on the smaller
and less resourced languages and specifically on the Sámi languages. The
infrastructure we provide has been in use for over a decade and in this article
we describe strategies and workflows that we have found successful. It
currently supports over 100 languages, many outside of the Nordic region.</p>
</div>
<div id="S1.p2" class="ltx_para">
<p class="ltx_p">The technical infrastructure builds on the concept that we aim to separate the
technological work: programming and engineering, from the linguistic work:
lexicography, grammar building, corpus annotation etc. In this way, we enable
linguists and native informants to work on the language data and the engineers
build and maintain the technological solutions in a meaningful way where both
the technological solutions and the linguistic data are kept up to date and
functional. This workflow is important since both linguistic and technological
sides present ongoing challenges to be kept up to date. Regarding the
linguistic content, the language norms change and grow, new words and
expressions enter the lexicon regularly and other words and expressions become
outdated. In technology, operating systems and environments, programming
languages and APIs change all the time, making the NLP tools built a few years
ago not usable a few years later. The research question we solve with our
infrastructure is, how both parts can be kept up to date while not burdening the
people working with the parts with details irrelevant for their work.</p>
</div>
<div id="S1.p3" class="ltx_para">
<p class="ltx_p">In other words, the infrastructure contains linguistic data, and technological
implementations to build end user NLP-based tools and software from it. The
tools that we build nowadays include writing tools, such as spelling and grammar
checkers and correctors, speech synthesis and recognition, machine translation,
intelligent dictionaries and various linguistic analysis tools. The
technological infrastructure is composed of tools like version control systems,
build systems and automation of building and distribution of the NLP tools. The
underlying technologies here have changed a lot in the past 20 years, and will
undoubtedly keep evolving. In this article we take a look at some concepts that
have either stayed stable or evolved to be part of the core tools for us. In the
NLP scene, the world has changed a lot in the past years as well, with the
traditional knowledge-based methodology being gradually replaced by data-driven
approaches; in the GiellaLT infrastructure we are still following the
expert-driven knowledge-based approach as it continues to be the most
appropriate for unresourced languages, but we do not cover this dichotomy in
detail; for more details of this we refer to <cite class="ltx_cite ltx_citemacro_cite">[<a href="#bib.bib332" title="Unmasking the myth of effortless big data - making an open source multi-lingual infrastructure and building language resources from scratch" class="ltx_ref">16</a>]</cite> that
discusses the issue extensively.</p>
</div>
<div id="S1.p4" class="ltx_para">
<p class="ltx_p">In the past 20 years we have built language resources for several Sámi languages
starting from virtually nothing; even though we had a number of non-digital
resources available, these were far from exhaustive. This means that our work
also included normative discussions, requests and suggestions to the language
normative organs, error classifications, and grammatical descriptions of
phenomena not included in grammar books. In several cases, these phenomena
needed traditional linguistic research. Based on this experience we suggest
workflows and usage patterns along the technical solutions of the infrastructure
that are effective for long term maintenance of linguistic software in support
of continued digital existence of human languages.</p>
</div>
<div id="S1.p5" class="ltx_para">
<p class="ltx_p">The contributions of this article are: We present a stable Nordic language
technology infrastructure that has supported Nordic language technology
development for 20 years, we describe the best current practices we have learned
over the years, and based on the current state of things we sketch the potential
future developments.</p>
</div>
</section>
<section id="S2" class="ltx_section">
<h2 class="ltx_title ltx_title_section">
<span class="ltx_tag ltx_tag_section">2 </span>Background</h2>
<div id="S2.p1" class="ltx_para">
<p class="ltx_p">The infrastructure presented in this article has been developed and maintained
for at least 20 years now. The infrastructure has been discussed previously in
Nodalida some 10 years ago <cite class="ltx_cite ltx_citemacro_cite">[<a href="#bib.bib197" title="Building an open-source development infrastructure for language technology projects" class="ltx_ref">13</a>]</cite>. In this work we aim
to show updates and prove that the system has well stood the test of time in
supporting Nordic languages. On one hand everything has changed between the
years; computers and mobile platforms, operating systems, programming
environments, on the other hand, many solutions have stayed usable: rule-based
finite state morphologies, dictionaries and linguistic data.</p>
</div>
<div id="S2.p2" class="ltx_para">
<p class="ltx_p">The foundation for the work presented in this article is the multilingual
infrastructure <a href="https://github.com/giellalt" title="" class="ltx_ref ltx_href ltx_font_italic">GiellaLT</a>, which
includes over 100 languages, including most Nordic ones: the Sámi languages,
Faroese, Finnish, Norwegian, Swedish, other Uralic languages and many more.
Everything produced in the <span class="ltx_text ltx_font_italic">GiellaLT</span> infrastructure is under free and
open licences and freely available. The corpora are available with free
licensing where possible. The infrastructure is split code-wise in three GitHub
organisations: <a href="https://github.com/giellalt" title="" class="ltx_ref ltx_href ltx_font_italic">GiellaLT</a> containing
the language data for each language,
<a href="https://github.com/divvun" title="" class="ltx_ref ltx_href ltx_font_italic">Divvun</a> containing language
independent code for the infrastructure and various applications, and
<a href="https://github.com/giellatekno" title="" class="ltx_ref ltx_href ltx_font_italic">Giellatekno</a> for corpus
infrastructure. End user tools served by the Divvun group are at
<a href="https://divvun.no" title="" class="ltx_ref ltx_href ltx_font_italic">divvun.no</a> &amp;
<a href="https://divvun.org" title="" class="ltx_ref ltx_href ltx_font_italic">divvun.org</a>, and tools served by the
Giellatekno group at
<a href="https://giellatekno.uit.no" title="" class="ltx_ref ltx_href ltx_font_italic">giellatekno.uit.no</a>, both at
<span class="ltx_text ltx_font_italic">UiT Norway’s Arctic University</span>.</p>
</div>
<div id="S2.p3" class="ltx_para">
<p class="ltx_p">We build systems that include lexical data as well as rules governing
morphophonology, syntax and semantics as well as a number of application
specific information, e.g. grammatical rules for grammar checking, phonetic
rules for <span class="ltx_text ltx_font_italic">Text-To-Speech</span> (TTS) and so forth.</p>
</div>
<div id="S2.p4" class="ltx_para">
<p class="ltx_p">The language-independent work is currently done within the infrastructure; the
language-independent features and updates that are relevant to all languages are
semi-automatically merged as they are developed. To ensure that language
independent and common features and updates do not destroy existing language
data or use case, we enforce a rigorous continuous integration based testing
regime. The current system for testing is a combination of our long-term
investment in testing within the infrastructure locally for
developers—combined with modern automatic testing currently supplied by
<a href="https://github.com/divvun/actions" title="" class="ltx_ref ltx_href">GitHub actions</a>.</p>
</div>
<div id="S2.p5" class="ltx_para">
<p class="ltx_p">The automated testing and integration is one of the key features for upkeep and
maintenance of the linguistic data: the linguists work with the dictionaries and
rules on a daily basis and receive immediate feedback from the system of the
effects of the new word entries or rules. If the testing system verifies that
the new words and rules do not negatively affect the user experience of
e.g. the spelling checker, it can be immediately deployed to the end users of the
mobile keyboards and spell-checkers on office platforms.</p>
</div>
<div id="S2.p6" class="ltx_para">
<p class="ltx_p">Another part of the <span class="ltx_text ltx_font_italic">GiellaLT</span> philosophy is that of reusable and
multi-purposeful resources, cf. <cite class="ltx_cite ltx_citemacro_cite">[<a href="#bib.bib11" title="Reusing grammatical resources for new languages" class="ltx_ref">1</a>]</cite>. This is
true for all of our work, from corpus collection to cross-lingual cooperation.</p>
</div>
<section id="S2.SS1" class="ltx_subsection">
<h3 class="ltx_title ltx_title_subsection">
<span class="ltx_tag ltx_tag_subsection">2.1 </span>Tools</h3>
<div id="S2.SS1.p1" class="ltx_para">
<p class="ltx_p">One of the main aims of the infrastructure is to provide tools to different end
user groups: language communities, learners, language users and researchers. In
2012, spell-checking and correction was presented as one of the key technologies
that language technology infrastructures can provide as a support tool for
linguistic communities. This continues to be a core tool but even it has
changed significantly: in 2012, the main use of spelling checkers was most
commonly seen as a writer’s tool within office suites. While this still is the
case, the users will much more likely face spelling correctors as part of
e.g. mobile keyboards, in the form of automatic corrections.</p>
</div>
<div id="S2.SS1.p2" class="ltx_para">
<p class="ltx_p">The GiellaLT infrastructure today offers keyboards for many of the languages in
the infrastructure for most mobile and computer operating systems. For writer’s tools,
we also provide more advanced grammatical error correction for some of the
languages. This is a tool that in practice concerns sentence level data while
correcting errors, whereas a spelling checker typically operates mainly at word
level. Intelligent dictionaries and corpus resources are provided to users
primarily via web apps and related mobile apps. The intelligent dictionaries
are an important tool for language learners and users: they enable users to
understand texts by looking up the underlying lemma of inflected forms. For
research uses as well as for language learners and users to some extent, we also
have annotated corpora that can be used for example through a <span class="ltx_text ltx_font_italic">Korp</span>
corpus webapp <cite class="ltx_cite ltx_citemacro_cite">[<a href="#bib.bib47" title="Korp-the corpus infrastructure of spräkbanken." class="ltx_ref">3</a>]</cite>. Spoken language technology is one of the
newer applications in our infrastructure. This encompasses text-to-speech as
well as automatic speech recognition.</p>
</div>
<div id="S2.SS1.p3" class="ltx_para">
<p class="ltx_p">An overview of the tools available for the languages listed later in the article
is given in table <a href="#S2.T1" title="Table 1 ‣ 2.1 Tools ‣ 2 Background ‣ GiellaLT — a stable infrastructure for Nordic minority languages and beyond" class="ltx_ref"><span class="ltx_text ltx_ref_tag"><span class="ltx_text" style="font-size:90%;">1</span></span></a>.</p>
</div>
<figure id="S2.T1" class="ltx_table">
<table class="ltx_tabular ltx_centering ltx_guessed_headers ltx_align_middle">
<thead class="ltx_thead">
<tr class="ltx_tr">
<th class="ltx_td ltx_align_left ltx_th ltx_th_column"><span class="ltx_text" style="font-size:90%;">Language</span></th>
<th class="ltx_td ltx_align_center ltx_th ltx_th_column"><span class="ltx_text" style="font-size:90%;">KBD</span></th>
<th class="ltx_td ltx_align_right ltx_th ltx_th_column"><span class="ltx_text" style="font-size:90%;">SP</span></th>
<th class="ltx_td ltx_align_right ltx_th ltx_th_column"><span class="ltx_text" style="font-size:90%;">GC</span></th>
<th class="ltx_td ltx_align_right ltx_th ltx_th_column"><span class="ltx_text" style="font-size:90%;">MT</span></th>
<th class="ltx_td ltx_align_right ltx_th ltx_th_column"><span class="ltx_text" style="font-size:90%;">Dict</span></th>
</tr>
</thead>
<tbody class="ltx_tbody">
<tr class="ltx_tr">
<td class="ltx_td ltx_align_left ltx_border_tt"><span class="ltx_text" style="font-size:90%;">Eastern Mari</span></td>
<td class="ltx_td ltx_align_center ltx_border_tt"><span class="ltx_text" style="font-size:90%;">B</span></td>
<td class="ltx_td ltx_align_right ltx_border_tt"><span class="ltx_text" style="font-size:90%;">B</span></td>
<td class="ltx_td ltx_align_right ltx_border_tt"><span class="ltx_text" style="font-size:90%;">—</span></td>
<td class="ltx_td ltx_align_right ltx_border_tt"><span class="ltx_text" style="font-size:90%;">—</span></td>
<td class="ltx_td ltx_align_right ltx_border_tt"><span class="ltx_text" style="font-size:90%;">B</span></td>
</tr>
<tr class="ltx_tr">
<td class="ltx_td ltx_align_left"><span class="ltx_text" style="font-size:90%;">Erzya</span></td>
<td class="ltx_td ltx_align_center"><span class="ltx_text" style="font-size:90%;">V</span></td>
<td class="ltx_td ltx_align_right"><span class="ltx_text" style="font-size:90%;">B</span></td>
<td class="ltx_td ltx_align_right"><span class="ltx_text" style="font-size:90%;">—</span></td>
<td class="ltx_td ltx_align_right"><span class="ltx_text" style="font-size:90%;">—</span></td>
<td class="ltx_td ltx_align_right"><span class="ltx_text" style="font-size:90%;">B</span></td>
</tr>
<tr class="ltx_tr">
<td class="ltx_td ltx_align_left"><span class="ltx_text" style="font-size:90%;">Faroese</span></td>
<td class="ltx_td ltx_align_center"><span class="ltx_text" style="font-size:90%;">—</span></td>
<td class="ltx_td ltx_align_right"><span class="ltx_text" style="font-size:90%;">V</span></td>
<td class="ltx_td ltx_align_right"><span class="ltx_text" style="font-size:90%;">B</span></td>
<td class="ltx_td ltx_align_right"><span class="ltx_text" style="font-size:90%;">B</span></td>
<td class="ltx_td ltx_align_right"><span class="ltx_text" style="font-size:90%;">—</span></td>
</tr>
<tr class="ltx_tr">
<td class="ltx_td ltx_align_left"><span class="ltx_text" style="font-size:90%;">Finnish</span></td>
<td class="ltx_td ltx_align_center"><span class="ltx_text" style="font-size:90%;">—</span></td>
<td class="ltx_td ltx_align_right"><span class="ltx_text" style="font-size:90%;">B</span></td>
<td class="ltx_td ltx_align_right"><span class="ltx_text" style="font-size:90%;">—</span></td>
<td class="ltx_td ltx_align_right"><span class="ltx_text" style="font-size:90%;">B</span></td>
<td class="ltx_td ltx_align_right"><span class="ltx_text" style="font-size:90%;">—</span></td>
</tr>
<tr class="ltx_tr">
<td class="ltx_td ltx_align_left"><span class="ltx_text" style="font-size:90%;">Greenlandic</span></td>
<td class="ltx_td ltx_align_center"><span class="ltx_text" style="font-size:90%;">—</span></td>
<td class="ltx_td ltx_align_right"><span class="ltx_text" style="font-size:90%;">V</span></td>
<td class="ltx_td ltx_align_right"><span class="ltx_text" style="font-size:90%;">—</span></td>
<td class="ltx_td ltx_align_right"><span class="ltx_text" style="font-size:90%;">—</span></td>
<td class="ltx_td ltx_align_right"><span class="ltx_text" style="font-size:90%;">V</span></td>
</tr>
<tr class="ltx_tr">
<td class="ltx_td ltx_align_left"><span class="ltx_text" style="font-size:90%;">Inari Sámi</span></td>
<td class="ltx_td ltx_align_center"><span class="ltx_text" style="font-size:90%;">V</span></td>
<td class="ltx_td ltx_align_right"><span class="ltx_text" style="font-size:90%;">V</span></td>
<td class="ltx_td ltx_align_right"><span class="ltx_text" style="font-size:90%;">B</span></td>
<td class="ltx_td ltx_align_right"><span class="ltx_text" style="font-size:90%;">B</span></td>
<td class="ltx_td ltx_align_right"><span class="ltx_text" style="font-size:90%;">V</span></td>
</tr>
<tr class="ltx_tr">
<td class="ltx_td ltx_align_left"><span class="ltx_text" style="font-size:90%;">Ingrian</span></td>
<td class="ltx_td ltx_align_center"><span class="ltx_text" style="font-size:90%;">B</span></td>
<td class="ltx_td ltx_align_right"><span class="ltx_text" style="font-size:90%;">B</span></td>
<td class="ltx_td ltx_align_right"><span class="ltx_text" style="font-size:90%;">—</span></td>
<td class="ltx_td ltx_align_right"><span class="ltx_text" style="font-size:90%;">—</span></td>
<td class="ltx_td ltx_align_right"><span class="ltx_text" style="font-size:90%;">—</span></td>
</tr>
<tr class="ltx_tr">
<td class="ltx_td ltx_align_left"><span class="ltx_text" style="font-size:90%;">Komi-Zyrian</span></td>
<td class="ltx_td ltx_align_center"><span class="ltx_text" style="font-size:90%;">B</span></td>
<td class="ltx_td ltx_align_right"><span class="ltx_text" style="font-size:90%;">B</span></td>
<td class="ltx_td ltx_align_right"><span class="ltx_text" style="font-size:90%;">—</span></td>
<td class="ltx_td ltx_align_right"><span class="ltx_text" style="font-size:90%;">—</span></td>
<td class="ltx_td ltx_align_right"><span class="ltx_text" style="font-size:90%;">B</span></td>
</tr>
<tr class="ltx_tr">
<td class="ltx_td ltx_align_left"><span class="ltx_text" style="font-size:90%;">Kven</span></td>
<td class="ltx_td ltx_align_center"><span class="ltx_text" style="font-size:90%;">B</span></td>
<td class="ltx_td ltx_align_right"><span class="ltx_text" style="font-size:90%;">B</span></td>
<td class="ltx_td ltx_align_right"><span class="ltx_text" style="font-size:90%;">—</span></td>
<td class="ltx_td ltx_align_right"><span class="ltx_text" style="font-size:90%;">—</span></td>
<td class="ltx_td ltx_align_right"><span class="ltx_text" style="font-size:90%;">V</span></td>
</tr>
<tr class="ltx_tr">
<td class="ltx_td ltx_align_left"><span class="ltx_text" style="font-size:90%;">Livvi</span></td>
<td class="ltx_td ltx_align_center"><span class="ltx_text" style="font-size:90%;">B</span></td>
<td class="ltx_td ltx_align_right"><span class="ltx_text" style="font-size:90%;">B</span></td>
<td class="ltx_td ltx_align_right"><span class="ltx_text" style="font-size:90%;">—</span></td>
<td class="ltx_td ltx_align_right"><span class="ltx_text" style="font-size:90%;">—</span></td>
<td class="ltx_td ltx_align_right"><span class="ltx_text" style="font-size:90%;">V</span></td>
</tr>
<tr class="ltx_tr">
<td class="ltx_td ltx_align_left"><span class="ltx_text" style="font-size:90%;">Lule Sámi</span></td>
<td class="ltx_td ltx_align_center"><span class="ltx_text" style="font-size:90%;">V</span></td>
<td class="ltx_td ltx_align_right"><span class="ltx_text" style="font-size:90%;">V</span></td>
<td class="ltx_td ltx_align_right"><span class="ltx_text" style="font-size:90%;">B</span></td>
<td class="ltx_td ltx_align_right"><span class="ltx_text" style="font-size:90%;">B</span></td>
<td class="ltx_td ltx_align_right"><span class="ltx_text" style="font-size:90%;">V</span></td>
</tr>
<tr class="ltx_tr">
<td class="ltx_td ltx_align_left"><span class="ltx_text" style="font-size:90%;">Moksha</span></td>
<td class="ltx_td ltx_align_center"><span class="ltx_text" style="font-size:90%;">V</span></td>
<td class="ltx_td ltx_align_right"><span class="ltx_text" style="font-size:90%;">B</span></td>
<td class="ltx_td ltx_align_right"><span class="ltx_text" style="font-size:90%;">—</span></td>
<td class="ltx_td ltx_align_right"><span class="ltx_text" style="font-size:90%;">—</span></td>
<td class="ltx_td ltx_align_right"><span class="ltx_text" style="font-size:90%;">V</span></td>
</tr>
<tr class="ltx_tr">
<td class="ltx_td ltx_align_left"><span class="ltx_text" style="font-size:90%;">North Sámi</span></td>
<td class="ltx_td ltx_align_center"><span class="ltx_text" style="font-size:90%;">V</span></td>
<td class="ltx_td ltx_align_right"><span class="ltx_text" style="font-size:90%;">V</span></td>
<td class="ltx_td ltx_align_right"><span class="ltx_text" style="font-size:90%;">V</span></td>
<td class="ltx_td ltx_align_right"><span class="ltx_text" style="font-size:90%;">V</span></td>
<td class="ltx_td ltx_align_right"><span class="ltx_text" style="font-size:90%;">V</span></td>
</tr>
<tr class="ltx_tr">
<td class="ltx_td ltx_align_left"><span class="ltx_text" style="font-size:90%;">Norw. bokmål</span></td>
<td class="ltx_td ltx_align_center"><span class="ltx_text" style="font-size:90%;">—</span></td>
<td class="ltx_td ltx_align_right"><span class="ltx_text" style="font-size:90%;">—</span></td>
<td class="ltx_td ltx_align_right"><span class="ltx_text" style="font-size:90%;">—</span></td>
<td class="ltx_td ltx_align_right"><span class="ltx_text" style="font-size:90%;">—</span></td>
<td class="ltx_td ltx_align_right"><span class="ltx_text" style="font-size:90%;">V</span></td>
</tr>
<tr class="ltx_tr">
<td class="ltx_td ltx_align_left"><span class="ltx_text" style="font-size:90%;">Norw. Nynorsk</span></td>
<td class="ltx_td ltx_align_center"><span class="ltx_text" style="font-size:90%;">B</span></td>
<td class="ltx_td ltx_align_right"><span class="ltx_text" style="font-size:90%;">B</span></td>
<td class="ltx_td ltx_align_right"><span class="ltx_text" style="font-size:90%;">—</span></td>
<td class="ltx_td ltx_align_right"><span class="ltx_text" style="font-size:90%;">—</span></td>
<td class="ltx_td ltx_align_right"><span class="ltx_text" style="font-size:90%;">—</span></td>
</tr>
<tr class="ltx_tr">
<td class="ltx_td ltx_align_left"><span class="ltx_text" style="font-size:90%;">Pite Sámi</span></td>
<td class="ltx_td ltx_align_center"><span class="ltx_text" style="font-size:90%;">—</span></td>
<td class="ltx_td ltx_align_right"><span class="ltx_text" style="font-size:90%;">B</span></td>
<td class="ltx_td ltx_align_right"><span class="ltx_text" style="font-size:90%;">—</span></td>
<td class="ltx_td ltx_align_right"><span class="ltx_text" style="font-size:90%;">—</span></td>
<td class="ltx_td ltx_align_right"><span class="ltx_text" style="font-size:90%;">V</span></td>
</tr>
<tr class="ltx_tr">
<td class="ltx_td ltx_align_left"><span class="ltx_text" style="font-size:90%;">Skolt Sámi</span></td>
<td class="ltx_td ltx_align_center"><span class="ltx_text" style="font-size:90%;">V</span></td>
<td class="ltx_td ltx_align_right"><span class="ltx_text" style="font-size:90%;">B</span></td>
<td class="ltx_td ltx_align_right"><span class="ltx_text" style="font-size:90%;">—</span></td>
<td class="ltx_td ltx_align_right"><span class="ltx_text" style="font-size:90%;">—</span></td>
<td class="ltx_td ltx_align_right"><span class="ltx_text" style="font-size:90%;">V</span></td>
</tr>
<tr class="ltx_tr">
<td class="ltx_td ltx_align_left"><span class="ltx_text" style="font-size:90%;">South Sámi</span></td>
<td class="ltx_td ltx_align_center"><span class="ltx_text" style="font-size:90%;">V</span></td>
<td class="ltx_td ltx_align_right"><span class="ltx_text" style="font-size:90%;">V</span></td>
<td class="ltx_td ltx_align_right"><span class="ltx_text" style="font-size:90%;">B</span></td>
<td class="ltx_td ltx_align_right"><span class="ltx_text" style="font-size:90%;">B</span></td>
<td class="ltx_td ltx_align_right"><span class="ltx_text" style="font-size:90%;">V</span></td>
</tr>
<tr class="ltx_tr">
<td class="ltx_td ltx_align_left"><span class="ltx_text" style="font-size:90%;">Udmurt</span></td>
<td class="ltx_td ltx_align_center"><span class="ltx_text" style="font-size:90%;">B</span></td>
<td class="ltx_td ltx_align_right"><span class="ltx_text" style="font-size:90%;">B</span></td>
<td class="ltx_td ltx_align_right"><span class="ltx_text" style="font-size:90%;">—</span></td>
<td class="ltx_td ltx_align_right"><span class="ltx_text" style="font-size:90%;">—</span></td>
<td class="ltx_td ltx_align_right"><span class="ltx_text" style="font-size:90%;">V</span></td>
</tr>
<tr class="ltx_tr">
<td class="ltx_td ltx_align_left"><span class="ltx_text" style="font-size:90%;">Voru</span></td>
<td class="ltx_td ltx_align_center"><span class="ltx_text" style="font-size:90%;">B</span></td>
<td class="ltx_td ltx_align_right"><span class="ltx_text" style="font-size:90%;">B</span></td>
<td class="ltx_td ltx_align_right"><span class="ltx_text" style="font-size:90%;">—</span></td>
<td class="ltx_td ltx_align_right"><span class="ltx_text" style="font-size:90%;">—</span></td>
<td class="ltx_td ltx_align_right"><span class="ltx_text" style="font-size:90%;">V</span></td>
</tr>
<tr class="ltx_tr">
<td class="ltx_td ltx_align_left"><span class="ltx_text" style="font-size:90%;">Western Mari</span></td>
<td class="ltx_td ltx_align_center"><span class="ltx_text" style="font-size:90%;">B</span></td>
<td class="ltx_td ltx_align_right"><span class="ltx_text" style="font-size:90%;">B</span></td>
<td class="ltx_td ltx_align_right"><span class="ltx_text" style="font-size:90%;">—</span></td>
<td class="ltx_td ltx_align_right"><span class="ltx_text" style="font-size:90%;">—</span></td>
<td class="ltx_td ltx_align_right"><span class="ltx_text" style="font-size:90%;">V</span></td>
</tr>
</tbody>
</table>
<figcaption class="ltx_caption ltx_centering" style="font-size:90%;"><span class="ltx_tag ltx_tag_table">Table 1: </span>Tools available for some of the languages in the GiellaLT
infrastructure. KBD = Keyboards, SP = spellers, GC = Grammar checker,
MT = machine translation, Dict = electronic dictionaries.
V = released, B = prerelease.</figcaption>
</figure>
</section>
<section id="S2.SS2" class="ltx_subsection">
<h3 class="ltx_title ltx_title_subsection">
<span class="ltx_tag ltx_tag_subsection">2.2 </span>Methods</h3>
<div id="S2.SS2.p1" class="ltx_para">
<p class="ltx_p">The foundation for all linguistic processing in the <span class="ltx_text ltx_font_italic">GiellaLT</span>
infrastructure is the morphological analyser, built using formalisms from Xerox:
<span class="ltx_text ltx_font_typewriter">lexc</span>, <span class="ltx_text ltx_font_typewriter">xfst</span> and optionally <span class="ltx_text ltx_font_typewriter">twolc</span>. From these source
files, the infrastructure creates <span class="ltx_text ltx_font_italic">finite state transducers</span> (FSTs)
using one of three supported FST compilers: Xerox
tools <cite class="ltx_cite ltx_citemacro_cite">[<a href="#bib.bib30" title="Finite state morphology" class="ltx_ref">2</a>]</cite>,
<a href="https://hfst.github.io" title="" class="ltx_ref ltx_href ltx_font_italic">HFST</a> <cite class="ltx_cite ltx_citemacro_cite">[<a href="#bib.bib160" title="Hfst—a system for creating nlp tools" class="ltx_ref">12</a>]</cite>, or
Foma <cite class="ltx_cite ltx_citemacro_cite">[<a href="#bib.bib111" title="Foma: a finite-state compiler and library" class="ltx_ref">6</a>]</cite>. All higher-order linguistic processing is done
using the VISLCG3 (<a href="http://visl.sdu.dk" title="" class="ltx_ref ltx_href ltx_font_italic">visl.sdu.dk</a>)
implementation <cite class="ltx_cite ltx_citemacro_cite">[<a href="#bib.bib72" title="Constraint grammar manual: 3rd version of the CG formalism variant" class="ltx_ref">4</a>]</cite> of Constraint
Grammar <cite class="ltx_cite ltx_citemacro_cite">[<a href="#bib.bib125" title="Constraint grammar as a framework for parsing unrestricted text" class="ltx_ref">9</a>]</cite>. Tokenisation is based on an FST model
initially presented by <cite class="ltx_cite ltx_citemacro_cite">[<a href="#bib.bib129" title="Beyond morphology: pattern matching with fst" class="ltx_ref">10</a>]</cite> in the Xerox tool
<span class="ltx_text ltx_font_typewriter">pmatch</span>. The resulting FST is applied using <span class="ltx_text ltx_font_typewriter">hfst-tokenise</span>. In
our tokenisation, sentence boundary detection is treated as a special case of
ambiguous tokenisation, and solved in the same way, approaching near-perfect
sentence boundary identification, cf. <cite class="ltx_cite ltx_citemacro_cite">[<a href="#bib.bib327" title="Seeing more than whitespace — tokenisation and disambiguation in a North Sámi grammar checker" class="ltx_ref">18</a>]</cite>.</p>
</div>
<div id="S2.SS2.p2" class="ltx_para">
<p class="ltx_p">Spell-checkers are based on weighted finite-state technology as described
by <cite class="ltx_cite ltx_citemacro_cite">[<a href="#bib.bib238" title="State-of-the-art in weighted finite-state spell-checking" class="ltx_ref">14</a>]</cite>. There is also support for neural network based
models of spell-checking <cite class="ltx_cite ltx_citemacro_cite">[<a href="#bib.bib121" title="You can’t suggest that?!: comparisons and improvements of speller error models" class="ltx_ref">8</a>]</cite>; this is, however, in its current
stage still not up to par with the traditional weighted finite-state models
given the current error corpus sizes. Since 2019 the <span class="ltx_text ltx_font_italic">GiellaLT</span>
infrastructure has supported building grammar checkers <cite class="ltx_cite ltx_citemacro_cite">[<a href="#bib.bib328" title="Many shades of grammar checking – launching a constraint grammar tool for North Sámi" class="ltx_ref">17</a>]</cite> and
these are available for some of the Sámi languages already. Another high-level
tool available within the <span class="ltx_text ltx_font_italic">GiellaLT</span> infrastructure is machine
translation. It works in cooperation with the
<a href="https://github.com/apertium" title="" class="ltx_ref ltx_href ltx_font_italic">Apertium</a>
infrastructure <cite class="ltx_cite ltx_citemacro_cite">[<a href="#bib.bib135" title="Recent advances in Apertium, a free/open-source rule-based machine translation platform for low-resource languages" class="ltx_ref">11</a>]</cite>.</p>
</div>
<div id="S2.SS2.p3" class="ltx_para">
<p class="ltx_p">Speech technology is based on a combination of the knowledge-based methods and
data-driven methods. For this reason we have started developing workflows and
best practices for gathering good spoken data for minoritised and less resourced
language scenarios we work with.
</p>
</div>
<div id="S2.SS2.p4" class="ltx_para">
<p class="ltx_p">The engineering solutions we use to tie together the linguistic work and the
technological work follow the contemporary approaches to <span class="ltx_text ltx_font_italic">continuous
integration and deployment</span>, which at the moment is implemented on
<span class="ltx_text ltx_font_italic">GitHub</span> systems including GitHub Actions as well as on some custom-built
continuous integration systems based on
<a href="https://taskcluster.net" title="" class="ltx_ref ltx_href ltx_font_italic">Taskcluster</a>. The continuous integration
tools are used both in the traditional way as in software engineering, to ensure
that the new additions to code and data did not fundamentally break the system
(e.g. with syntax errors) as well as ensuring the quality of the systems after
the change. The quality assurance aspect is based on automated testing of
evaluation factors that are both relevant for the products as well as
interesting for research and development, e.g. for spell-checkers we test and
track the development of <span class="ltx_text ltx_font_italic">precision and recall</span> of the system over time.</p>
</div>
</section>
</section>
<section id="S3" class="ltx_section">
<h2 class="ltx_title ltx_title_section">
<span class="ltx_tag ltx_tag_section">3 </span>Linguistic data</h2>
<div id="S3.p1" class="ltx_para">
<p class="ltx_p">There are two types of linguistic data we gather and develop in the
infrastructure, one is the dictionaries, grammars and descriptions for each
language and the other is corpus data. Even if our system is not corpus-driven
in the way most other contemporary systems are, once we develop the
knowledge-based systems we are working for, the real-world data from language
users becomes a very important resource for testing and evaluating the systems
we have built. The corpus data we collect is also enriched by language experts
by annotating spelling and grammar errors with corrections included, or by doing
other linguistic annotations and corrections to automated annotations. For this
reason and also because we work with many languages that have very little data
available, the corpora we collect are carefully selected and curated.</p>
</div>
<div id="S3.p2" class="ltx_para">
<p class="ltx_p">The linguistic data can be roughly evaluated without large manually
annotated gold corpora by calculating the number of words in the dictionaries
and a <span class="ltx_text ltx_font_italic">naïve coverage</span>. Words counted are lemma entries, thus words
covered by productive morphology will not be included in the
figure.<span id="footnote1" class="ltx_note ltx_role_footnote"><sup class="ltx_note_mark">1</sup><span class="ltx_note_outer"><span class="ltx_note_content"><sup class="ltx_note_mark">1</sup>
<span class="ltx_tag ltx_tag_note">1</span>
Natural language productive morphology in complex morphologies
we work with is usually cyclical, so theoretic word count for derived and
compounded forms of all languages is infinite.</span></span></span> The naïve coverage will give an
intuition for the extent of the effect that the derivational morphology has with regard to
real world word-form usage. Here naïve coverage is calculated as the proportion
of tokens in the whole corpus that get any analysis; in this case we use the
tokenisation provided by the corpus analysis tools, which is based on
left-to-right longest match tokenisation that falls back on space-separated
tokens with special cases for punctuation, i.e. mostly natural tokenisation for
the western languages with Latin and Cyrillic
scripts. <span id="footnote2" class="ltx_note ltx_role_footnote"><sup class="ltx_note_mark">2</sup><span class="ltx_note_outer"><span class="ltx_note_content"><sup class="ltx_note_mark">2</sup>
<span class="ltx_tag ltx_tag_note">2</span>
cf. <a href="https://github.com/giellalt/giella-core/blob/master/scripts/coverage-etc.bash" title="" class="ltx_ref ltx_url ltx_font_typewriter">https://github.com/giellalt/giella-core/blob/master/scripts/coverage-etc.bash</a></span></span></span>
The figures are given in table <a href="#S3.T2" title="Table 2 ‣ 3 Linguistic data ‣ GiellaLT — a stable infrastructure for Nordic minority languages and beyond" class="ltx_ref"><span class="ltx_text ltx_ref_tag"><span class="ltx_text" style="font-size:90%;">2</span></span></a>.</p>
</div>
<figure id="S3.T2" class="ltx_table">
<table class="ltx_tabular ltx_centering ltx_guessed_headers ltx_align_middle">
<thead class="ltx_thead">
<tr class="ltx_tr">
<th class="ltx_td ltx_align_left ltx_th ltx_th_column ltx_th_row"><span class="ltx_text" style="font-size:90%;">Language</span></th>
<th class="ltx_td ltx_align_center ltx_th ltx_th_column ltx_th_row"><span class="ltx_text" style="font-size:90%;">ISO</span></th>
<th class="ltx_td ltx_align_right ltx_th ltx_th_column"><span class="ltx_text" style="font-size:90%;">Words</span></th>
<th class="ltx_td ltx_align_right ltx_th ltx_th_column"><span class="ltx_text" style="font-size:90%;">Coverage</span></th>
</tr>
</thead>
<tbody class="ltx_tbody">
<tr class="ltx_tr">
<th class="ltx_td ltx_align_left ltx_th ltx_th_row ltx_border_tt"><span class="ltx_text" style="font-size:90%;">Eastern Mari</span></th>
<th class="ltx_td ltx_align_center ltx_th ltx_th_row ltx_border_tt"><span class="ltx_text ltx_font_typewriter" style="font-size:90%;">mhr</span></th>
<td class="ltx_td ltx_align_right ltx_border_tt"><span class="ltx_text" style="font-size:90%;">55 k</span></td>
<td class="ltx_td ltx_align_right ltx_border_tt"><span class="ltx_text" style="font-size:90%;">87 %</span></td>
</tr>
<tr class="ltx_tr">
<th class="ltx_td ltx_align_left ltx_th ltx_th_row"><span class="ltx_text" style="font-size:90%;">Erzya</span></th>
<th class="ltx_td ltx_align_center ltx_th ltx_th_row"><span class="ltx_text ltx_font_typewriter" style="font-size:90%;">myv</span></th>
<td class="ltx_td ltx_align_right"><span class="ltx_text" style="font-size:90%;">102 k</span></td>
<td class="ltx_td ltx_align_right"><span class="ltx_text" style="font-size:90%;">—</span></td>
</tr>
<tr class="ltx_tr">
<th class="ltx_td ltx_align_left ltx_th ltx_th_row"><span class="ltx_text" style="font-size:90%;">Faroese</span></th>
<th class="ltx_td ltx_align_center ltx_th ltx_th_row"><span class="ltx_text ltx_font_typewriter" style="font-size:90%;">fao</span></th>
<td class="ltx_td ltx_align_right"><span class="ltx_text" style="font-size:90%;">72 k</span></td>
<td class="ltx_td ltx_align_right"><span class="ltx_text" style="font-size:90%;">94 %</span></td>
</tr>
<tr class="ltx_tr">
<th class="ltx_td ltx_align_left ltx_th ltx_th_row"><span class="ltx_text" style="font-size:90%;">Finnish</span></th>
<th class="ltx_td ltx_align_center ltx_th ltx_th_row"><span class="ltx_text ltx_font_typewriter" style="font-size:90%;">fin</span></th>
<td class="ltx_td ltx_align_right"><span class="ltx_text" style="font-size:90%;">412 k</span></td>
<td class="ltx_td ltx_align_right"><span class="ltx_text" style="font-size:90%;">95 %</span></td>
</tr>
<tr class="ltx_tr">
<th class="ltx_td ltx_align_left ltx_th ltx_th_row"><span class="ltx_text" style="font-size:90%;">Greenlandic</span></th>
<th class="ltx_td ltx_align_center ltx_th ltx_th_row"><span class="ltx_text ltx_font_typewriter" style="font-size:90%;">kal</span></th>
<td class="ltx_td ltx_align_right"><span class="ltx_text" style="font-size:90%;">12 k</span></td>
<td class="ltx_td ltx_align_right"><span class="ltx_text" style="font-size:90%;">59 %</span></td>
</tr>
<tr class="ltx_tr">
<th class="ltx_td ltx_align_left ltx_th ltx_th_row"><span class="ltx_text" style="font-size:90%;">Inari Sámi†</span></th>
<th class="ltx_td ltx_align_center ltx_th ltx_th_row"><span class="ltx_text ltx_font_typewriter" style="font-size:90%;">smn</span></th>
<td class="ltx_td ltx_align_right"><span class="ltx_text" style="font-size:90%;">77 k</span></td>
<td class="ltx_td ltx_align_right"><span class="ltx_text" style="font-size:90%;">91 %</span></td>
</tr>
<tr class="ltx_tr">
<th class="ltx_td ltx_align_left ltx_th ltx_th_row"><span class="ltx_text" style="font-size:90%;">Ingrian</span></th>
<th class="ltx_td ltx_align_center ltx_th ltx_th_row"><span class="ltx_text ltx_font_typewriter" style="font-size:90%;">izh</span></th>
<td class="ltx_td ltx_align_right"><span class="ltx_text" style="font-size:90%;">2 k</span></td>
<td class="ltx_td ltx_align_right"><span class="ltx_text" style="font-size:90%;">—</span></td>
</tr>
<tr class="ltx_tr">
<th class="ltx_td ltx_align_left ltx_th ltx_th_row"><span class="ltx_text" style="font-size:90%;">Komi-Zyrian</span></th>
<th class="ltx_td ltx_align_center ltx_th ltx_th_row"><span class="ltx_text ltx_font_typewriter" style="font-size:90%;">kpv</span></th>
<td class="ltx_td ltx_align_right"><span class="ltx_text" style="font-size:90%;">195 k</span></td>
<td class="ltx_td ltx_align_right"><span class="ltx_text" style="font-size:90%;">99 %</span></td>
</tr>
<tr class="ltx_tr">
<th class="ltx_td ltx_align_left ltx_th ltx_th_row"><span class="ltx_text" style="font-size:90%;">Kven</span></th>
<th class="ltx_td ltx_align_center ltx_th ltx_th_row"><span class="ltx_text ltx_font_typewriter" style="font-size:90%;">fkv</span></th>
<td class="ltx_td ltx_align_right"><span class="ltx_text" style="font-size:90%;">16 k</span></td>
<td class="ltx_td ltx_align_right"><span class="ltx_text" style="font-size:90%;">75 %</span></td>
</tr>
<tr class="ltx_tr">
<th class="ltx_td ltx_align_left ltx_th ltx_th_row"><span class="ltx_text" style="font-size:90%;">Livvi</span></th>
<th class="ltx_td ltx_align_center ltx_th ltx_th_row"><span class="ltx_text ltx_font_typewriter" style="font-size:90%;">olo</span></th>
<td class="ltx_td ltx_align_right"><span class="ltx_text" style="font-size:90%;">58 k</span></td>
<td class="ltx_td ltx_align_right"><span class="ltx_text" style="font-size:90%;">—</span></td>
</tr>
<tr class="ltx_tr">
<th class="ltx_td ltx_align_left ltx_th ltx_th_row"><span class="ltx_text" style="font-size:90%;">Lule Sámi†</span></th>
<th class="ltx_td ltx_align_center ltx_th ltx_th_row"><span class="ltx_text ltx_font_typewriter" style="font-size:90%;">smj</span></th>
<td class="ltx_td ltx_align_right"><span class="ltx_text" style="font-size:90%;">76 k</span></td>
<td class="ltx_td ltx_align_right"><span class="ltx_text" style="font-size:90%;">93 %</span></td>
</tr>
<tr class="ltx_tr">
<th class="ltx_td ltx_align_left ltx_th ltx_th_row"><span class="ltx_text" style="font-size:90%;">Moksha</span></th>
<th class="ltx_td ltx_align_center ltx_th ltx_th_row"><span class="ltx_text ltx_font_typewriter" style="font-size:90%;">mdf</span></th>
<td class="ltx_td ltx_align_right"><span class="ltx_text" style="font-size:90%;">41 k</span></td>
<td class="ltx_td ltx_align_right"><span class="ltx_text" style="font-size:90%;">—</span></td>
</tr>
<tr class="ltx_tr">
<th class="ltx_td ltx_align_left ltx_th ltx_th_row"><span class="ltx_text" style="font-size:90%;">North Sámi†</span></th>
<th class="ltx_td ltx_align_center ltx_th ltx_th_row"><span class="ltx_text ltx_font_typewriter" style="font-size:90%;">sme</span></th>
<td class="ltx_td ltx_align_right"><span class="ltx_text" style="font-size:90%;">164 k</span></td>
<td class="ltx_td ltx_align_right"><span class="ltx_text" style="font-size:90%;">91 %</span></td>
</tr>
<tr class="ltx_tr">
<th class="ltx_td ltx_align_left ltx_th ltx_th_row"><span class="ltx_text" style="font-size:90%;">Norw. Bokmål</span></th>
<th class="ltx_td ltx_align_center ltx_th ltx_th_row"><span class="ltx_text ltx_font_typewriter" style="font-size:90%;">nob</span></th>
<td class="ltx_td ltx_align_right"><span class="ltx_text" style="font-size:90%;">54 k</span></td>
<td class="ltx_td ltx_align_right"><span class="ltx_text" style="font-size:90%;">95 %</span></td>
</tr>
<tr class="ltx_tr">
<th class="ltx_td ltx_align_left ltx_th ltx_th_row"><span class="ltx_text" style="font-size:90%;">Pite Sámi†</span></th>
<th class="ltx_td ltx_align_center ltx_th ltx_th_row"><span class="ltx_text ltx_font_typewriter" style="font-size:90%;">sje</span></th>
<td class="ltx_td ltx_align_right"><span class="ltx_text" style="font-size:90%;">5 k</span></td>
<td class="ltx_td ltx_align_right"><span class="ltx_text" style="font-size:90%;">100 %</span></td>
</tr>
<tr class="ltx_tr">
<th class="ltx_td ltx_align_left ltx_th ltx_th_row"><span class="ltx_text" style="font-size:90%;">Skolt Sámi†</span></th>
<th class="ltx_td ltx_align_center ltx_th ltx_th_row"><span class="ltx_text ltx_font_typewriter" style="font-size:90%;">sms</span></th>
<td class="ltx_td ltx_align_right"><span class="ltx_text" style="font-size:90%;">66 k</span></td>
<td class="ltx_td ltx_align_right"><span class="ltx_text" style="font-size:90%;">82 %</span></td>
</tr>
<tr class="ltx_tr">
<th class="ltx_td ltx_align_left ltx_th ltx_th_row"><span class="ltx_text" style="font-size:90%;">South Sámi†</span></th>
<th class="ltx_td ltx_align_center ltx_th ltx_th_row"><span class="ltx_text ltx_font_typewriter" style="font-size:90%;">sma</span></th>
<td class="ltx_td ltx_align_right"><span class="ltx_text" style="font-size:90%;">86 k</span></td>
<td class="ltx_td ltx_align_right"><span class="ltx_text" style="font-size:90%;">84 %</span></td>
</tr>
<tr class="ltx_tr">
<th class="ltx_td ltx_align_left ltx_th ltx_th_row"><span class="ltx_text" style="font-size:90%;">Udmurt</span></th>
<th class="ltx_td ltx_align_center ltx_th ltx_th_row"><span class="ltx_text ltx_font_typewriter" style="font-size:90%;">udm</span></th>
<td class="ltx_td ltx_align_right"><span class="ltx_text" style="font-size:90%;">47 k</span></td>
<td class="ltx_td ltx_align_right"><span class="ltx_text" style="font-size:90%;">—</span></td>
</tr>
<tr class="ltx_tr">
<th class="ltx_td ltx_align_left ltx_th ltx_th_row"><span class="ltx_text" style="font-size:90%;">Voru</span></th>
<th class="ltx_td ltx_align_center ltx_th ltx_th_row"><span class="ltx_text ltx_font_typewriter" style="font-size:90%;">vro</span></th>
<td class="ltx_td ltx_align_right"><span class="ltx_text" style="font-size:90%;">20 k</span></td>
<td class="ltx_td ltx_align_right"><span class="ltx_text" style="font-size:90%;">90 %</span></td>
</tr>
<tr class="ltx_tr">
<th class="ltx_td ltx_align_left ltx_th ltx_th_row"><span class="ltx_text" style="font-size:90%;">Western Mari</span></th>
<th class="ltx_td ltx_align_center ltx_th ltx_th_row"><span class="ltx_text ltx_font_typewriter" style="font-size:90%;">mrj</span></th>
<td class="ltx_td ltx_align_right"><span class="ltx_text" style="font-size:90%;">26 k</span></td>
<td class="ltx_td ltx_align_right"><span class="ltx_text" style="font-size:90%;">—</span></td>
</tr>
</tbody>
</table>
<figcaption class="ltx_caption ltx_centering" style="font-size:90%;"><span class="ltx_tag ltx_tag_table">Table 2: </span>Dictionary sizes and coverage for a number of languages in the
<span class="ltx_text ltx_font_italic">GiellaLT</span> infrastructure; ISO codes are ISO 639–3.
<br class="ltx_break">† The figures
for some of the Sámi language word counts include 33.5 k proper names in a
shared file.</figcaption>
</figure>
<div id="S3.p3" class="ltx_para">
<p class="ltx_p">It is noteworthy that the naïve coverages we count are based on the corpora we
have collected and these corpora have been seen by people working on the
dictionaries, in other words it is technically not a clean test setup. For many
of the languages we work with this is necessitated by the fact that the corpus
we have is all texts that are available for the language at all. Not making full
use of it would hinder the development of the language model.</p>
</div>
<div id="S3.p4" class="ltx_para">
<p class="ltx_p">Making full use of the corpus is more valuable
for the language communities than hiding parts of the corpus from the
lexicographers for testing purposes.</p>
</div>
<div id="S3.p5" class="ltx_para">
<p class="ltx_p">For this reason the figures should be considered as a rough guideline, as naïve
coverage would be anyway. For our intents and purposes, we can see from the
naïve coverage if the dictionaries need attention e.g., for spell-checkers to be
usable enough as to not show too many red underlines in regular everyday texts.</p>
</div>
<div id="S3.p6" class="ltx_para">
<p class="ltx_p">We collect texts for the Nordic languages as well as several other languages
that we use and develop. The largest corpora we have harvested are for the Sámi
languages: North, Lule, South, Inari and Skolt Sámi. The Sámi corpus is owned
by the Norwegian Sámi parliament, and all corpora are administered and made
accessible to the public by the Divvun and Giellatekno groups. The corpora for
some of the Uralic languages in Russia are large, and for Meadow Mari even
larger than for North Sámi. Some of the corpora for larger, non-minority
languages (e.g. Finnish, Norwegian) are moderately sized, since they are already
covered by other projects such as OPUS <cite class="ltx_cite ltx_citemacro_cite">[<a href="#bib.bib215" title="Parallel data, tools and interfaces in OPUS" class="ltx_ref">15</a>]</cite>, and we only need to create
specific corpora for our applications, such as grammar error corpora by L2
speakers in order to develop a grammar checker.</p>
</div>
<div id="S3.p7" class="ltx_para">
<p class="ltx_p">The corpora are split in two based on restrictions set by the copyright owners.
Researchers and anyone else can freely download the free part. The whole corpus,
also the restricted part, is accessible via a public search
interface<span id="footnote3" class="ltx_note ltx_role_footnote"><sup class="ltx_note_mark">3</sup><span class="ltx_note_outer"><span class="ltx_note_content"><sup class="ltx_note_mark">3</sup>
<span class="ltx_tag ltx_tag_note">3</span>
<a href="https://gtweb.uit.no/korp" title="" class="ltx_ref ltx_href">gtweb.uit.no/korp</a> (Sámi),
<a href="https://gtweb.uit.no/f_korp" title="" class="ltx_ref ltx_href">gtweb.uit.no/f_korp</a> (Baltic Finnic and
Faroese), <a href="https://gtweb.uit.no/u_korp" title="" class="ltx_ref ltx_href">gtweb.uit.no/u_korp</a> (other Uralic
languages). Cf. also
<a href="https://giellalt.github.io/ling/corpus_repositories.html" title="" class="ltx_ref ltx_href">More info about
the corpora.</a></span></span></span>. We have written a tool named
<a href="https://github.com/giellalt/CorpusTools" title="" class="ltx_ref ltx_href">CorpusTools</a> to administer,
convert and analyse the corpus texts. Original texts and their metadata are
saved in GitHub repositories, then converted to a common XML format, to ease
further use of the texts. The sizes of corpora are summarised in
table <a href="#S3.T3" title="Table 3 ‣ 3 Linguistic data ‣ GiellaLT — a stable infrastructure for Nordic minority languages and beyond" class="ltx_ref"><span class="ltx_text ltx_ref_tag"><span class="ltx_text" style="font-size:90%;">3</span></span></a>; the token count is based on simple space-separated
tokens with no extra tokenisation.<span id="footnote4" class="ltx_note ltx_role_footnote"><sup class="ltx_note_mark">4</sup><span class="ltx_note_outer"><span class="ltx_note_content"><sup class="ltx_note_mark">4</sup>
<span class="ltx_tag ltx_tag_note">4</span>
The corpora are being constantly
harvested; the status as of 2023-02-03 is shown. The current status will be
available in our GitHub repositories in the near future.</span></span></span> The languages shown in
the table are the Nordic and related languages; for a full listing refer to our
website<span id="footnote5" class="ltx_note ltx_role_footnote"><sup class="ltx_note_mark">5</sup><span class="ltx_note_outer"><span class="ltx_note_content"><sup class="ltx_note_mark">5</sup>
<span class="ltx_tag ltx_tag_note">5</span>
<a href="https://giellalt.github.io/" title="" class="ltx_ref ltx_url ltx_font_typewriter">https://giellalt.github.io/</a></span></span></span>. The corpus texts have some
metadata and markups relevant for our use cases, such as grammar checking and
correction.</p>
</div>
<div id="S3.p8" class="ltx_para">
<p class="ltx_p">Recently, we have also begun collecting speech corpora for speech technology
related projects. For example, for an ongoing Lule Sámi TTS project we reused
a part of a Lule Sámi gold corpus from 2013, and collected additional texts we
knew to be well written and already proofread, before proofreading these texts
once more to avoid confusion when reading the text aloud during the TTS
recordings. The Lule Sámi TTS text corpus consists of various text styles
(news, educational, parliament etc.) with altogether over 74,000 words.</p>
</div>
<div id="S3.p9" class="ltx_para">
<p class="ltx_p">Currently, we have recorded two Lule Sámi voice talents using this text corpus,
and after processing the recordings, a speech corpus with altogether 20 hours
will be ready to use for speech technology purposes.</p>
</div>
<figure id="S3.T3" class="ltx_table">
<table class="ltx_tabular ltx_centering ltx_guessed_headers ltx_align_middle">
<thead class="ltx_thead">
<tr class="ltx_tr">
<th class="ltx_td ltx_align_left ltx_th ltx_th_column ltx_th_row"><span class="ltx_text" style="font-size:90%;">Language</span></th>
<th class="ltx_td ltx_align_left ltx_th ltx_th_column ltx_th_row"><span class="ltx_text" style="font-size:90%;">ISO</span></th>
<th class="ltx_td ltx_align_right ltx_th ltx_th_column"><span class="ltx_text" style="font-size:90%;">Tokens</span></th>
<th class="ltx_td ltx_align_right ltx_th ltx_th_column"><span class="ltx_text" style="font-size:90%;">Speech</span></th>
</tr>
</thead>
<tbody class="ltx_tbody">
<tr class="ltx_tr">
<th class="ltx_td ltx_align_left ltx_th ltx_th_row ltx_border_tt"><span class="ltx_text" style="font-size:90%;">Eastern Mari</span></th>
<th class="ltx_td ltx_align_left ltx_th ltx_th_row ltx_border_tt"><span class="ltx_text ltx_font_typewriter" style="font-size:90%;">mhr</span></th>
<td class="ltx_td ltx_align_right ltx_border_tt"><span class="ltx_text" style="font-size:90%;">57 M</span></td>
<td class="ltx_td ltx_align_right ltx_border_tt"><span class="ltx_text" style="font-size:90%;">—</span></td>
</tr>
<tr class="ltx_tr">
<th class="ltx_td ltx_align_left ltx_th ltx_th_row"><span class="ltx_text" style="font-size:90%;">Erzya</span></th>
<th class="ltx_td ltx_align_left ltx_th ltx_th_row"><span class="ltx_text ltx_font_typewriter" style="font-size:90%;">myv</span></th>
<td class="ltx_td ltx_align_right"><span class="ltx_text" style="font-size:90%;">14 M</span></td>
<td class="ltx_td ltx_align_right"><span class="ltx_text" style="font-size:90%;">—</span></td>
</tr>
<tr class="ltx_tr">
<th class="ltx_td ltx_align_left ltx_th ltx_th_row"><span class="ltx_text" style="font-size:90%;">Faroese</span></th>
<th class="ltx_td ltx_align_left ltx_th ltx_th_row"><span class="ltx_text ltx_font_typewriter" style="font-size:90%;">fao</span></th>
<td class="ltx_td ltx_align_right"><span class="ltx_text" style="font-size:90%;">11 M</span></td>
<td class="ltx_td ltx_align_right"><span class="ltx_text" style="font-size:90%;">—</span></td>
</tr>
<tr class="ltx_tr">
<th class="ltx_td ltx_align_left ltx_th ltx_th_row"><span class="ltx_text" style="font-size:90%;">Finnish</span></th>
<th class="ltx_td ltx_align_left ltx_th ltx_th_row"><span class="ltx_text ltx_font_typewriter" style="font-size:90%;">fin</span></th>
<td class="ltx_td ltx_align_right"><span class="ltx_text" style="font-size:90%;">2 M</span></td>
<td class="ltx_td ltx_align_right"><span class="ltx_text" style="font-size:90%;">—</span></td>
</tr>
<tr class="ltx_tr">
<th class="ltx_td ltx_align_left ltx_th ltx_th_row"><span class="ltx_text" style="font-size:90%;">Greenlandic</span></th>
<th class="ltx_td ltx_align_left ltx_th ltx_th_row"><span class="ltx_text ltx_font_typewriter" style="font-size:90%;">kal</span></th>
<td class="ltx_td ltx_align_right"><span class="ltx_text" style="font-size:90%;">0.5 M</span></td>
<td class="ltx_td ltx_align_right"><span class="ltx_text" style="font-size:90%;">—</span></td>
</tr>
<tr class="ltx_tr">
<th class="ltx_td ltx_align_left ltx_th ltx_th_row"><span class="ltx_text" style="font-size:90%;">Inari Sámi</span></th>
<th class="ltx_td ltx_align_left ltx_th ltx_th_row"><span class="ltx_text ltx_font_typewriter" style="font-size:90%;">smn</span></th>
<td class="ltx_td ltx_align_right"><span class="ltx_text" style="font-size:90%;">3 M</span></td>
<td class="ltx_td ltx_align_right"><span class="ltx_text" style="font-size:90%;">—</span></td>
</tr>
<tr class="ltx_tr">
<th class="ltx_td ltx_align_left ltx_th ltx_th_row"><span class="ltx_text" style="font-size:90%;">Ingrian</span></th>
<th class="ltx_td ltx_align_left ltx_th ltx_th_row"><span class="ltx_text ltx_font_typewriter" style="font-size:90%;">izh</span></th>
<td class="ltx_td ltx_align_right"><span class="ltx_text" style="font-size:90%;">—</span></td>
<td class="ltx_td ltx_align_right"><span class="ltx_text" style="font-size:90%;">—</span></td>
</tr>
<tr class="ltx_tr">
<th class="ltx_td ltx_align_left ltx_th ltx_th_row"><span class="ltx_text" style="font-size:90%;">Komi-Zyrian</span></th>
<th class="ltx_td ltx_align_left ltx_th ltx_th_row"><span class="ltx_text ltx_font_typewriter" style="font-size:90%;">kpv</span></th>
<td class="ltx_td ltx_align_right"><span class="ltx_text" style="font-size:90%;">1 M</span></td>
<td class="ltx_td ltx_align_right"><span class="ltx_text" style="font-size:90%;">—</span></td>
</tr>
<tr class="ltx_tr">
<th class="ltx_td ltx_align_left ltx_th ltx_th_row"><span class="ltx_text" style="font-size:90%;">Kven</span></th>
<th class="ltx_td ltx_align_left ltx_th ltx_th_row"><span class="ltx_text ltx_font_typewriter" style="font-size:90%;">fkv</span></th>
<td class="ltx_td ltx_align_right"><span class="ltx_text" style="font-size:90%;">0.5 M</span></td>
<td class="ltx_td ltx_align_right"><span class="ltx_text" style="font-size:90%;">—</span></td>
</tr>
<tr class="ltx_tr">
<th class="ltx_td ltx_align_left ltx_th ltx_th_row"><span class="ltx_text" style="font-size:90%;">Livvi</span></th>
<th class="ltx_td ltx_align_left ltx_th ltx_th_row"><span class="ltx_text ltx_font_typewriter" style="font-size:90%;">olo</span></th>
<td class="ltx_td ltx_align_right"><span class="ltx_text" style="font-size:90%;">0.3 M</span></td>
<td class="ltx_td ltx_align_right"><span class="ltx_text" style="font-size:90%;">—</span></td>
</tr>
<tr class="ltx_tr">
<th class="ltx_td ltx_align_left ltx_th ltx_th_row"><span class="ltx_text" style="font-size:90%;">Lule Sámi</span></th>
<th class="ltx_td ltx_align_left ltx_th ltx_th_row"><span class="ltx_text ltx_font_typewriter" style="font-size:90%;">smj</span></th>
<td class="ltx_td ltx_align_right"><span class="ltx_text" style="font-size:90%;">2 M</span></td>
<td class="ltx_td ltx_align_right"><span class="ltx_text" style="font-size:90%;">20 h</span></td>
</tr>
<tr class="ltx_tr">
<th class="ltx_td ltx_align_left ltx_th ltx_th_row"><span class="ltx_text" style="font-size:90%;">Moksha</span></th>
<th class="ltx_td ltx_align_left ltx_th ltx_th_row"><span class="ltx_text ltx_font_typewriter" style="font-size:90%;">mdf</span></th>
<td class="ltx_td ltx_align_right"><span class="ltx_text" style="font-size:90%;">13 M</span></td>
<td class="ltx_td ltx_align_right"><span class="ltx_text" style="font-size:90%;">—</span></td>
</tr>
<tr class="ltx_tr">
<th class="ltx_td ltx_align_left ltx_th ltx_th_row"><span class="ltx_text" style="font-size:90%;">North Sámi</span></th>
<th class="ltx_td ltx_align_left ltx_th ltx_th_row"><span class="ltx_text ltx_font_typewriter" style="font-size:90%;">sme</span></th>
<td class="ltx_td ltx_align_right"><span class="ltx_text" style="font-size:90%;">39 M</span></td>
<td class="ltx_td ltx_align_right"><span class="ltx_text" style="font-size:90%;">38 h</span></td>
</tr>
<tr class="ltx_tr">
<th class="ltx_td ltx_align_left ltx_th ltx_th_row"><span class="ltx_text" style="font-size:90%;">Norw. bokmål</span></th>
<th class="ltx_td ltx_align_left ltx_th ltx_th_row"><span class="ltx_text ltx_font_typewriter" style="font-size:90%;">nob</span></th>
<td class="ltx_td ltx_align_right"><span class="ltx_text" style="font-size:90%;">14 M</span></td>
<td class="ltx_td ltx_align_right"><span class="ltx_text" style="font-size:90%;">—</span></td>
</tr>
<tr class="ltx_tr">
<th class="ltx_td ltx_align_left ltx_th ltx_th_row"><span class="ltx_text" style="font-size:90%;">Norw. Nynorsk</span></th>
<th class="ltx_td ltx_align_left ltx_th ltx_th_row"><span class="ltx_text ltx_font_typewriter" style="font-size:90%;">nno</span></th>
<td class="ltx_td ltx_align_right"><span class="ltx_text" style="font-size:90%;">0.7 M</span></td>
<td class="ltx_td ltx_align_right"><span class="ltx_text" style="font-size:90%;">—</span></td>
</tr>
<tr class="ltx_tr">
<th class="ltx_td ltx_align_left ltx_th ltx_th_row"><span class="ltx_text" style="font-size:90%;">Pite Sámi</span></th>
<th class="ltx_td ltx_align_left ltx_th ltx_th_row"><span class="ltx_text ltx_font_typewriter" style="font-size:90%;">sje</span></th>
<td class="ltx_td ltx_align_right"><span class="ltx_text" style="font-size:90%;">—</span></td>
<td class="ltx_td ltx_align_right"><span class="ltx_text" style="font-size:90%;">—</span></td>
</tr>
<tr class="ltx_tr">
<th class="ltx_td ltx_align_left ltx_th ltx_th_row"><span class="ltx_text" style="font-size:90%;">Skolt Sámi</span></th>
<th class="ltx_td ltx_align_left ltx_th ltx_th_row"><span class="ltx_text ltx_font_typewriter" style="font-size:90%;">sms</span></th>
<td class="ltx_td ltx_align_right"><span class="ltx_text" style="font-size:90%;">0.25 M</span></td>
<td class="ltx_td ltx_align_right"><span class="ltx_text" style="font-size:90%;">—</span></td>
</tr>
<tr class="ltx_tr">
<th class="ltx_td ltx_align_left ltx_th ltx_th_row"><span class="ltx_text" style="font-size:90%;">South Sámi</span></th>
<th class="ltx_td ltx_align_left ltx_th ltx_th_row"><span class="ltx_text ltx_font_typewriter" style="font-size:90%;">sma</span></th>
<td class="ltx_td ltx_align_right"><span class="ltx_text" style="font-size:90%;">2 M</span></td>
<td class="ltx_td ltx_align_right"><span class="ltx_text" style="font-size:90%;">—</span></td>
</tr>
<tr class="ltx_tr">
<th class="ltx_td ltx_align_left ltx_th ltx_th_row"><span class="ltx_text" style="font-size:90%;">Udmurt</span></th>
<th class="ltx_td ltx_align_left ltx_th ltx_th_row"><span class="ltx_text ltx_font_typewriter" style="font-size:90%;">udm</span></th>
<td class="ltx_td ltx_align_right"><span class="ltx_text" style="font-size:90%;">—</span></td>
<td class="ltx_td ltx_align_right"><span class="ltx_text" style="font-size:90%;">—</span></td>
</tr>
<tr class="ltx_tr">
<th class="ltx_td ltx_align_left ltx_th ltx_th_row"><span class="ltx_text" style="font-size:90%;">Voru</span></th>
<th class="ltx_td ltx_align_left ltx_th ltx_th_row"><span class="ltx_text ltx_font_typewriter" style="font-size:90%;">vro</span></th>
<td class="ltx_td ltx_align_right"><span class="ltx_text" style="font-size:90%;">0.67 M</span></td>
<td class="ltx_td ltx_align_right"><span class="ltx_text" style="font-size:90%;">—</span></td>
</tr>
<tr class="ltx_tr">
<th class="ltx_td ltx_align_left ltx_th ltx_th_row"><span class="ltx_text" style="font-size:90%;">Western Mari</span></th>
<th class="ltx_td ltx_align_left ltx_th ltx_th_row"><span class="ltx_text ltx_font_typewriter" style="font-size:90%;">mrj</span></th>
<td class="ltx_td ltx_align_right"><span class="ltx_text" style="font-size:90%;">6 M</span></td>
<td class="ltx_td ltx_align_right"><span class="ltx_text" style="font-size:90%;">—</span></td>
</tr>
</tbody>
</table>
<figcaption class="ltx_caption ltx_centering" style="font-size:90%;"><span class="ltx_tag ltx_tag_table">Table 3: </span>Corpus sizes for some of the languages in our infrastructure.
Tokens are space-separated tokens.</figcaption>
</figure>
<div id="S3.p10" class="ltx_para">
<p class="ltx_p">As spoken language technology is based on data and machine learning, the
procedures and pipelines described above could be applied to any (minority)
language with a low-resource setting, in the task of developing speech
technology applications. Most of the applications discussed here can be piloted
with or further developed with relatively small data sets (even with &lt; 5 hrs of
paired data), compared to the amounts of data used for respective tools for
majority languages (see, e.g., <cite class="ltx_cite ltx_citemacro_cite">[<a href="#bib.bib117" title="The LJ speech dataset. 2017" class="ltx_ref">7</a>]</cite><span id="footnote6" class="ltx_note ltx_role_footnote"><sup class="ltx_note_mark">6</sup><span class="ltx_note_outer"><span class="ltx_note_content"><sup class="ltx_note_mark">6</sup>
<span class="ltx_tag ltx_tag_note">6</span>
The LJ Speech
dataset consists of 13,100 short audio clips of a single English speaker with a
total length of approximately 24 hours.</span></span></span>). This is largely possible thanks to
the available open source materials and technologies, especially those relying
on, e.g., <span class="ltx_text ltx_font_italic">transfer learning</span>, i.e. fine-tuning of
models <cite class="ltx_cite ltx_citemacro_cite">[<a href="#bib.bib80" title="Towards transfer learning for end-to-end speech synthesis from deep pre-trained language models" class="ltx_ref">5</a>]</cite>.</p>
</div>
</section>
<section id="S4" class="ltx_section">
<h2 class="ltx_title ltx_title_section">
<span class="ltx_tag ltx_tag_section">4 </span>Conclusion</h2>
<div id="S4.p1" class="ltx_para">
<p class="ltx_p">In this article we have presented recent developments and status of the
<span class="ltx_text ltx_font_italic">GiellaLT</span> Nordic multilingual infrastructure built during the last 20
years. In recent years, we have added more support for speech technologies,
and keyboards for various platforms such as mobile devices and modern operating
systems.
</p>
</div>
<div id="S4.p2" class="ltx_para">
<p class="ltx_p">The <span class="ltx_text ltx_font_italic">GiellaLT</span> infrastructure contains building blocks and support for
most of the language technology needs of indigenous and minority languages, from
the very basic input technologies like keyboards to high-level advanced tools
like world-class grammar checking and machine translation. It does this by using
rule-based technologies that make it possible for any language community to get
the language technology tools they want and need. All that is needed is a
linguist.</p>
</div>
<div id="S4.p3" class="ltx_para">
<p class="ltx_p">We discussed the ways for long-term maintenance of linguistic data and software
tools for NLP of Nordic and minority languages. We showed some best current
practices and workflows on how to maintain the lexicons and keep end user tools
unbroken and still up-to-date.</p>
</div>
<div id="S4.p4" class="ltx_para">
<p class="ltx_p">In conclusion, building corpora requires considerable effort and expertise and
is time-consuming. We have illuminated the work behind three important steps
within building corpora—firstly, collecting and digitalising, secondly
upgrading, i.e. adding annotation for special purposes, and proofreading, and
thirdly converting from one medium/language to another as in recording speech,
translating, or other.</p>
</div>
<div id="S4.p5" class="ltx_para">
<p class="ltx_p">With our multilingual infrastructure and our language resources we show that
while there is a need for corpus data for certain tasks, high quality tools
needed by a language community can be built time-efficiently without big data in
a rule-based manner.</p>
</div>
</section>
<section id="bib" class="ltx_bibliography">
<h2 class="ltx_title ltx_title_bibliography">References</h2>
<ul id="bib.L1" class="ltx_biblist">
<li id="bib.bib11" class="ltx_bibitem ltx_bib_inproceedings">
<span class="ltx_tag ltx_bib_key ltx_role_refnum ltx_tag_bibitem">[1]</span>
<span class="ltx_bibblock"><span class="ltx_text ltx_bib_author">L. Antonsen, L. Wiechetek, and T. Trosterud</span><span class="ltx_text ltx_bib_year"> (2010)</span>
</span>
<span class="ltx_bibblock"><span class="ltx_text ltx_bib_title">Reusing grammatical resources for new languages</span>.
</span>
<span class="ltx_bibblock">In <span class="ltx_text ltx_bib_inbook">Proceedings of the 7th International Conference on Language Resources and Evaluation (LREC 2010)</span>,
</span>
<span class="ltx_bibblock"><span class="ltx_text ltx_bib_place">Stroudsburg</span>, <span class="ltx_text ltx_bib_pages"> pp. 2782–2789</span>.
</span>
<span class="ltx_bibblock ltx_bib_cited">Cited by: <a href="#S2.p6" title="2 Background ‣ GiellaLT — a stable infrastructure for Nordic minority languages and beyond" class="ltx_ref"><span class="ltx_text ltx_ref_tag">§2</span></a>.
</span>
</li>
<li id="bib.bib30" class="ltx_bibitem ltx_bib_book">
<span class="ltx_tag ltx_bib_key ltx_role_refnum ltx_tag_bibitem">[2]</span>
<span class="ltx_bibblock"><span class="ltx_text ltx_bib_author">K. R. Beesley and L. Karttunen</span><span class="ltx_text ltx_bib_year"> (2003)</span>
</span>
<span class="ltx_bibblock"><span class="ltx_text ltx_bib_title">Finite state morphology</span>.
</span>
<span class="ltx_bibblock"> <span class="ltx_text ltx_bib_publisher">CSLI publications</span>.
</span>
<span class="ltx_bibblock">External Links: <span class="ltx_text ltx_bib_links"><span class="ltx_text isbn ltx_bib_external">ISBN 978-1575864341</span></span>
</span>
<span class="ltx_bibblock ltx_bib_cited">Cited by: <a href="#S2.SS2.p1" title="2.2 Methods ‣ 2 Background ‣ GiellaLT — a stable infrastructure for Nordic minority languages and beyond" class="ltx_ref"><span class="ltx_text ltx_ref_tag">§2.2</span></a>.
</span>
</li>
<li id="bib.bib47" class="ltx_bibitem ltx_bib_inproceedings">
<span class="ltx_tag ltx_bib_key ltx_role_refnum ltx_tag_bibitem">[3]</span>
<span class="ltx_bibblock"><span class="ltx_text ltx_bib_author">L. Borin, M. Forsberg, and J. Roxendal</span><span class="ltx_text ltx_bib_year"> (2012)</span>
</span>
<span class="ltx_bibblock"><span class="ltx_text ltx_bib_title">Korp — the corpus infrastructure of Språkbanken</span>.
</span>
<span class="ltx_bibblock">In <span class="ltx_text ltx_bib_inbook">LREC</span>,
</span>
<span class="ltx_bibblock">Vol. <span class="ltx_text ltx_bib_volume">2012</span>, <span class="ltx_text ltx_bib_pages"> pp. 474–478</span>.
</span>
<span class="ltx_bibblock ltx_bib_cited">Cited by: <a href="#S2.SS1.p2" title="2.1 Tools ‣ 2 Background ‣ GiellaLT — a stable infrastructure for Nordic minority languages and beyond" class="ltx_ref"><span class="ltx_text ltx_ref_tag">§2.1</span></a>.
</span>
</li>
<li id="bib.bib72" class="ltx_bibitem ltx_bib_manual">
<span class="ltx_tag ltx_bib_key ltx_role_refnum ltx_tag_bibitem">[4]</span>
<span class="ltx_bibblock"><span class="ltx_text ltx_bib_author">T. Didriksen</span><span class="ltx_text ltx_bib_year"> (2010)</span>
</span>
<span class="ltx_bibblock"><span class="ltx_text ltx_bib_title">Constraint grammar manual: 3rd version of the CG formalism variant</span>.
</span>
<span class="ltx_bibblock"> <span class="ltx_text ltx_bib_publisher">GrammarSoft ApS</span>, <span class="ltx_text ltx_bib_place">Denmark</span>.