From 69aa33e6d3e2a6ae3d838dee0a9769c1187e4c36 Mon Sep 17 00:00:00 2001 From: jakoble <37188634+jakoble@users.noreply.github.com> Date: Tue, 26 Sep 2023 12:06:08 +0200 Subject: [PATCH] Update 1-Reference corpora in the CLARIN infrastructure.csv --- ...e corpora in the CLARIN infrastructure.csv | 60 ++++++++++--------- 1 file changed, 31 insertions(+), 29 deletions(-) diff --git a/resource_families/Corpora/Referenca corpora/1-Reference corpora in the CLARIN infrastructure.csv b/resource_families/Corpora/Referenca corpora/1-Reference corpora in the CLARIN infrastructure.csv index b367dcb..b780e0e 100644 --- a/resource_families/Corpora/Referenca corpora/1-Reference corpora in the CLARIN infrastructure.csv +++ b/resource_families/Corpora/Referenca corpora/1-Reference corpora in the CLARIN infrastructure.csv @@ -1,30 +1,32 @@ "Corpus";"Corpus_URL";"Language";"Size";"Annotation";"Licence";"Licence_URL";"Description";"Family";"Buttons";"Buttons_URL";"Publication";"Publication_URL";"Note" -"AbNC: Abkhaz National Corpus";"https://clarino.uib.no/abnc/page";"Abkhaz";"10 million words";"MSD-tagged, lemmatized";"CLARIN_PUB-BY-NC-ND";;"This corpus includes Abkhaz texts published between 1920 and 2016. The corpus is encoded in TEI.#SEPThe corpus is available for online browsing through the Corpuscle concordancer (CLARINO distribution).";"Referenca corpora";"Concordancer";"https://clarino.uib.no/abnc";"Meurer (2018)";"https://www.clarin.eu/resource-families/reference-corpora#Meurer%202018"; -"Bulgarian National Reference Corpus (BNRC)";"http://webclark.org/?locale=en";"Bulgarian";"70 million tokens";"tokenized, PoS-tagged";"Individual terms of agreement";;"This corpus includes Bulgarian texts taken from news media, literature, and administrative documents between 1997 and 2002.#SEPThe tokenised corpus is available through WebCLaRK, while the PoS-tagged version is available only upon request.";"Referenca corpora";"Concordancer";"https://webclark.org/?locale=en";"Simov et al. 
(2004)";"https://www.clarin.eu/resource-families/reference-corpora#Kiril%20et%20al.%202004"; -"Croatian language corpus Riznica 0.1";"http://hdl.handle.net/11356/1180";"Croatian";"101.8 million tokens, 85.3 million words, 4.7 million sentences, 14,781 texts";"sentence segmented, PoS-tagged, lemmatized";"CC BY-NC-SA 4.0";;"This corpus includes Croatian texts taken from fiction (28%) and specialised texts (72%).#SEPThe corpus is available for online browsing via noSketch Engine and KonText and for download from the CLARIN.SI repository.";"Referenca corpora";"noSketchEngine#SEPKonText#SEPDownload";"https://www.clarin.si/noske/run.cgi/corp_info?corpname=riznica#SEPhttps://www.clarin.si/kontext/first_form?corpname=riznica#SEPhttp://hdl.handle.net/11356/1180";"Ćavar and Brozović Rončević (2012)";"https://www.clarin.eu/resource-families/reference-corpora#%C4%86avar%20and%20Brozovi%C4%87%20Ron%C4%8Devi%C4%87%202012"; -"Croatian National Corpus";"http://hdl.handle.net/11372/LRT-233";"Croatian";"101 million tokens";;;;"This corpus includes Croatian texts taken from newspapers, magazines, popular texts, and fiction.#SEPThe corpus is available for online browsing through the noSketch Engine.";"Referenca corpora";"Concordancer";"http://filip.ffzg.hr/cgi-bin/run.cgi/first_form";"Tadić (2002)";"https://www.clarin.eu/resource-families/reference-corpora#Tadi%C4%87%202002"; -"SYN2005: balanced corpus of written Czech";"http://hdl.handle.net/11858/00-097C-0000-0023-119E-8";"Czech";"100 million words";"MSD-tagged, lemmatized";"Czech National Corpus (Shuffled Corpus Data)";;"This corpus includes Czech texts published between 2000 and 2004. The corpus is encoded in XML.#SEPThe corpus is available for online browsing through the KonText concordancer and can be downloaded from the LINDAT repository.";"Referenca corpora";"Concordancer#SEPDownload";"https://kontext.korpus.cz/first_form?corpname=syn2005#SEPhttp://hdl.handle.net/11858/00-097C-0000-0023-119E-8";"Hnátková et al. 
(2014)";"https://www.clarin.eu/resource-families/reference-corpora#Hn%C3%A1tkov%C3%A1%20et%20al.%202014"; -"SYN2010: balanced corpus of written Czech";"http://hdl.handle.net/11858/00-097C-0000-0023-119F-6";"Czech";"100 million words";"MSD-tagged, lemmatized";"Czech National Corpus (Shuffled Corpus Data)";;"This corpus includes Czech fiction, professional literature, newspapers etc. published between 2005 and 2009. The corpus is encoded in XML.#SEPThe corpus is available for online browsing through the KonText concordancer and can be downloaded from the LINDAT repository.";"Referenca corpora";"Concordancer#SEPDownload";"https://kontext.korpus.cz/first_form?corpname=syn2010#SEPhttp://hdl.handle.net/11858/00-097C-0000-0023-119F-6";"Hnátková et al. (2014)";"https://www.clarin.eu/resource-families/reference-corpora#Hn%C3%A1tkov%C3%A1%20et%20al.%202014"; -"SYN2015: representative corpus of written Czech";"http://hdl.handle.net/11234/1-1593";"Czech";"100 million words";"MSD-tagged, lemmatized";"Czech National Corpus (Shuffled Corpus Data)";;"This corpus includes Czech fiction, professional literature, newspapers etc. published between 2010 and 2014. The corpus is encoded in XML.#SEPThe corpus is available for online browsing through the KonText concordancer and can be downloaded from the LINDAT repository.";"Referenca corpora";"Concordancer#SEPDownload";"https://kontext.korpus.cz/first_form?corpname=syn2015#SEPhttp://hdl.handle.net/11234/1-1593";"Hnátková et al. (2014)";"https://www.clarin.eu/resource-families/reference-corpora#Hn%C3%A1tkov%C3%A1%20et%20al.%202014"; -"DK-CLARIN Reference Corpus of General Danish";"http://hdl.handle.net/20.500.12115/36";"Danish";"45.1 million words";"PoS-tagged, sentence and paragraph segmentation, lemmatized";"CLARIN ACA-NC";;"This corpus includes Danish texts published between 2008 and 2011.#SEPThe corpus is encoded in TEI. 
Non-linguistic metadata includes information on source and year of publication.#SEPThe corpus is available for download from the CLARIN-DK repository.";"Referenca corpora";"Download";"http://hdl.handle.net/20.500.12115/36";;; -"SoNaR";"http://hdl.handle.net/10032/tm-a2-h5";"Dutch";"500 million words";"PoS-tagged, lemmatized, named entities; coreference annotation and annotation of spatial and temporal relations for the manually annotated SoNaR-1 subset ";"Terms of Agreement";;"This corpus includes representative Dutch texts (fiction, brochures, magazines, legal texts, newspapers, parliamentary proceedings, and computer-mediated communication).#SEPAside from written materials, the corpus also contains transcriptions of spoken language. The corpus is encoded in FoLiA.#SEPThe corpus is available for online browsing through the OpenSONAR concordancer and can be downloaded from the Dutch Language Institute (CLARIAH-NL).";"Referenca corpora";"Concordancer#SEPDownload subset 1#SEPDownload subset 2";"https://portal.clarin.nl/node/4195#SEPhttp://hdl.handle.net/10032/tm-a2-c7#SEPhttp://hdl.handle.net/10032/tm-a2-h5 ";;; -"Corpus of Contemporary American English – Kielipankki version";"http://urn.fi/urn:nbn:fi:lb-2019031901";"English (American)";"440 million words, 190,000 texts";"PoS-tagged, lemmatized";"CLARIN ACA (online version), CLARIN RES (downloadable version)";;"This corpus includes American English texts evenly divided into the spoken, fiction, magazine, newspaper, and academic genres (around 88 million words each) published between 1990 and 2012.#SEPThe corpus is available for download from the Finnish Language Bank as well as for online browsing through the concordancer Korp (FIN-CLARIN distribution).";"Referenca corpora";"Concordancer#SEPDownload";"http://urn.fi/urn:nbn:fi:lb-2017061933#SEPhttp://urn.fi/urn:nbn:fi:lb-2019031901";;; -"British National Corpus";"http://hdl.handle.net/20.500.12024/2554";"English (British)";"100 million words";"PoS-tagged, 
lemmatized";"BNC User Licence (restricted for the downloadable version)";;"This corpus includes English texts (fiction, magazines, newspapers, and academic writing) published between 1980 and 1993.#SEPThe corpus is encoded in TEI. Non-linguistic metadata include contextual and bibliographic information. Aside from written materials, the corpus also includes transcriptions of spoken language.#SEPThe corpus is available for online browsing through a dedicated concordancer and can be downloaded from the Oxford Text Archive (CLARIN-UK).";"Referenca corpora";"Concordancer#SEPDownload";"https://www.natcorp.ox.ac.uk/#SEPhttp://hdl.handle.net/20.500.12024/2554";;; -"Estonian National Corpus 2019";"https://hdl.handle.net/10.15155/3-00-0000-0000-0000-08489L";"Estonian";"1.5 billion words";"MSD-tagged, lemmatized";"CC-BY-SA";;"This corpus includes Estonian texts published between 1990 and 2019. Amongst others, this corpus contains the Estonian Reference Corpus as a subcorpus.#SEPThe corpus is available for download from META-SHARE (CELR distribution).";"Referenca corpora";"Download";"http://hdl.handle.net/10.15155/3-00-0000-0000-0000-08489L";;; -"Estonian Reference Corpus";"http://hdl.handle.net/11372/LRT-1068";"Estonian";"175 million words";"MSD-tagged, lemmatized";"free for non-commercial use";;"This corpus includes Estonian texts (fiction, PhD theses, newspapers, magazines, parliamentary transcriptions, computer-mediated communication) published between 1990 and 2007. The corpus is encoded in TEI.#SEPThe corpus is available for online browsing through a dedicated concordancer and is available for download from CELR.";"Referenca corpora";"Concordancer#SEPDownload";"https://www.keeleveeb.ee/#SEPhttp://www.cl.ut.ee/korpused/segakorpus/";;; -"DeReKo";"http://www1.ids-mannheim.de/kl/projekte/korpora/";"German";"31.7 billion words";"MSD-tagged, lemmatized";"CC-BY-SA";;"This corpus includes German texts in a wide variety of genres published from 1947 onwards. 
Non-linguistic metadata include rich bibliographic information and partial layout information.#SEPPart of the corpus is available for download from a dedicated webpage (CLARIN-D distribution), while the entire corpus can be queried online through the COSMAS II platform.";"Referenca corpora";"Concordancer#SEPDownload";"https://www1.ids-mannheim.de/kl/projekte/korpora/verfuegbarkeit.html#SEPhttp://www1.ids-mannheim.de/kl/projekte/korpora/verfuegbarkeit.html";"Kupietz et al. (2018)";"https://www.clarin.eu/resource-families/reference-corpora#Kupietz%20et%20al.%202018"; -"Corpus of Greek Texts";"http://hdl.grnet.gr/11500/UOA-0000-0000-2471-8";"Greek";"27.6 million words";;"CC-BY-NC, ACA";;"This corpus includes representative Greek texts published between 1990 and 2010. Aside from written materials, the corpus also includes transcriptions of spoken language.#SEPThe corpus is available for online browsing through a dedicated concordancer.";"Referenca corpora";"Concordancer";"https://www.sek.edu.gr/";"Goutsos (2010)";"https://www.clarin.eu/resource-families/reference-corpora#Goutsos%202010"; -"Diachronic corpus of Greek of the 20th century";"http://hdl.grnet.gr/11500/UOA-0000-0000-2572-6";"Greek";"20 million words";;"CC BY-NC";;"This corpus includes Greek texts published in the 20th century.#SEPThe corpus is available for download from CLARIN:EL.";"Referenca corpora";"Download";"http://hdl.grnet.gr/11500/UOA-0000-0000-2572-6";;; -"Hellenic National Corpus";"http://hdl.grnet.gr/11500/ATHENA-0000-0000-23E2-9";"Greek";"47 million words";"sentence segmented";"proprietary";;"This corpus includes Greek texts published from 1990 onwards.#SEPThe corpus is available for online browsing through a dedicated concordancer.";"Referenca corpora";"Concordancer";"https://hnc.ilsp.gr/index.php?current_page=main";"Gavrilidou (2002)";"https://www.clarin.eu/resource-families/reference-corpora#Gavrilidou%202002"; -"Hungarian National 
Corpus";"https://hdl.handle.net/11372/LRT-345";"Hungarian";"190 million tokens";"PoS-tagged";"free after registration";;"This corpus includes Hungarian texts (newspapers, literature, scientific articles, official and personal documents).#SEPThe corpus is available for online browsing through a dedicated concordancer.";"Referenca corpora";"Concordancer";"https://corpus.nytud.hu/mnsz/";"Váradi (2002)";"http://www.lrec-conf.org/proceedings/lrec2002/sumarios/217.htm"; -"The Icelandic Gigaword Corpus";"http://hdl.handle.net/20.500.12537/192";"Icelandic";"1.9 billion words";"MSD-tagged, lemmatized";"CC-BY and a special user licence";;"This corpus includes Icelandic texts (newspapers, parliamentary proceedings, adjudications, fiction and non-fiction) published until 2017.#SEPThe corpus is encoded in TEI. Non-linguistic metadata include bibliographic information. Aside from written materials, the corpus also contains transcriptions of spoken language.#SEPThe corpus is available for online browsing and download through CLARIN-IS (in two subsets, each with its own licence).";"Referenca corpora";"Concordancer#SEPDownload subset 1#SEPDownload subset 2";"https://clarin.is/en/resources/gigaword/#SEPhttp://hdl.handle.net/20.500.12537/41#SEPhttp://hdl.handle.net/20.500.12537/33";"Steingrímsson et al. 
(2018)";"https://www.clarin.eu/resource-families/reference-corpora#Steingr%C3%ADmsson%20et%20al.%202018"; -"Balanced Corpus of Modern Latvian (LVK2022)";"http://hdl.handle.net/20.500.12574/84";"Latvian";"122.9 million tokens";"MSD-tagged, lemmatized";;;"This corpus includes texts from journalism, fiction, science, Wikipedia, legal documents, parliamentary subscripts, and subtitles.#SEPThe corpus is available for online browsing through the noSketch Engine concordancer.";"Referenca corpora";"Concordancer";"https://nosketch.korpuss.lv/#dashboard?corpname=LVK2022";;; -"Corpus of the Contemporary Lithuanian Language";"http://hdl.handle.net/20.500.11821/16";"Lithuanian";"208.4 million tokens";"MSD-tagged, lemmatized";"CLARIN RES";;"This corpus includes Lithuanian texts (mostly newspapers but also fiction, non-fiction, and specialised magazines) published between 1990 and 2008.#SEPThe corpus is encoded in TEI. Non-linguistic metadata includes bibliographic information. Aside from written materials, the corpus also contains transcriptions of spoken language.#SEPThe corpus is available for online browsing through a dedicated concordancer.";"Referenca corpora";"Concordancer";"https://corpus.vdu.lt/";;; -"The Lexicographic Corpus for Norwegian Bokmål (LBK)";"https://www.hf.uio.no/iln/om/organisasjon/tekstlab/prosjekter/lbk/";"Norwegian (Bokmål)";"100 million tokens";"PoS-tagged, lemmatized";"CLARIN_ACA-NC-LOC-ND";;"This corpus includes representative Norwegian (Bokmål) texts (newspapers and periodicals, non-fiction, fiction, TV subtitles, and small print) published between 1985 and 2013.#SEPThe corpus is available for online browsing through the concordancer Glossa (CLARINO).";"Referenca corpora";"Concordancer";"https://tekstlab.uio.no/glossa2/bokmal";"Lain Knudsen and Vatvedt Fjeld (2013)";"https://www.clarin.eu/resource-families/reference-corpora#Lain%20Knudsen%20and%20Vatvedt%20Fjeld%202013"; -"Norsk Ordboks Nynorskkorpus 
(NNK)";"http://hdl.handle.net/11495/E1A3-9361-1821-1";"Norwegian (Nynorsk)";"107.8 million words";"MSD-tagged, lemmatized";"CLARIN_RES-NC-DEP";;"This corpus includes representative Norwegian (Nynorsk) texts published between 1866 and 2012. The corpus is encoded in XML.#SEPThe corpus is available for online browsing through the Corpuscle concordancer (CLARINO).";"Referenca corpora";"Concordancer";"https://clarino.uib.no/korpuskel/metadata?identifier=nnk";;; -"National Corpus of Polish";"http://hdl.handle.net/11372/LRT-676";"Polish";"1.8 billion tokens";"MSD-tagged, lemmatized";;;"This is a written and spoken corpus that includes representative Polish texts published between 1945 and 2010.#SEPThe corpus is encoded in TEI. Non-linguistic metadata includes information on source, year of publication, text type, title, author. Aside from written materials, the corpus also includes transcriptions of spoken language.#SEPThe corpus is available for online browsing through a dedicated concordancer.";"Referenca corpora";"Concordancer";"https://nkjp.pl/index.php?page=3&lang=1";"Przepiórkowski et al. (2012)";"https://www.clarin.eu/resource-families/reference-corpora#Przepi%C3%B3rkowski%20et%20al.%202012"; -"Written corpus ccGigafida 1.0";"http://hdl.handle.net/11356/1035";"Slovenian";"126.9 million tokens, 103.2 million words, 31,722 texts";"MSD-tagged, lemmatized";"CC-BY-NC-SA 4.0";;"This corpus includes representative Slovenian texts (newspapers, magazines, computer-mediated communication, fiction and non-fiction) published between 1990 and 2011. The corpus is encoded in TEI. Non-linguistic metadata includes information on source, year of publication, text type, title, author.#SEPThis corpus is a downloadable subset of the representative Gigafida corpus (version 1). 
It can be downloaded from the CLARIN.SI repository.";"Referenca corpora";"Download";"http://hdl.handle.net/11356/1035";"Erjavec and Logar (2012)";"https://www.clarin.eu/resource-families/reference-corpora#Erjavec%20and%20Logar%20Berginc%202012"; -"Written corpus ccKres 1.0";"http://hdl.handle.net/11356/1034";"Slovenian";"12.2 million tokens, 9.8 million words";"MSD-tagged, lemmatized";"CC-BY";;"This corpus includes balanced Slovenian texts (newspapers, magazines, computer-mediated communication, fiction and non-fiction) published between 1990 and 2011. The corpus is encoded in TEI. Non-linguistic metadata includes information on source, year of publication, text type, title, author.#SEPThis corpus is a downloadable subset of the balanced Kres corpus. It can be downloaded from the CLARIN.SI repository.";"Referenca corpora";"Download";"http://www.korpus-kres.net/";"Erjavec and Logar (2012)";"https://www.clarin.eu/resource-families/reference-corpora#Erjavec%20and%20Logar%20Berginc%202012"; -"Written corpus Gigafida 2.0";"http://hdl.handle.net/11356/1320";"Slovenian";"1.3 billion tokens, 1.1 billion words, 38,310 texts";"MSD-tagged, lemmatized";"Individual terms of agreement";;"This corpus includes representative Slovenian texts (newspapers, magazines, computer-mediated communication, fiction and non-fiction) published between 1990 and 2018. The corpus is encoded in TEI. Non-linguistic metadata includes information on source, year of publication, text type, title, author.#SEPThe corpus is available for online browsing through the noSketch Engine concordancer (CLARIN.SI distribution), as well as through a dedicated search engine.";"Referenca corpora";"noSketchEngine#SEPConcordancer";"https://www.clarin.si/noske/run.cgi/corp_info?corpname=gfida20_dedup&struct_attr_stats=1#SEPhttps://viri.cjvt.si/gigafida/";"Krek et al. 
(2018)";"https://www.clarin.eu/resource-families/reference-corpora#Krek%20et%20al.%202016"; -"Written corpus Kres 1.0";"http://www.korpus-kres.net/";"Slovenian";"99 million words";"MSD-tagged, lemmatized";"Individual terms of agreement";;"This corpus includes balanced Slovenian texts (newspapers, magazines, computer-mediated communication, fiction and non-fiction) published between 1990 and 2011.#SEPThis corpus is a balanced subset of the representative Gigafida corpus (version 1). The corpus is encoded in TEI. Non-linguistic metadata includes information on source, year of publication, text type, title, author.#SEPThe corpus is available for online browsing through a dedicated concordancer.";"Referenca corpora";"Concordancer";"https://www.korpus-kres.net/";"Krek et al. (2018)";"https://www.clarin.eu/resource-families/reference-corpora#Krek%20et%20al.%202016"; -"CorCenCC: Corpws Cenedlaethol Cymraeg Cyfoes – the National Corpus of Contemporary Welsh";"http://hdl.handle.net/20.500.12024/2564";"Welsh";"11 million words";;"CC BY-NC-SA 4.0";;"This corpus contains spoken, written and digital (e-language) Welsh. The corpus is accompanied by an online teaching and learning toolkit – Y Tiwtiadur – which draws directly on the data from the corpus to provide resources for Welsh language learning at all ages and levels.#SEPThe corpus is available for online browsing through a dedicated webpage and by request.";"Referenca corpora";"Request#SEPConcordancer";"https://research.cardiff.ac.uk/converis/portal/detail/Dataset/119878310?auxfun=&lang=en_GB#SEPhttps://www.corcencc.org/explore";"Dawn et al. (2020)";"https://www.clarin.eu/resource-families/reference-corpora#Dawn%20et%20al.%202020"; +"AbNC: Abkhaz National Corpus";"https://clarino.uib.no/abnc/page";"Abkhaz";"10 million words";"MSD-tagged, lemmatized";"CLARIN_PUB-BY-NC-ND";;"This corpus includes Abkhaz texts published between 1920 and 2016. 
The corpus is encoded in TEI.#SEPThe corpus is available for online browsing through the Corpuscle concordancer (CLARINO distribution).";"Reference corpora";"Concordancer";"https://clarino.uib.no/abnc";"Meurer (2018)";"https://www.clarin.eu/resource-families/reference-corpora#Meurer%202018"; +"Bulgarian National Reference Corpus (BNRC)";"http://webclark.org/?locale=en";"Bulgarian";"70 million tokens";"tokenized, PoS-tagged";"Individual terms of agreement";;"This corpus includes Bulgarian texts taken from news media, literature, and administrative documents between 1997 and 2002.#SEPThe tokenised corpus is available through WebCLaRK, while the PoS-tagged version is available only upon request.";"Reference corpora";"Concordancer";"https://webclark.org/?locale=en";"Simov et al. (2004)";"https://www.clarin.eu/resource-families/reference-corpora#Kiril%20et%20al.%202004"; +"Croatian language corpus Riznica 0.1";"http://hdl.handle.net/11356/1180";"Croatian";"101.8 million tokens, 85.3 million words, 4.7 million sentences, 14,781 texts";"sentence segmented, PoS-tagged, lemmatized";"CC BY-NC-SA 4.0";;"This corpus includes Croatian texts taken from fiction (28%) and specialised texts (72%).#SEPThe corpus is available for online browsing via noSketch Engine and KonText and for download from the CLARIN.SI repository.";"Reference corpora";"noSketchEngine#SEPKonText#SEPDownload";"https://www.clarin.si/noske/run.cgi/corp_info?corpname=riznica#SEPhttps://www.clarin.si/kontext/first_form?corpname=riznica#SEPhttp://hdl.handle.net/11356/1180";"Ćavar and Brozović Rončević (2012)";"https://www.clarin.eu/resource-families/reference-corpora#%C4%86avar%20and%20Brozovi%C4%87%20Ron%C4%8Devi%C4%87%202012"; +"Croatian National Corpus";"http://hdl.handle.net/11372/LRT-233";"Croatian";"101 million tokens";;;;"This corpus includes Croatian texts taken from newspapers, magazines, popular texts, and fiction.#SEPThe corpus is available for online browsing through the noSketch Engine.";"Reference 
corpora";"Concordancer";"http://filip.ffzg.hr/cgi-bin/run.cgi/first_form";"Tadić (2002)";"https://www.clarin.eu/resource-families/reference-corpora#Tadi%C4%87%202002"; +"SYN2005: balanced corpus of written Czech";"http://hdl.handle.net/11858/00-097C-0000-0023-119E-8";"Czech";"100 million words";"MSD-tagged, lemmatized";"Czech National Corpus (Shuffled Corpus Data)";;"This corpus includes Czech texts published between 2000 and 2004. The corpus is encoded in XML.#SEPThe corpus is available for online browsing through the KonText concordancer and can be downloaded from the LINDAT repository.";"Reference corpora";"Concordancer#SEPDownload";"https://kontext.korpus.cz/first_form?corpname=syn2005#SEPhttp://hdl.handle.net/11858/00-097C-0000-0023-119E-8";"Hnátková et al. (2014)";"https://www.clarin.eu/resource-families/reference-corpora#Hn%C3%A1tkov%C3%A1%20et%20al.%202014"; +"SYN2010: balanced corpus of written Czech";"http://hdl.handle.net/11858/00-097C-0000-0023-119F-6";"Czech";"100 million words";"MSD-tagged, lemmatized";"Czech National Corpus (Shuffled Corpus Data)";;"This corpus includes Czech fiction, professional literature, newspapers etc. published between 2005 and 2009. The corpus is encoded in XML.#SEPThe corpus is available for online browsing through the KonText concordancer and can be downloaded from the LINDAT repository.";"Reference corpora";"Concordancer#SEPDownload";"https://kontext.korpus.cz/first_form?corpname=syn2010#SEPhttp://hdl.handle.net/11858/00-097C-0000-0023-119F-6";"Hnátková et al. (2014)";"https://www.clarin.eu/resource-families/reference-corpora#Hn%C3%A1tkov%C3%A1%20et%20al.%202014"; +"SYN2015: representative corpus of written Czech";"http://hdl.handle.net/11234/1-1593";"Czech";"100 million words";"MSD-tagged, lemmatized";"Czech National Corpus (Shuffled Corpus Data)";;"This corpus includes Czech fiction, professional literature, newspapers etc. published between 2010 and 2014. 
The corpus is encoded in XML.#SEPThe corpus is available for online browsing through the KonText concordancer and can be downloaded from the LINDAT repository.";"Reference corpora";"Concordancer#SEPDownload";"https://kontext.korpus.cz/first_form?corpname=syn2015#SEPhttp://hdl.handle.net/11234/1-1593";"Hnátková et al. (2014)";"https://www.clarin.eu/resource-families/reference-corpora#Hn%C3%A1tkov%C3%A1%20et%20al.%202014"; +"DK-CLARIN Reference Corpus of General Danish";"http://hdl.handle.net/20.500.12115/36";"Danish";"45.1 million words";"PoS-tagged, sentence and paragraph segmentation, lemmatized";"CLARIN ACA-NC";;"This corpus includes Danish texts published between 2008 and 2011.#SEPThe corpus is encoded in TEI. Non-linguistic metadata includes information on source and year of publication.#SEPThe corpus is available for download from the CLARIN-DK repository.";"Reference corpora";"Download";"http://hdl.handle.net/20.500.12115/36";;; +"SoNaR";"http://hdl.handle.net/10032/tm-a2-h5";"Dutch";"500 million words";"PoS-tagged, lemmatized, named entities; coreference annotation and annotation of spatial and temporal relations for the manually annotated SoNaR-1 subset ";"Terms of Agreement";;"This corpus includes representative Dutch texts (fiction, brochures, magazines, legal texts, newspapers, parliamentary proceedings, and computer-mediated communication).#SEPAside from written materials, the corpus also contains transcriptions of spoken language. 
The corpus is encoded in FoLiA.#SEPThe corpus is available for online browsing through the OpenSONAR concordancer and can be downloaded from the Dutch Language Institute (CLARIAH-NL).";"Reference corpora";"Concordancer#SEPDownload subset 1#SEPDownload subset 2";"https://portal.clarin.nl/node/4195#SEPhttp://hdl.handle.net/10032/tm-a2-c7#SEPhttp://hdl.handle.net/10032/tm-a2-h5";;; +"Corpus of Contemporary American English – Kielipankki version";"http://urn.fi/urn:nbn:fi:lb-2019031901";"English (American)";"440 million words, 190,000 texts";"PoS-tagged, lemmatized";"CLARIN ACA (online version), CLARIN RES (downloadable version)";;"This corpus includes American English texts evenly divided into the spoken, fiction, magazine, newspaper, and academic genres (around 88 million words each) published between 1990 and 2012.#SEPThe corpus is available for download from the Finnish Language Bank as well as for online browsing through the concordancer Korp (FIN-CLARIN distribution).";"Reference corpora";"Concordancer#SEPDownload";"http://urn.fi/urn:nbn:fi:lb-2017061933#SEPhttp://urn.fi/urn:nbn:fi:lb-2019031901";;; +"British National Corpus";"http://hdl.handle.net/20.500.12024/2554";"English (British)";"100 million words";"PoS-tagged, lemmatized";"BNC User Licence (restricted for the downloadable version)";;"This corpus includes English texts (fiction, magazines, newspapers, and academic writing) published between 1980 and 1993.#SEPThe corpus is encoded in TEI. Non-linguistic metadata include contextual and bibliographic information. 
Aside from written materials, the corpus also includes transcriptions of spoken language.#SEPThe corpus is available for online browsing through a dedicated concordancer and can be downloaded from the Oxford Text Archive (CLARIN-UK).";"Reference corpora";"Concordancer#SEPDownload";"https://www.natcorp.ox.ac.uk/#SEPhttp://hdl.handle.net/20.500.12024/2554";;; +"Estonian National Corpus 2019";"https://hdl.handle.net/10.15155/3-00-0000-0000-0000-08489L";"Estonian";"1.5 billion words";"MSD-tagged, lemmatized";"CC-BY-SA";;"This corpus includes Estonian texts published between 1990 and 2019. Amongst others, this corpus contains the Estonian Reference Corpus as a subcorpus.#SEPThe corpus is available for download from META-SHARE (CELR distribution).";"Reference corpora";"Download";"http://hdl.handle.net/10.15155/3-00-0000-0000-0000-08489L";;; +"Estonian Reference Corpus";"http://hdl.handle.net/11372/LRT-1068";"Estonian";"175 million words";"MSD-tagged, lemmatized";"free for non-commercial use";;"This corpus includes Estonian texts (fiction, PhD theses, newspapers, magazines, parliamentary transcriptions, computer-mediated communication) published between 1990 and 2007. The corpus is encoded in TEI.#SEPThe corpus is available for online browsing through a dedicated concordancer and is available for download from CELR.";"Reference corpora";"Concordancer#SEPDownload";"https://www.keeleveeb.ee/#SEPhttp://www.cl.ut.ee/korpused/segakorpus/";;; +"DeReKo";"http://www1.ids-mannheim.de/kl/projekte/korpora/";"German";"31.7 billion words";"MSD-tagged, lemmatized";"CC-BY-SA";;"This corpus includes German texts in a wide variety of genres published from 1947 onwards. 
Non-linguistic metadata include rich bibliographic information and partial layout information.#SEPPart of the corpus is available for download from a dedicated webpage (CLARIN-D distribution), while the entire corpus can be queried online through the COSMAS II platform.";"Reference corpora";"Concordancer#SEPDownload";"https://www1.ids-mannheim.de/kl/projekte/korpora/verfuegbarkeit.html#SEPhttp://www1.ids-mannheim.de/kl/projekte/korpora/verfuegbarkeit.html";"Kupietz et al. (2018)";"https://www.clarin.eu/resource-families/reference-corpora#Kupietz%20et%20al.%202018"; +"Corpus of Greek Texts";"http://hdl.grnet.gr/11500/UOA-0000-0000-2471-8";"Greek";"27.6 million words";;"CC-BY-NC, ACA";;"This corpus includes representative Greek texts published between 1990 and 2010. Aside from written materials, the corpus also includes transcriptions of spoken language.#SEPThe corpus is available for online browsing through a dedicated concordancer.";"Reference corpora";"Concordancer";"https://www.sek.edu.gr/";"Goutsos (2010)";"https://www.clarin.eu/resource-families/reference-corpora#Goutsos%202010"; +"Diachronic corpus of Greek of the 20th century";"http://hdl.grnet.gr/11500/UOA-0000-0000-2572-6";"Greek";"20 million words";;"CC BY-NC";;"This corpus includes Greek texts published in the 20th century.#SEPThe corpus is available for download from CLARIN:EL.";"Reference corpora";"Download";"http://hdl.grnet.gr/11500/UOA-0000-0000-2572-6";;; +"Hellenic National Corpus";"http://hdl.grnet.gr/11500/ATHENA-0000-0000-23E2-9";"Greek";"47 million words";"sentence segmented";"proprietary";;"This corpus includes Greek texts published from 1990 onwards.#SEPThe corpus is available for online browsing through a dedicated concordancer.";"Reference corpora";"Concordancer";"https://hnc.ilsp.gr/index.php?current_page=main";"Gavrilidou (2002)";"https://www.clarin.eu/resource-families/reference-corpora#Gavrilidou%202002"; +"Hungarian National 
Corpus";"https://hdl.handle.net/11372/LRT-345";"Hungarian";"190 million tokens";"PoS-tagged";"free after registration";;"This corpus includes Hungarian texts (newspapers, literature, scientific articles, official and personal documents).#SEPThe corpus is available for online browsing through a dedicated concordancer.";"Reference corpora";"Concordancer";"https://corpus.nytud.hu/mnsz/";"Váradi (2002)";"http://www.lrec-conf.org/proceedings/lrec2002/sumarios/217.htm"; +"The Icelandic Gigaword Corpus";"http://hdl.handle.net/20.500.12537/192";"Icelandic";"1.9 billion words";"MSD-tagged, lemmatized";"CC-BY and a special user licence";;"This corpus includes Icelandic texts (newspapers, parliamentary proceedings, adjudications, fiction and non-fiction) published until 2017.#SEPThe corpus is encoded in TEI. Non-linguistic metadata include bibliographic information. Aside from written materials, the corpus also contains transcriptions of spoken language.#SEPThe corpus is available for online browsing and download through CLARIN-IS (in two subsets, each with its own licence).";"Reference corpora";"Concordancer#SEPDownload subset 1#SEPDownload subset 2";"https://clarin.is/en/resources/gigaword/#SEPhttp://hdl.handle.net/20.500.12537/41#SEPhttp://hdl.handle.net/20.500.12537/33";"Steingrímsson et al. 
(2018)";"https://www.clarin.eu/resource-families/reference-corpora#Steingr%C3%ADmsson%20et%20al.%202018"; +"Balanced Corpus of Modern Latvian (LVK2022)";"http://hdl.handle.net/20.500.12574/84";"Latvian";"122.9 million tokens";"MSD-tagged, lemmatized";;;"This corpus includes texts from journalism, fiction, science, Wikipedia, legal documents, parliamentary transcripts, and subtitles.#SEPThe corpus is available for online browsing through the noSketch Engine concordancer.";"Reference corpora";"Concordancer";"https://nosketch.korpuss.lv/#dashboard?corpname=LVK2022";;; +"Corpus of the Contemporary Lithuanian Language";"http://hdl.handle.net/20.500.11821/16";"Lithuanian";"208.4 million tokens";"MSD-tagged, lemmatized";"CLARIN RES";;"This corpus includes Lithuanian texts (mostly newspapers but also fiction, non-fiction, and specialised magazines) published between 1990 and 2008.#SEPThe corpus is encoded in TEI. Non-linguistic metadata includes bibliographic information. Aside from written materials, the corpus also contains transcriptions of spoken language.#SEPThe corpus is available for online browsing through a dedicated concordancer.";"Reference corpora";"Concordancer";"https://corpus.vdu.lt/";;; +"The Lexicographic Corpus for Norwegian Bokmål (LBK)";"https://www.hf.uio.no/iln/om/organisasjon/tekstlab/prosjekter/lbk/";"Norwegian (Bokmål)";"100 million tokens";"PoS-tagged, lemmatized";"CLARIN_ACA-NC-LOC-ND";;"This corpus includes representative Norwegian (Bokmål) texts (newspapers and periodicals, non-fiction, fiction, TV subtitles, and small print) published between 1985 and 2013.#SEPThe corpus is available for online browsing through the concordancer Glossa (CLARINO).";"Reference corpora";"Concordancer";"https://tekstlab.uio.no/glossa2/bokmal";"Lain Knudsen and Vatvedt Fjeld (2013)";"https://www.clarin.eu/resource-families/reference-corpora#Lain%20Knudsen%20and%20Vatvedt%20Fjeld%202013"; +"Norsk Ordboks Nynorskkorpus 
(NNK)";"http://hdl.handle.net/11495/E1A3-9361-1821-1";"Norwegian (Nynorsk)";"107.8 million words";"MSD-tagged, lemmatized";"CLARIN_RES-NC-DEP";;"This corpus includes representative Norwegian (Nynorsk) texts published between 1866 and 2012. The corpus is encoded in XML.#SEPThe corpus is available for online browsing through the Corpuscle concordancer (CLARINO).";"Reference corpora";"Concordancer";"https://clarino.uib.no/korpuskel/metadata?identifier=nnk";;; +"National Corpus of Polish";"http://hdl.handle.net/11372/LRT-676";"Polish";"1.8 billion tokens";"MSD-tagged, lemmatized";;;"This is a written and spoken corpus that includes representative Polish texts published between 1945 and 2010.#SEPThe corpus is encoded in TEI. Non-linguistic metadata includes information on source, year of publication, text type, title, author. Aside from written materials, the corpus also includes transcriptions of spoken language.#SEPThe corpus is available for online browsing through a dedicated concordancer.";"Reference corpora";"Concordancer";"https://nkjp.pl/index.php?page=3&lang=1";"Przepiórkowski et al. (2012)";"https://www.clarin.eu/resource-families/reference-corpora#Przepi%C3%B3rkowski%20et%20al.%202012"; +"Written corpus ccGigafida 1.0";"http://hdl.handle.net/11356/1035";"Slovenian";"126.9 million tokens, 103.2 million words, 31,722 texts";"MSD-tagged, lemmatized";"CC-BY-NC-SA 4.0";;"This corpus includes representative Slovenian texts (newspapers, magazines, computer-mediated communication, fiction and non-fiction) published between 1990 and 2011. The corpus is encoded in TEI. Non-linguistic metadata includes information on source, year of publication, text type, title, author.#SEPThis corpus is a downloadable subset of the representative Gigafida corpus (version 1). 
It can be downloaded from the CLARIN.SI repository.";"Reference corpora";"Download";"http://hdl.handle.net/11356/1035";"Erjavec and Logar (2012)";"https://www.clarin.eu/resource-families/reference-corpora#Erjavec%20and%20Logar%20Berginc%202012"; +"Written corpus ccKres 1.0";"http://hdl.handle.net/11356/1034";"Slovenian";"12.2 million tokens, 9.8 million words";"MSD-tagged, lemmatized";"CC-BY";;"This corpus includes balanced Slovenian texts (newspapers, magazines, computer-mediated communication, fiction and non-fiction) published between 1990 and 2011. The corpus is encoded in TEI. Non-linguistic metadata includes information on source, year of publication, text type, title, author.#SEPThis corpus is a downloadable subset of the balanced Kres corpus. It can be downloaded from the CLARIN.SI repository.";"Reference corpora";"Download";"http://www.korpus-kres.net/";"Erjavec and Logar (2012)";"https://www.clarin.eu/resource-families/reference-corpora#Erjavec%20and%20Logar%20Berginc%202012"; +"Written corpus Gigafida 2.0";"http://hdl.handle.net/11356/1320";"Slovenian";"1.3 billion tokens, 1.1 billion words, 38,310 texts";"MSD-tagged, lemmatized";"Individual terms of agreement";;"This corpus includes representative Slovenian texts (newspapers, magazines, computer-mediated communication, fiction and non-fiction) published between 1990 and 2018. The corpus is encoded in TEI. Non-linguistic metadata includes information on source, year of publication, text type, title, author.#SEPThe corpus is available for online browsing through the noSketch Engine concordancer (CLARIN.SI distribution), as well as through a dedicated search engine.";"Reference corpora";"noSketchEngine#SEPConcordancer";"https://www.clarin.si/noske/run.cgi/corp_info?corpname=gfida20_dedup&struct_attr_stats=1#SEPhttps://viri.cjvt.si/gigafida/";"Krek et al. 
(2018)";"https://www.clarin.eu/resource-families/reference-corpora#Krek%20et%20al.%202016"; +"Written corpus Kres 1.0";"http://www.korpus-kres.net/";"Slovenian";"99 million words";"MSD-tagged, lemmatized";"Individual terms of agreement";;"This corpus includes balanced Slovenian texts (newspapers, magazines, computer-mediated communication, fiction and non-fiction) published between 1990 and 2011.#SEPThis corpus is a balanced subset of the representative Gigafida corpus (version 1). The corpus is encoded in TEI. Non-linguistic metadata includes information on source, year of publication, text type, title, author.#SEPThe corpus is available for online browsing through a dedicated concordancer.";"Reference corpora";"Concordancer";"https://www.korpus-kres.net/";"Krek et al. (2018)";"https://www.clarin.eu/resource-families/reference-corpora#Krek%20et%20al.%202016"; +"CorCenCC: Corpws Cenedlaethol Cymraeg Cyfoes – the National Corpus of Contemporary Welsh";"http://hdl.handle.net/20.500.12024/2564";"Welsh";"11 million words";;"CC BY-NC-SA 4.0";;"This corpus contains spoken, written and digital (e-language) Welsh. The corpus is accompanied by an online teaching and learning toolkit – Y Tiwtiadur – which draws directly on the data from the corpus to provide resources for Welsh language learning at all ages and levels.#SEPThe corpus is available for online browsing through a dedicated webpage and by request.";"Reference corpora";"Request#SEPConcordancer";"https://research.cardiff.ac.uk/converis/portal/detail/Dataset/119878310?auxfun=&lang=en_GB#SEPhttps://www.corcencc.org/explore";"Dawn et al. 
(2020)";"https://www.clarin.eu/resource-families/reference-corpora#Dawn%20et%20al.%202020"; +"Spoken corpus Gos 2.0";"http://hdl.handle.net/11356/1771";"Slovenian";"1534 texts; 127,604 utterances; 2,462,368 words";"PoS-tagged, lemmatised, phonetically and orthographically transcribed";"CC BY-SA 4.0";;"This corpus contains transcripts from radio and TV shows, school lessons, private conversations, business meetings. It is composed of three different sources: Spoken corpus Gos 1.1 (112 hours, 1 million words), Spoken corpus Gos VideoLectures 4.2 (22 hours, 179,000 words), a selection from the ASR database ARTUR 1.0 (185 hours, 1.2 million words). #SEPThe corpus is available for download from CLARIN.SI as well as through a dedicated webconcordancer.";"Reference corpora";"Concordancer (noSketchEngine)#SEPConcordancer (KonText)#SEPDownload";"https://www.clarin.si/ske/#dashboard?corpname=gos20&struct_attr_stats=1#SEPhttps://www.clarin.si/kontext/query?corpname=gos20#SEPhttp://hdl.handle.net/11356/1771";"Verdonik and Zwitter-Vitez (2011)";"https://www.clarin.eu/resource-families/spoken-corpora#Verdonik%20and%20Zwitter-Vitez.%202012"; +"Corpus of combined Slovenian corpora metaFida 1.0";"http://hdl.handle.net/11356/1775";"Slovenian";"6 billion tokens";"MSD-tagged (MULTEXT-East), lemmatised, normalised";"various";;"This corpus contains a number of existing Slovenian corpora available through the CLARIN.SI concordancers and thus provides a unified search across all the included corpora. metaFida contains over 4.7 billion words or 6 billion tokens from 15 million texts published between 1584 and 2022 from 34 corpora.#SEPIn the metaFida corpus we keep only information that is common to most of the selected corpora. The structure is nested very shallowly (text and paragraph), as it is then easier to create subcorpora or limit the search to individual text types. 
All metaFida positional attributes (word, normalised form, lemma, MULTEXT-East MSD in Slovenian and English) are considered to have multiple values, separated by a space.";"Reference corpora";"Concordancer (noSketchEngine)#SEPConcordancer (KonText)#SEPDownload";"https://www.clarin.si/ske/#dashboard?corpname=mfida10&struct_attr_stats=1#SEPhttps://www.clarin.si/kontext/query?corpname=mfida10#SEPhttp://hdl.handle.net/11356/1775";;;