-
-
Notifications
You must be signed in to change notification settings - Fork 0
/
cldfbench_koeblergothic.py
118 lines (100 loc) · 3.76 KB
/
cldfbench_koeblergothic.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
import csv
import json
import pathlib
import re
from functools import lru_cache
import attr
import spacy
from clldutils.misc import slug
from loanpy.scapplier import Adrc
from loanpy.utils import IPA
from lingpy.sequence.sound_classes import ipa2tokens, tokens2class
from pylexibank import Dataset as BaseDataset, Lexeme, FormSpec
from tqdm import tqdm
# install first with $ python -m spacy download de_core_news_lg
nlp = spacy.load('de_core_news_lg')  # German language model; used to check glosses for word vectors
# Sound-correspondence applier: WOT->EAH correspondence data and EAH inventory
# shipped in etc/ (paths are relative to the working directory).
ad = Adrc("etc/WOT2EAHsc.json", "etc/invsEAH.json")
#ad = Adrc("../ronataswestoldturkic/loanpy/WOT2EAHsc.json",
#          "../ronataswestoldturkic/loanpy/invsEAH.json")
ipa = IPA()  # loanpy IPA helper; provides get_prosody() below
# Passed to Adrc.adapt() and used in the output filename/column header;
# presumably the number of adapted candidate forms to predict — TODO confirm
HOWMANY = 100
def trim(word):
    """Strip one trailing "an" suffix from *word*, leaving the bare word "an" intact."""
    if word.endswith("an") and word != "an":
        return word[:-2]
    return word
@lru_cache(maxsize=None)
def filter_vectors(meaning):
    """
    Return *meaning* unchanged if the spaCy model has a vector for it, else None.

    Cached because the same gloss strings recur across rows.
    """
    doc = nlp(meaning)
    if doc.has_vector:
        return meaning
    return None
@attr.s
class CustomLexeme(Lexeme):
    # Two extra FormTable columns, filled in cmd_makecldf:
    # ProsodicStructure — prosody string from ipa.get_prosody over the segments;
    # SCA — concatenated SCA sound classes of the segments (lingpy).
    ProsodicStructure = attr.ib(default=None)
    SCA = attr.ib(default=None)
class Dataset(BaseDataset):
    """Lexibank dataset built from Köbler's Gothic dictionary."""

    dir = pathlib.Path(__file__).parent
    id = "koeblergothic"
    lexeme_class = CustomLexeme

    def cmd_makecldf(self, args):
        """
        Convert the raw data to a CLDF dataset.

        Steps:
          1. extend the schema with a SenseTable (custom ``Spacy`` and
             ``Parameter_ID`` columns),
          2. register sources and languages,
          3. add one concept per entry of ``self.concepts`` and one sense per
             comma-separated gloss, recording whether spaCy has a vector for it,
          4. add one form per row of ``raw/gothic.tsv``, enriched with prosodic
             structure and SCA sound classes,
          5. write predicted loanword adaptations to ``cldf/adapt{HOWMANY}.csv``.
        """
        # add sense table
        args.writer.cldf.add_component(
            "SenseTable",
            {"name": "Spacy", "datatype": "string"},
            {"name": "Parameter_ID", "datatype": "string"}
        )
        # add bib
        args.writer.add_sources()
        args.log.info("added sources")
        # add concepts; map gloss -> concept ID for linking forms below
        concepts = {}
        for i, concept in enumerate(tqdm(self.concepts, "Check vectors")):
            idx = str(i) + "_" + slug(concept["Sense"])
            concepts[concept["Sense"]] = idx
            args.writer.add_concept(
                ID=idx,
                Name=concept["Sense"],
                Concepticon_ID=concept["Concepticon_ID"],
                Concepticon_Gloss=concept["Concepticon_Gloss"],
            )
            # one SenseTable row per comma-separated gloss of the concept
            for j, sense_desc in enumerate(concept["Sense"].split(", ")):
                vector = filter_vectors(sense_desc)
                args.writer.objects["SenseTable"].append({
                    "ID": str(i) + "_" + slug(sense_desc) + "-" + str(j + 1),
                    "Entry_ID": 0,
                    "Description": sense_desc.strip(),
                    "Spacy": vector,  # the gloss itself if a vector exists, else None
                    "Parameter_ID": idx
                })
        args.log.info("added concepts and senses")
        # add language
        languages = args.writer.add_languages()
        args.log.info("added language")
        adidx = 1  # running ID for adaptation rows in the side-channel CSV
        # newline="" is required by the csv module to avoid translated/doubled
        # line endings; plain "w" since the file is only written, never read back.
        with open(f"cldf/adapt{HOWMANY}.csv", "w", newline="", encoding="utf-8") as f:
            writer = csv.writer(f)
            writer.writerow(["ID", "Form_ID", f"ad{HOWMANY}"])
            for i, row in enumerate(self.raw_dir.read_csv(
                "gothic.tsv", delimiter="\t", dicts=True
            )):
                args.writer.add_form(
                    Form=trim(row["Gothic"]),
                    Language_ID="Gothic",
                    Parameter_ID=concepts[row["Meaning"]],
                    Value=row["Gothic"],
                    Source="Kobler1989",
                    Local_ID=f"f{i}"
                )
                # NOTE(review): assumes add_form appends exactly one FormTable
                # row per input row, so index i matches — TODO confirm
                lex = args.writer.objects["FormTable"][i]
                pros = ipa.get_prosody(" ".join(lex["Segments"]))
                lex["ProsodicStructure"] = pros
                soundclass = tokens2class(ipa2tokens(lex["Segments"]), "sca")
                lex["SCA"] = "".join(soundclass)
                # write every predicted adaptation of this form
                for pred in ad.adapt(lex["Segments"], HOWMANY, pros):
                    writer.writerow([f"a{adidx}", f"f{i}", pred])
                    adidx += 1