-
Notifications
You must be signed in to change notification settings - Fork 1
/
utils.py
107 lines (82 loc) · 3.48 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
import numpy as np
import spacy
from scipy.stats import spearmanr, pearsonr
def preprocess(sentences, lowercase=True, stop_words=True, punctuation=True,
               only_ascii=True, lemmatization=True):
    """
    Preprocesses the given sentences applying the specified filters
    and extracting the tokens that verify those filters

    :param sentences: list of sentences (plain strings)
    :param lowercase: the text is lowercased
    :param stop_words: stop words are removed
    :param punctuation: punctuation is removed
    :param only_ascii: non-ASCII characters are removed
    :param lemmatization: lemmatization is applied
    :returns: preprocessed sentences as a NumPy object array of token arrays
    """
    nlp = spacy.load("en_core_web_sm")
    preprocessed_sentences = []
    for doc in nlp.pipe(sentences, disable=["tagger", "parser", "ner"]):
        # A Doc iterates directly over its tokens; the previous `doc.doc`
        # was a no-op self-reference (Doc.doc returns the Doc itself).
        tokens = list(doc)
        if stop_words:
            tokens = [t for t in tokens if not t.is_stop]
        if punctuation:
            tokens = [t for t in tokens if not t.is_punct]
        if only_ascii:
            tokens = [t for t in tokens if t.is_ascii]
        # Convert Token objects to strings (lemma or surface form).
        tokens = [t.lemma_ if lemmatization else t.text for t in tokens]
        if lowercase:
            tokens = [t.lower() for t in tokens]
        preprocessed_sentences.append(np.array(tokens))
    # dtype=object: sentences generally have different token counts, so the
    # outer array is ragged; a plain np.array(...) on ragged input raises
    # ValueError on NumPy >= 1.24.
    return np.array(preprocessed_sentences, dtype=object)
def evaluate(corpus, methods):
    """
    Computes the weighted Pearson and Spearman correlations of a STS corpus
    using the given methods

    :param corpus: dict mapping dataset name -> (sentences1, sentences2, gold scores)
    :param methods: dictionary mapping a label to a similarity method; each
        method takes (sentences1, sentences2) and returns similarity scores
    :returns: Pearson's and Spearman's correlation coefficients of every method over the corpus
    """
    pearson_correlations = {}
    spearman_correlations = {}
    # Iterate items(): `methods` is a dictionary, so iterating it directly
    # would yield only the labels and the tuple-unpacking would fail.
    for label, method in methods.items():
        corpus_pearson = []
        corpus_spearman = []
        corpus_weights = []
        for dataset in corpus.keys():
            sentences1, sentences2, gs = corpus[dataset]
            # Each dataset is weighted by its number of gold-standard pairs.
            corpus_weights.append(len(gs))
            sims = method(sentences1, sentences2)
            corpus_pearson.append(pearsonr(sims, gs)[0])
            corpus_spearman.append(spearmanr(sims, gs)[0])
        # Hoist the total weight: summing it inside the generator made the
        # weighted average quadratic in the number of datasets.
        total_weight = sum(corpus_weights)
        pearson_correlations[label] = sum(
            p * w / total_weight for p, w in zip(corpus_pearson, corpus_weights))
        spearman_correlations[label] = sum(
            s * w / total_weight for s, w in zip(corpus_spearman, corpus_weights))
    return pearson_correlations, spearman_correlations
def get_frequencies(corpus, threshold=0):
    """
    Computes the word frequencies of a corpus

    :param corpus: dict mapping dataset name -> (sentences1, sentences2, gold scores)
    :param threshold: minimum number of appearances for a word to be kept
    :returns: dict mapping each word to its count, plus the special markers
        '<s>', '</s>' and '<p>' with artificially high counts
    """
    freqs = {}
    for sentences1, sentences2, _ in corpus.values():
        for sentence in sentences1 + sentences2:
            for token in sentence:
                freqs[token] = freqs.get(token, 0) + 1
    # Drop rare words only when a positive threshold was requested.
    if threshold > 0:
        freqs = {word: count for word, count in freqs.items()
                 if count >= threshold}
    # Sentence/paragraph markers always get near-infinite counts so they
    # survive any threshold and rank above every real word.
    freqs['<s>'] = 1e9 + 4
    freqs['</s>'] = 1e9 + 3
    freqs['<p>'] = 1e9 + 2
    return freqs