-
Notifications
You must be signed in to change notification settings - Fork 5
/
commons.py
105 lines (84 loc) · 3.33 KB
/
commons.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
# -*- coding: utf-8 -*-
"""
Common structures and functions used by other scripts.
"""
import logging
try:
    from xml.etree import cElementTree as ET
except ImportError:  # xml.etree.cElementTree was removed in Python 3.9
    from xml.etree import ElementTree as ET

from nltk.tokenize import RegexpTokenizer
# Integer class id for each entailment label string found in the data.
str_to_entailment = dict(none=0, entailment=1, paraphrase=2)

# Class id that read_xml assigns when a label is unrecognised or (with
# force=True) absent; it is not one of the ids in str_to_entailment.
wrong_label_value = 3

# Reverse lookup: integer class id -> label string.
entailment_to_str = dict(
    (class_id, label) for label, class_id in str_to_entailment.items()
)
class Pair(object):
    '''
    Class representing a pair of texts from SICK or RTE.

    It is meant to be used as an abstract representation for both.
    Instances are plain containers: the constructor stores its arguments
    unchanged and performs no validation or conversion.
    '''

    def __init__(self, t, h, id_, entailment, similarity):
        '''
        :param t: string with the text
        :param h: string with the hypothesis
        :param id_: identifier of the pair in the original file; stored
            as given (read_xml passes the raw XML attribute string)
        :param entailment: int indicating entailment class, or None when
            no label is available
        :param similarity: float with the similarity score, or None when
            no score is available
        '''
        self.t = t
        self.h = h
        self.id = id_
        self.entailment = entailment
        self.similarity = similarity

    def __repr__(self):
        '''
        Return a constructor-style representation, so pairs are readable
        in logs and debuggers instead of showing only a memory address.
        '''
        template = ('{}(t={!r}, h={!r}, id_={!r}, '
                    'entailment={!r}, similarity={!r})')
        return template.format(type(self).__name__, self.t, self.h,
                               self.id, self.entailment, self.similarity)
def read_xml(filename, need_labels, force=False):
    '''
    Parse an RTE-style XML file into a list of Pair objects.

    Every <pair> element found under the root yields one Pair built from
    the text of its <t> and <h> children and from its "id", "entailment"
    and "similarity" attributes. An entailment value that is not a key of
    str_to_entailment is reported through logging.error and replaced by
    wrong_label_value.

    :param filename: name of the file to read
    :param need_labels: boolean indicating if labels should be present;
        when true, a pair with neither an entailment label nor a
        similarity score raises ValueError
    :param force: when true, a missing or empty entailment becomes
        wrong_label_value and a missing or empty similarity becomes 0.
        (so the need_labels check cannot fail); otherwise both are None
    :return: list of Pair objects
    :raises ValueError: if need_labels is true and a pair lacks both values
    '''
    root = ET.parse(filename).getroot()
    result = []

    for node in root.iter('pair'):
        text = node.find('t').text
        hypothesis = node.find('h').text
        pair_id = node.attrib['id']

        # Entailment label: absent and empty attributes are treated alike.
        raw_label = node.get('entailment')
        if raw_label:
            label_key = raw_label.lower()
            if label_key in str_to_entailment:
                label = str_to_entailment[label_key]
            else:
                template = 'Unexpected value for attribute ' \
                    '"entailment" at pair {}: {}'
                logging.error(template.format(pair_id, label_key))
                label = wrong_label_value
        elif force:
            label = wrong_label_value
        else:
            label = None

        # Similarity score: same absent/empty handling as the label.
        raw_score = node.get('similarity')
        if raw_score:
            score = float(raw_score)
        elif force:
            score = 0.
        else:
            score = None

        if need_labels and label is None and score is None:
            raise ValueError('Missing both entailment and similarity values '
                             'for pair {}'.format(pair_id))

        result.append(Pair(text, hypothesis, pair_id, label, score))

    return result
# Verbose-mode pattern handed to RegexpTokenizer by tokenize_sentence.
# Alternatives are tried left to right, so their order matters.
# NOTE(review): the "dates" alternative appears unreachable as written.
# It starts with \d+, and the earlier \w+ alternative already matches any
# run of digits, so e.g. 12/03/2012 is split at the slashes. Moving it
# ahead of \w+ would change the tokenization; confirm intent before doing so.
_TOKENIZER_REGEXP = r'''(?ux)
# the order of the patterns is important!!
(?:[^\W\d_]\.)+| # one letter abbreviations, e.g. E.U.A.
\d+(?:[.,]\d+)*(?:[.,]\d+)| # numbers in format 999.999.999,99999
\w+(?:\.(?!\.|$))?| # words with numbers (including hours as 12h30),
# followed by a single dot but not at the end of sentence
\d+(?:[-\\/]\d+)*| # dates. 12/03/2012 12-03-2012
\$| # currency sign
-+| # any sequence of dashes
\S # any non-space character
'''


def tokenize_sentence(text):
    '''
    Split a Portuguese sentence into tokens.

    A new RegexpTokenizer is built from the module's tokenizer pattern on
    every call and applied to the text.

    :param text: text to be tokenized, as a string
    :return: the tokens, as returned by RegexpTokenizer.tokenize
    '''
    return RegexpTokenizer(_TOKENIZER_REGEXP).tokenize(text)