-
Notifications
You must be signed in to change notification settings - Fork 33
/
timex.py
359 lines (314 loc) · 13.5 KB
/
timex.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
# Code for tagging temporal expressions in text
# For details of the TIMEX format, see http://timex2.mitre.org/
from __future__ import print_function
from __future__ import unicode_literals
import re
import string
import os
import sys
# Requires eGenix.com mx Base Distribution
# http://www.egenix.com/products/python/mxBase/
try:
from mx.DateTime import *
except ImportError:
print("""
Requires eGenix.com mx Base Distribution
http://www.egenix.com/products/python/mxBase/""")
# Predefined strings.
#
# Every pattern is a raw string so that regex escapes such as \s and \d reach
# the regex engine untouched.  Long alternations are split using adjacent
# string literals instead of backslash continuations inside the literal: a
# continuation keeps the whitespace around the line break, which silently
# turned alternatives such as "eleven" or "october" into " eleven" and
# " october" and stopped them from matching.
#
# In `numbers`, longer words come before the shorter words they start with
# ("sixteen" before "six", "ninety" before "nine") so that a scan with no
# trailing anchor picks up the whole word.  The leading "a" alternative only
# matches at the very start of the searched string and only when followed by
# whitespace.
numbers = (r"(^a(?=\s)|eleven|twelve|thirteen|fourteen|fifteen|sixteen|"
           r"seventeen|eighteen|nineteen|twenty|thirty|forty|fifty|sixty|"
           r"seventy|eighty|ninety|hundred|thousand|one|two|three|four|five|"
           r"six|seven|eight|nine|ten)")
day = r"(monday|tuesday|wednesday|thursday|friday|saturday|sunday)"
week_day = r"(monday|tuesday|wednesday|thursday|friday|saturday|sunday)"
month = (r"(january|february|march|april|may|june|july|august|september|"
         r"october|november|december)")
dmy = r"(year|day|week|month)"
rel_day = r"(today|yesterday|tomorrow|tonight|tonite)"
exp1 = r"(before|after|earlier|later|ago)"
exp2 = r"(this|next|last)"
iso = r"\d+[/-]\d+[/-]\d+ \d+:\d+:\d+\.\d+"
# A four-digit run at the start of the text or right after whitespace.
year = r"((?<=\s)\d{4}|^\d{4})"

# "<count> <unit>[s] <direction>", e.g. "3 days ago", "twenty five weeks later".
regxp1 = r"((\d+|(" + numbers + r"[-\s]?)+) " + dmy + r"s? " + exp1 + r")"
# "<this|next|last> <unit | weekday | month name>", e.g. "next friday".
regxp2 = r"(" + exp2 + r" (" + dmy + r"|" + week_day + r"|" + month + r"))"

reg1 = re.compile(regxp1, re.IGNORECASE)
reg2 = re.compile(regxp2, re.IGNORECASE)
reg3 = re.compile(rel_day, re.IGNORECASE)
reg4 = re.compile(iso)
reg5 = re.compile(year)
def tag(text):
    """Wrap every temporal expression found in *text* in <TIMEX2> tags.

    The module-level compiled patterns reg1..reg5 are applied to the text:
    counted offsets ("3 days ago"), this/next/last expressions, relative
    days ("today"), ISO-like timestamps and four-digit years.

    All matches are collected as character spans and the text is rebuilt in
    a single pass.  Substituting each found string back over the whole text
    (as a regex) would tag a short expression inside an already tagged
    longer one -- for example the year at the start of an ISO timestamp --
    and would also tag occurrences that none of the patterns matched.

    :param text: the text to annotate.
    :return: a copy of *text* with each expression wrapped as
        ``<TIMEX2>...</TIMEX2>``.  Expressions that are already directly
        followed by ``</TIMEX2>`` are left alone, so tagging an already
        tagged text does not add a second layer of tags.
    """
    open_tag = '<TIMEX2>'
    close_tag = '</TIMEX2>'

    # Collect candidate spans.  The sort key prefers the leftmost match and,
    # for equal starts, the longest one; pattern order breaks remaining ties.
    candidates = []
    for priority, pattern in enumerate((reg1, reg2, reg3, reg4, reg5)):
        for match in pattern.finditer(text):
            start, end = match.span()
            if start == end:
                continue
            # Tag only temporal expressions which haven't been tagged.
            if text.startswith(close_tag, end):
                continue
            candidates.append((start, start - end, priority, end))
    candidates.sort()

    # Rebuild the text, skipping any span that overlaps one already emitted.
    pieces = []
    cursor = 0
    for start, _neg_length, _priority, end in candidates:
        if start < cursor:
            continue
        pieces.append(text[cursor:start])
        pieces.append(open_tag + text[start:end] + close_tag)
        cursor = end
    pieces.append(text[cursor:])
    return ''.join(pieces)
# Lookup table for week days to simplify the grounding task.
# Capitalised day name -> index, Monday = 0 through Sunday = 6.
hashweekdays = dict(zip(
    ('Monday', 'Tuesday', 'Wednesday', 'Thursday',
     'Friday', 'Saturday', 'Sunday'),
    range(7)))
# Lookup table for months to simplify the grounding task.
# Capitalised month name -> month number, January = 1 through December = 12.
hashmonths = dict(zip(
    ('January', 'February', 'March', 'April', 'May', 'June',
     'July', 'August', 'September', 'October', 'November', 'December'),
    range(1, 13)))
# Hash number in words into the corresponding integer value
_NUMBER_WORDS = {
    'one': 1, 'two': 2, 'three': 3, 'four': 4, 'five': 5,
    'six': 6, 'seven': 7, 'eight': 8, 'nine': 9, 'ten': 10,
    'eleven': 11, 'twelve': 12, 'thirteen': 13, 'fourteen': 14,
    'fifteen': 15, 'sixteen': 16, 'seventeen': 17, 'eighteen': 18,
    'nineteen': 19, 'twenty': 20, 'thirty': 30, 'forty': 40,
    'fifty': 50, 'sixty': 60, 'seventy': 70, 'eighty': 80,
    'ninety': 90, 'hundred': 100, 'thousand': 1000,
}

# Longest words first: with a plain prefix test "sixteen" and "sixty" would
# be read as "six", "fourteen" as "four", and so on.  The final alternative
# accepts the indefinite article "a" as a whole word.
_NUMBER_WORD_RE = re.compile(
    '(' + '|'.join(sorted(_NUMBER_WORDS, key=len, reverse=True)) + r')|a\b',
    re.IGNORECASE)


def hashnum(number):
    """Return the integer value of the number word that starts *number*.

    Matching is case-insensitive and anchored at the start of the string;
    anything after the number word is ignored, so ``'two days'`` gives 2.
    The word ``'a'`` (as a whole word) counts as 1.

    :param number: a string beginning with a number word, e.g. ``'sixteen'``.
    :return: the value as an int, or None when the string does not start
        with a known number word.
    """
    match = _NUMBER_WORD_RE.match(number)
    if match is None:
        return None
    word = match.group(1)
    if word is None:
        # Only the article alternative matched.
        return 1
    return _NUMBER_WORDS[word.lower()]
# Direction words of the this/next/last expressions as signed offsets.
_RELATIVE_SHIFT = {'last': -1, 'this': 0, 'next': 1}


def _iso_week_label(date):
    """Return '<year>W<week>' for *date* using its iso_week triple.

    iso_week is a (year, week, day) triple; the year is taken from the same
    triple as the week so the two stay consistent around New Year.
    """
    iso = date.iso_week
    return str(iso[0]) + 'W' + str(iso[1])


def _shift_month(year_num, month_num, delta):
    """Return (year, month) moved by *delta* months; delta may be negative."""
    index = year_num * 12 + (month_num - 1) + delta
    return index // 12, index % 12 + 1


def _words_to_digits(timex):
    """Rewrite a spelled-out count as digits: 'twenty five days ago' -> '25 days ago'.

    The values of the individual number words are simply added up.  The
    timex is returned unchanged when it holds no number word, has no
    day/week/month/year unit, or none of its words is known to hashnum().
    """
    if not re.search(numbers, timex, re.IGNORECASE):
        return timex
    # maxsplit and flags are passed by keyword: the third positional
    # argument of re.split is maxsplit, not flags.
    parts = re.split(r'\s(?=days?|months?|years?|weeks?)', timex,
                     maxsplit=1, flags=re.IGNORECASE)
    if len(parts) != 2:
        return timex
    value, unit = parts
    amounts = [hashnum(word) for word in re.findall(r'[A-Za-z]+', value)]
    amounts = [amount for amount in amounts if amount is not None]
    if not amounts:
        return timex
    return str(sum(amounts)) + ' ' + unit


def _resolve_timex(timex, base_date):
    """Return the grounded value of one timex string, or 'UNKNOWN'.

    Values derived from date arithmetic are returned as str() of the
    resulting date object; the caller strips any time-of-day part.
    """
    flags = re.IGNORECASE

    # If timex matches ISO format, remove 'time' and reorder 'date'
    if re.match(r'\d+[/-]\d+[/-]\d+ \d+:\d+:\d+\.\d+', timex):
        date_parts = re.split(r'/|-', re.split(r'\s', timex)[0])
        return date_parts[2] + '-' + date_parts[1] + '-' + date_parts[0]

    # Specific dates: a bare four-digit year.  Anchored so that a count
    # such as "2000 days ago" is not mistaken for a year.
    if re.match(r'\d{4}$', timex):
        return str(timex)

    # Relative dates
    if re.match(r'tonight|tonite|today', timex, flags):
        return str(base_date)
    if re.match(r'yesterday', timex, flags):
        return str(base_date + RelativeDateTime(days=-1))
    if re.match(r'tomorrow', timex, flags):
        return str(base_date + RelativeDateTime(days=+1))

    # Weekday in the previous, current or following week.
    match = re.match(r'(last|this|next) ' + week_day, timex, flags)
    if match:
        shift = _RELATIVE_SHIFT[match.group(1).lower()]
        # The table keys are capitalised; the text may use any case.
        weekday_idx = hashweekdays[match.group(2).strip().capitalize()]
        return str(base_date + RelativeDateTime(weeks=shift,
                                                weekday=(weekday_idx, 0)))

    # Last, this, next week.
    match = re.match(r'(last|this|next) week', timex, flags)
    if match:
        shift = _RELATIVE_SHIFT[match.group(1).lower()]
        return _iso_week_label(base_date + RelativeDateTime(weeks=shift))

    # Named month in the previous, current or following year.
    match = re.match(r'(last|this|next) ' + month, timex, flags)
    if match:
        shift = _RELATIVE_SHIFT[match.group(1).lower()]
        month_num = hashmonths[match.group(2).strip().capitalize()]
        return str(base_date.year + shift) + '-' + str(month_num)

    # Last, this, next month (handles the year boundary).
    match = re.match(r'(last|this|next) month', timex, flags)
    if match:
        shift = _RELATIVE_SHIFT[match.group(1).lower()]
        new_year, new_month = _shift_month(base_date.year, base_date.month,
                                           shift)
        return str(new_year) + '-' + str(new_month)

    # Last, this, next year.
    match = re.match(r'(last|this|next) year', timex, flags)
    if match:
        return str(base_date.year + _RELATIVE_SHIFT[match.group(1).lower()])

    # "<n> <unit>[s] <direction>", e.g. "3 days ago", "2 months later".
    match = re.match(
        r'(\d+) (day|week|month|year)s? (ago|earlier|before|later|after)',
        timex, flags)
    if match:
        offset = int(match.group(1))
        if match.group(3).lower() in ('ago', 'earlier', 'before'):
            offset = -offset
        unit = match.group(2).lower()
        if unit == 'day':
            return str(base_date + RelativeDateTime(days=offset))
        if unit == 'week':
            return _iso_week_label(base_date + RelativeDateTime(weeks=offset))
        if unit == 'month':
            new_year, new_month = _shift_month(base_date.year,
                                               base_date.month, offset)
            return str(new_year) + '-' + str(new_month)
        return str(base_date.year + offset)

    return 'UNKNOWN'  # Default value


# Given a timex_tagged_text and a Date object set to base_date,
# returns timex_grounded_text
def ground(tagged_text, base_date):
    """Add a ``val`` attribute to every <TIMEX2> element of *tagged_text*.

    Each ``<TIMEX2>...</TIMEX2>`` element is resolved against *base_date*
    and rewritten as ``<TIMEX2 val="...">...</TIMEX2>``.  Expressions that
    cannot be resolved get ``val="UNKNOWN"``.

    Local names are kept distinct from the module-level patterns
    (``month``, ``week_day``, ...): assigning to a local called ``month``
    would make every later use of the pattern inside the function fail.

    :param tagged_text: text containing <TIMEX2> elements, e.g. from tag().
    :param base_date: the reference date.  It must provide ``year``,
        ``month`` and ``iso_week`` and support adding a RelativeDateTime
        (presumably an mx.DateTime value -- confirm against the caller).
    :return: the text with grounded <TIMEX2> elements.
    """
    # Find all identified timex and put them into a list
    timex_regex = re.compile(r'<TIMEX2>.*?</TIMEX2>', re.DOTALL)
    timex_found = [re.sub(r'</?TIMEX2.*?>', '', element)
                   for element in timex_regex.findall(tagged_text)]

    handled = set()
    for timex_ori in timex_found:
        # One replacement rewrites every occurrence of the same expression.
        if timex_ori in handled:
            continue
        handled.add(timex_ori)

        # If numbers are given in words, hash them into corresponding numbers.
        # eg. twenty five days ago --> 25 days ago
        timex = _words_to_digits(timex_ori)
        timex_val = _resolve_timex(timex, base_date)

        # Remove 'time' from timex_val.
        # For example, If timex_val = 2000-02-20 12:23:34.45, then
        # timex_val = 2000-02-20
        timex_val = re.sub(r'\s.*', '', timex_val)

        # Substitute tag+timex in the text with grounded tag+timex.  A plain
        # string replacement is used so the timex text is never interpreted
        # as a regular expression or replacement template.
        tagged_text = tagged_text.replace(
            '<TIMEX2>' + timex_ori + '</TIMEX2>',
            '<TIMEX2 val="' + timex_val + '">' + timex_ori + '</TIMEX2>')

    return tagged_text
####
def demo():
    """Tag the first 10000 characters of the NLTK ABC corpus file
    'rural.txt' and print the tagged text."""
    import nltk

    sample = nltk.corpus.abc.raw('rural.txt')
    print(tag(sample[:10000]))


if __name__ == '__main__':
    demo()