-
Notifications
You must be signed in to change notification settings - Fork 73
/
sentence.py
67 lines (57 loc) · 1.71 KB
/
sentence.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
#encoding=utf8
from enum import Enum
class TagPrefix(Enum):
    """Tag prefixes used in the output corpus.

    Each member maps a tag of the raw corpus (the member name) to the
    prefix written in front of the corresponding output-corpus label
    (the member value).
    """

    general = ''  # marker for tokens that are not named entities
    t = 'Date_'  # marker for time expressions

    @classmethod
    def convert(cls):
        """Return a new dict mapping every member name to its prefix string.

        ``__members__`` is used (rather than iterating the class) so that
        alias names, if any are ever added, are included as well.
        """
        return {name: member.value for name, member in cls.__members__.items()}
class TagSurfix(Enum):
    """Tag suffixes of the output corpus, following the BMES scheme.

    The value of a member is appended to a tag prefix to form the label of
    one character.
    """

    S = 's'  # the only character of a single-character token
    B = 'b'  # first character of a multi-character token
    M = 'm'  # interior character of a multi-character token
    E = 'e'  # last character of a multi-character token
class Sentence:
    """Accumulates the tokens of one sentence together with their tags.

    Attributes:
        tokens: the tokens in the order they were added.
        tags: the tag string of each token, parallel to ``tokens``.
        chars: running total of ``len(token)`` over all added tokens.
    """

    def __init__(self):
        self.tokens = []  # tokens, in insertion order
        self.tags = []  # tag of the token at the same index
        self.chars = 0

    def addToken(self, t, tag):
        """Append token ``t`` with its ``tag`` and update the character count."""
        self.chars += len(t)
        self.tokens.append(t)
        self.tags.append(tag)

    def clear(self):
        """Drop all tokens and tags and reset the character count to zero."""
        self.tokens = []
        self.chars = 0
        self.tags = []

    def generate_tr_line(self, x, y):
        """Split every token into characters and label each character.

        For each character of each token, the character is appended to ``x``
        and the token's tag followed by a ``TagSurfix`` value is appended to
        ``y``. Both lists are extended in place; nothing is returned.

        Args:
            x: list receiving the character sequence.
            y: list receiving the label of each character in ``x``.
        """
        # tokens and tags are only ever appended/reset together, so they can
        # be walked in lockstep.
        for token, tag in zip(self.tokens, self.tags):
            last = len(token) - 1
            for pos, char in enumerate(token):
                if last == 0:
                    suffix = TagSurfix.S  # single-character token
                elif pos == 0:
                    suffix = TagSurfix.B
                elif pos == last:
                    suffix = TagSurfix.E
                else:
                    suffix = TagSurfix.M
                x.append(char)
                y.append(tag + suffix.value)