from pypinyin import lazy_pinyin  # Chinese characters -> pinyin
from pychai import Schema  # character decomposition (chai zi) via Wubi schemas
import re
import sys

# Global counters: `count` is the total number of hits; `a`, `b`, `c`
# count the hits of the first three sensitive words respectively.
count = 0
a = 0
b = 0
c = 0
def takeFirst(elem):  # sort key: take the first element of a tuple
    return elem[0]
"""
实现拆字的功能
"""
def func_chaizi():
wubi98 = Schema('wubi98') # 实例化一个 Schema 对象,我们将其命名为 wubi98
wubi98.run()
for nameChar in wubi98.charList:
if nameChar in wubi98.component:
scheme = wubi98.component[nameChar]
else:
tree = wubi98.tree[nameChar]
componentList = tree.flatten_with_complex(wubi98.complexRootList)
scheme = sum((wubi98.component[component] for component in componentList), tuple())
if len(scheme) == 1:
objectRoot = scheme[0]
nameRoot = objectRoot.name
if nameChar in '王土大木工目日口田山禾白月人金言立水火之已子女又幺':
info = [nameRoot] * 4
elif nameChar in '一丨丿丶乙':
info = [nameRoot] * 2 + ['田'] * 2
else:
firstStroke = objectRoot.strokeList[0].type
secondStroke = objectRoot.strokeList[1].type
if objectRoot.charlen == 2:
info = [nameRoot, firstStroke, secondStroke]
else:
lastStroke = objectRoot.strokeList[-1].type
info = [nameRoot, firstStroke, secondStroke, lastStroke]
elif len(scheme) < 4:
if nameChar in wubi98.component or tree.structure not in 'hz':
weima = '3'
elif tree.structure == 'h':
weima = '1'
elif tree.structure == 'z':
weima = '2'
lastObjectRoot = scheme[-1]
quma = wubi98.category[lastObjectRoot.strokeList[-1].type]
shibiema = quma + weima
info = [objectRoot.name for objectRoot in scheme] + [shibiema]
elif len(scheme) > 4:
scheme = scheme[:3] + scheme[-1:]
info = [objectRoot.name for objectRoot in scheme]
else:
info = [objectRoot.name for objectRoot in scheme]
code = ''.join(wubi98.rootSet[nameRoot] for nameRoot in info)
wubi98.encoder[nameChar] = code
return wubi98
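# A sketch of what the returned schema provides (the exact keys and codes
# depend on pychai's wubi98 data, so the values below are assumptions):
#   schema = func_chaizi()
#   schema.encoder['法']  # -> the character's Wubi code string
#   schema.tree['法']     # -> the binary decomposition tree used by Tosplit()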
def Tosplit(split_word, word):
    """Binary-split a character; return the first char of each half's name."""
    # ASCII letters, digits, and newlines cannot be decomposed.
    if ('a' <= word <= 'z') or ('A' <= word <= 'Z') or word.isdigit() or word == '\n':
        return '0'
    if word in split_word.tree.keys():
        # Tree.first / Tree.second: the two halves of the binary split,
        # e.g. for 理 the first part is 王 and the second is 里.
        return split_word.tree[word].first.name[0], split_word.tree[word].second.name[0]
    else:
        return '0'
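# Illustrative example (the actual component names come from pychai's data,
# so these splits are assumptions): Tosplit(schema, '法') might return
# ('氵', '去'), while Tosplit(schema, 'a') and Tosplit(schema, '3') return '0'.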
"""
实现敏感词检测
"""
def ToSearchsensitive(sentence_list, rules_list, rules_split_list, pinyinlist, sensitive_words):
global count
global a
global b
global c
ans_list=[]
for line, sentence in enumerate(sentence_list):
sensitive_index = []
for num, rules in enumerate(rules_split_list):
for i in re.finditer(rules, sentence, re.I): # 不区分大小写
res = 'Line' + str(line) + ':' + ' <' + sensitive_words[num] + '> ' + i.group()+'\n'
if num == 0:
a += 1
elif num ==1:
b += 1
elif num == 2:
c += 1
sensitive_index.append((i.span()[0], res))
count += 1
location = []
for index, word in enumerate(sentence): #对多音字进行处理
for pinyin in pinyinlist:
if ''.join(lazy_pinyin(word)) == pinyin[1] and word != pinyin[0]: #当拼音一样而字不一样
location.append((index, word)) #找出多音字的下标
s1 = list(sentence)
s1[index] = pinyin[0]
sentence = ''.join(s1) #进行多音字的替换,替换为敏感词
for num, rules in enumerate(rules_list):
for i in re.finditer(rules, sentence, re.I):
flag = 1
for duoyinzi in location:
if i.span()[0] <= duoyinzi[0] <= i.span()[1]:
s2 = list(sentence)
s2[duoyinzi[0]] = duoyinzi[1]
sentence = ''.join(s2)
flag = 0
if flag == 1:
res = 'Line' + str(line + 1) + ':' + ' <' + sensitive_words[num] + '> ' + i.group()+'\n' # group用来提出分组截获的字符串
if num == 0:
a += 1
elif num == 1:
b += 1
elif num == 2:
c += 1
sensitive_index.append((i.span()[0], res))
count += 1
else:
ans = 'Line' + str(line + 1) + ':' + ' <' + sensitive_words[num] + '> ' + sentence[i.span()[0]:i.span()[1]] + '\n'
if num == 0:
a += 1
elif num == 1:
b += 1
elif num == 2:
c += 1
sensitive_index.append((i.span()[0], res))
count += 1
sensitive_index.sort(key=takeFirst)
# 将列表中的敏感词按下标位置输出
for group in sensitive_index:
ans_list.append(group[1]) #转换成字符串形式输出
ans = 'Total: ' + str(count) + '\n'
txt_ans.write(ans)
for answer in ans_list:
txt_ans.write(answer)
print(a, b, c)
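# The answer file ends up looking like this (words and matches illustrative):
#   Total: 3
#   Line1: <法轮> 法lun
#   Line1: <falungong> faLunGong
#   Line2: <法轮> 氵去车仑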
def creatRules_pinyin(dictionary):  # build the pinyin part of the regex rules
    rules = []
    for key in dictionary:  # each sensitive word
        length_key = len(key)
        rules_key = ''
        for num, character in enumerate(key, start=1):  # 1-based position in the word
            if 'a' <= character <= 'z' or 'A' <= character <= 'Z':  # English letter
                if num == length_key:
                    rules_key += '(?:' + character + '|' + character.upper() + ')'
                else:
                    rules_key += '(?:' + character + '|' + character.upper() + ')[^\\u4e00-\\u9fa5]*'
            else:  # Chinese character: full pinyin, its initial, or the character itself
                pinyin = ''.join(lazy_pinyin(character))
                if num == length_key:
                    rules_key += '(?:' + pinyin + '|' + pinyin[0] + '|' + character + ')'
                else:
                    rules_key += '(?:' + pinyin + '|' + pinyin[0] + '|' + character + ')[^\\u4e00-\\u9fa5]*'
        rules.append(rules_key)
    return rules
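# For a key such as '法轮' the generated rule would be (assuming lazy_pinyin
# yields 'fa' and 'lun'):
#   (?:fa|f|法)[^\u4e00-\u9fa5]*(?:lun|l|轮)
# i.e. each character may appear as its full pinyin, its initial, or itself,
# with arbitrary non-Chinese characters in between.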
def creatRules_split(dictionary):  # build the component-split (chai zi) regex rules
    rules = []
    for key in dictionary:  # each sensitive word
        length_key = len(key)
        rules_key = ''
        for num, character in enumerate(key, start=1):  # each character of the word
            if 'a' <= character <= 'z' or 'A' <= character <= 'Z':  # English letters have no split
                continue
            else:  # Chinese character
                character_split = ''
                for pianpang in dictionary[key][num - 1]:
                    if pianpang == '0':
                        continue
                    else:
                        character_split += pianpang  # e.g. '氵' + '去' -> '氵去'
                if num == length_key:
                    rules_key += '(?:' + character_split + ')'
                else:
                    rules_key += '(?:' + character_split + ')[^\\u4e00-\\u9fa5]*'
        if rules_key != '':
            rules.append(rules_key)
    return rules
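# For '法轮' with binary splits ('氵', '去') and ('车', '仑') (illustrative;
# the real splits come from pychai), the generated rule would be
#   (?:氵去)[^\u4e00-\u9fa5]*(?:车仑)
# which catches the word written out as its components.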
def Creatdictionary(sensitive_words, text):
    # Build the homophone list: one (character, pinyin) tuple per sensitive
    # character, e.g. [('法', 'fa'), ('轮', 'lun')].
    pinyinlist = []
    for word in sensitive_words:
        for character in word:
            if 'a' <= character <= 'z' or 'A' <= character <= 'Z':
                break
            else:
                pinyinlist.append((character, ''.join(lazy_pinyin(character))))
    dictionary = {}
    split = func_chaizi()
    for word in sensitive_words:
        if 'a' <= word[0] <= 'z' or 'A' <= word[0] <= 'Z':  # English sensitive word
            dictionary.update({word: None})  # key = word, value = None
            continue
        split_word_list = []
        for character in word:  # decompose each character
            split_word_list.append(Tosplit(split, character))
        split_word_tuple = tuple(split_word_list)  # convert the list to a tuple
        dictionary.update({word: split_word_tuple})  # key = word, value = the binary splits
    ToSearchsensitive(text, creatRules_pinyin(dictionary), creatRules_split(dictionary), pinyinlist, sensitive_words)  # run the detection
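# Resulting dictionary shape (values illustrative):
#   {'法轮': (('氵', '去'), ('车', '仑')), 'falungong': None}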
if __name__ == '__main__':
    # Open the sensitive-word list, the source text, and the answer file.
    txt_word = open(sys.argv[1], 'r', encoding='utf-8')
    txt_org = open(sys.argv[2], 'r', encoding='utf-8')
    txt_ans = open(sys.argv[3], 'w', encoding='utf-8')
    # Read the sensitive words, one per line.
    read_ans = txt_word.read()
    sensitive_words = read_ans.split('\n')
    text = txt_org.readlines()
    Creatdictionary(sensitive_words, text)  # build the dictionary and run the detection
    txt_word.close()
    txt_org.close()
    txt_ans.close()
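# Usage: python main.py <sensitive_words.txt> <source.txt> <answer.txt>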