#coding:utf-8
import os, sys
import jieba
from datetime import datetime

# Python 2 hack: make UTF-8 the default encoding so the decoded stop words
# and jieba's unicode output can be compared with plain byte strings.
reload(sys)
sys.setdefaultencoding("utf-8")
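
# Overview of the pipeline implemented below:
#   1. split_words()  segments every raw document under original\<category>\
#      with jieba and writes one space-separated document per file to
#      splitted\<category>\.
#   2. get_vocab()    counts word frequencies over the segmented files and
#      writes words occurring more than vocab_mini_count times to dict.txt.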
# Stop words: punctuation (both ASCII and full-width), common Chinese
# function words, and HTML leftovers, all normalized to unicode.
stopWordList = [w.decode('utf-8') for w in
                [',', '的', '\n', ' ', '。', '、', '在', '了', '是', '“', '”',
                 '&', 'nbsp', '和', ':', ';', '有', '也', '我', ',', '对',
                 '就', '中', '他', ')', '(', '-', ';', ')', '.', '(',
                 '?', '》', '《', ':', '[', ']', '!', '\"']]
stopWordSet = set(stopWordList)
print stopWordSet
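# Note: '&' and 'nbsp' are listed separately above, presumably because jieba
# splits the HTML entity '&nbsp;' left over from the crawled pages into those
# two tokens.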


class sogouC(object):
    def __init__(self, root_path, sub_path_list, vocab_mini_count=5):
        self.root_path = root_path
        # raw documents live under original\, segmented output goes to splitted\
        self.original_path = os.path.join(root_path, 'original\\')
        self.splitted_path = os.path.join(root_path, 'splitted\\')
        self.sub_path_list = sub_path_list
        # a word must occur more than this many times to enter the vocabulary
        self.vocab_mini_count = vocab_mini_count
    def split_words(self):
        # Segment every raw document with jieba and write each one back as a
        # single line of space-separated tokens, with stop words removed.
        for sub_path in self.sub_path_list:
            original_path = os.path.join(self.original_path, sub_path)
            splitted_path = os.path.join(self.splitted_path, sub_path)
            print original_path, 'splitting start...'
            if not os.path.isdir(splitted_path):
                os.makedirs(splitted_path)
            for filepath in os.listdir(original_path):
                splitted_doc_cache = []
                with open(os.path.join(original_path, filepath), 'r') as f1:
                    for line in f1.readlines()[1:]:  # skip the first line of each document
                        words = list(jieba.cut(line.strip()))
                        splitted_doc_cache.append(words)
                with open(os.path.join(splitted_path, filepath), 'w') as f2:
                    line = ''
                    for words in splitted_doc_cache:
                        for word in words[2:]:  # skip the first two tokens of each line
                            if word in stopWordSet:
                                continue
                            line = line + word + ' '
                    f2.write(line)
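
    # For reference, jieba's default accurate-mode segmentation (the example
    # from the jieba README): jieba.cut('我来到北京清华大学') yields
    # 我 / 来到 / 北京 / 清华大学; so each output file ends up as one long
    # line of space-separated tokens.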
    def get_vocab(self):
        # Count word frequencies across all segmented categories, then write
        # the words occurring more than vocab_mini_count times to dict.txt,
        # most frequent first, and return them.
        vocab_map = dict()
        for sub_path in self.sub_path_list:
            begin = datetime.now()
            splitted_path = os.path.join(self.splitted_path, sub_path)
            print splitted_path, 'read start...'
            for filepath in os.listdir(splitted_path):
                with open(os.path.join(splitted_path, filepath), 'r') as f1:
                    all_the_text = f1.read()
                for word in all_the_text.split(' '):
                    word = word.strip()
                    if word not in vocab_map:
                        vocab_map[word] = 0
                    vocab_map[word] += 1
            end = datetime.now()
            print "time cost is %d second." % ((end - begin).seconds)
        vocab_sorted = sorted(vocab_map.iteritems(), key=lambda d: d[1], reverse=True)
        vocab_valid = []
        with open(os.path.join(self.root_path, 'dict.txt'), 'w') as f2:
            for word, count in vocab_sorted:
                if count > self.vocab_mini_count:
                    vocab_valid.append(word)
                    f2.write(word + '\n')
        return vocab_valid
    def get_vocab_new(self):
        # Timing experiment: count words in a single category directory and
        # report the elapsed time in milliseconds.
        vocab_map = dict()
        begin = datetime.now()
        sub_path = 'C000008\\'  # one category is enough for the benchmark
        splitted_path = os.path.join(self.splitted_path, sub_path)
        for filepath in os.listdir(splitted_path):
            with open(os.path.join(splitted_path, filepath), 'r') as f1:
                all_the_text = f1.read()
            for word in all_the_text.split(' '):
                word = word.strip()
                # test membership on the dict itself: 'in vocab_map.keys()'
                # builds and scans a list on every lookup in Python 2
                if word not in vocab_map:
                    vocab_map[word] = 0
                vocab_map[word] += 1
        end = datetime.now()
        print "time cost is %d ms." % int((end - begin).total_seconds() * 1000)


if __name__ == '__main__':
    # __file__ (the variable, not the string "__file__") makes the data path
    # relative to this script rather than the current working directory
    abs_path = os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir))
    root_path = os.path.join(abs_path, "data\\SogouC.reduced\\")
    sub_paths = ['C000008\\', 'C000010\\', 'C000013\\', 'C000014\\', 'C000016\\',
                 'C000020\\', 'C000022\\', 'C000023\\', 'C000024\\']
    print "split words..."
    sc = sogouC(root_path, sub_paths, vocab_mini_count=5)
    sc.split_words()
    print "get vocab..."
    vocab = sc.get_vocab()
    #print len(vocab)
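
# Expected on-disk layout (an assumption inferred from the paths above):
#   <repo>\data\SogouC.reduced\original\C000008\...   raw SogouC.reduced docs
#   <repo>\data\SogouC.reduced\splitted\C000008\...   segmented output
#   <repo>\data\SogouC.reduced\dict.txt               frequency-filtered vocabulary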