__author__ = 'elmira'

from collections import defaultdict
import datetime
import re

import numpy as np
from sklearn.cluster import KMeans

# matches a single digit; used to normalize every number to a string of zeros
regDigits = re.compile(r'\d', flags=re.U)
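# e.g. regDigits.sub('0', '12-го') -> '00-го'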
class Word:
    """Bigram neighbourhood of a word: frequency dicts of the words seen
    immediately before it (predecessors) and after it (successors)."""
    def __init__(self):
        self.predecessors = defaultdict(int)
        self.successors = defaultdict(int)
def load_bigram_frequencies(path):
    """
    :param path: path to the csv file with tab-separated values; column 0 is
                 the frequency, column 1 the first word, column 3 the second
                 word (the layout the indices below assume)
    :return: dict {word: Word} filled with bigram counts
    """
    d = defaultdict(Word)
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.lower().strip().split('\t')
            # normalize digits so that all numbers fall together
            line[1] = regDigits.sub('0', line[1])
            line[3] = regDigits.sub('0', line[3])
            # accumulate: digit normalization can merge several raw bigrams
            d[line[1]].successors[line[3]] += int(line[0])
            d[line[3]].predecessors[line[1]] += int(line[0])
    return d
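# Sketch of the assumed 2grams line layout (inferred from the indices above,
# not verified against the actual ruscorpora file):
#     1085<TAB>в<TAB>...<TAB>доме
# which would yield bigrams['в'].successors['доме'] == 1085
# and bigrams['доме'].predecessors['в'] == 1085.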
def load_word_frequencies(path, bigr):
    """
    :param path: path to the csv file with tab-separated values - frequency,
                 then word
    :param bigr: bigram dict; words absent from it are skipped, so every word
                 kept here has bigram features
    :return: dict {word: frequency}
    """
    d = {}
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.lower().strip().split('\t')
            line[1] = regDigits.sub('0', line[1])
            if line[1] in bigr:
                # accumulate for the same reason as above
                d[line[1]] = d.get(line[1], 0) + int(line[0])
    return d
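# Assumed 1grams line layout (again inferred from the indices used above):
#     26112<TAB>дом   ->   words['дом'] == 26112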
def count_word_rank(d):
    """
    sort words from the most frequent to the least frequent
    :param d: dictionary of word frequencies
    :return: list of words sorted by descending frequency
    """
    return sorted(d, key=lambda k: d[k], reverse=True)
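# e.g. count_word_rank({'и': 9, 'дом': 5, 'в': 7}) -> ['и', 'в', 'дом']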
def elements_in_top(suc, top):
    """Count how many of the words in `top` occur in the neighbour dict `suc`."""
    return sum(i in suc for i in top)
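# e.g. elements_in_top({'и': 3, 'в': 1}, ['и', 'не', 'в']) -> 2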
def bigr_freqs(w, top):
    """Frequencies of the bigrams formed by each word in `top` with this word."""
    # use .get so that reading a missing key does not insert it into the
    # defaultdict (w[i] would); absent pairs contribute 0
    return [w.get(i, 0) for i in top]
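# e.g. bigr_freqs(bigrams['дом'].predecessors, ordered[:10]) (a hypothetical
# word) lists the counts of the (top-word, 'дом') bigrams, 0 where none was seen.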
# 1grams and 2grams from ruscorpora
PATH_TO_WORD_FILE = '1grams-3.txt'
PATH_TO_BIGRAM_FILE = '2grams-3.txt'
# load data
print('loading 2gram freqs...', datetime.datetime.now())
bigrams = load_bigram_frequencies(PATH_TO_BIGRAM_FILE)
print('loading word freqs...', datetime.datetime.now())
words = load_word_frequencies(PATH_TO_WORD_FILE, bigrams)
w_sum = sum(words.values())
ordered = count_word_rank(words)
print('generating data table', datetime.datetime.now())
# features:
# 1) relative word frequency,
# 2) how many of the 100 most frequent words occur after the word,
# 3) how many of the 100 most frequent words occur before the word,
# 4) length of the word,
# 5) frequencies of the bigrams in which the word stands in second place,
#    after each of the 10 most frequent words,
# 6) frequencies of the bigrams in which the word stands in first place,
#    before each of the 10 most frequent words
data = np.array([[words[w] / w_sum,
                  elements_in_top(bigrams[w].successors, ordered[:100]),
                  elements_in_top(bigrams[w].predecessors, ordered[:100]),
                  len(w)] +
                 bigr_freqs(bigrams[w].predecessors, ordered[:10]) +
                 bigr_freqs(bigrams[w].successors, ordered[:10])
                 for w in ordered])
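# Each row thus has 4 + 10 + 10 = 24 features, i.e. data.shape == (len(ordered), 24).
# Note that the columns mix a ratio, small counts and raw corpus frequencies;
# k-means is distance-based, so the largest-scale columns dominate unless the
# features are standardized first.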
num_clusters = 256
# a single random restart (init='random', n_init=1), so the clustering is
# run-dependent; verbose=2 logs the k-means iterations
km = KMeans(n_clusters=num_clusters, init='random', n_init=1, verbose=2)
km.fit_predict(data)
# group words by their cluster label and write out non-singleton clusters
d = defaultdict(list)
for label, word in zip(km.labels_, ordered):
    d[label].append(word)
with open('clusters2.txt', 'w', encoding='utf-8') as f:
    for key in d:
        if len(d[key]) > 1:
            print(key, d[key], file=f)
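# Each written line holds a cluster id followed by the repr of its member
# list, e.g. (with made-up members):
#     17 ['дом', 'стол', 'сад']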