-
Notifications
You must be signed in to change notification settings - Fork 31
/
tfidf.py
73 lines (61 loc) · 1.97 KB
/
tfidf.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
import os
import re
from math import log
punct = '[.,!«»?&@"$\[\]\(\):;%#&\'—-]'
def preprocessing(text): # функция предобработки текста
text_wo_punct = re.sub(punct, '', text.lower()) # удаляем пунктуацию, приводим в нижний регистр
words = text_wo_punct.strip().split() # делим по пробелам
return words
def count_tf(word, text):
i = 0
for w in text:
if w == word:
i += 1
#i = text.count(word)
tf = i / len(text)
return tf
def count_df(word, texts):
i = 0
#for text in texts:
# for w in text:
# if w == word:
# i += 1
# break
i = [1 for text in texts if word in text]
df = sum(i)
return df
def count_idf(word, texts):
df = count_df(word, texts)
idf = len(texts)/ (1 + df)
return idf
def count_tfidf(word, text, texts):
tf = count_tf(word, text)
idf = count_idf(word, texts)
tfidf = log(tf, 10) * log(idf, 10)
return tfidf
def main():
for root, dirs, files in os.walk('wikipedia'):
texts = {}
for f in files:
with open(os.path.join(root, f), 'r', encoding='utf-8') as t:
content = t.read()
text = preprocessing(content)
texts[f] = text
raw_texts = list(texts.values())
for text in texts:
dic_tfidf = {}
for word in texts[text]:
if word in dic_tfidf:
continue
tfidf = count_tfidf(word, texts[text], raw_texts)
dic_tfidf[word] = tfidf
print('Ключевые слова для текста {}:'.format(text))
i = 0
for el in sorted(dic_tfidf, key=lambda x: dic_tfidf[x]):
print(el, dic_tfidf[el])
i += 1
if i > 3:
break
#main()
if __name__ == '__main__':
main()