-
Notifications
You must be signed in to change notification settings - Fork 1
/
main.py
94 lines (75 loc) · 2.82 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
import re
import os
import itertools as itt
import time
from bs4 import BeautifulSoup
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import matplotlib.pyplot as plt
# Names of every message sender seen so far. get_data() adds to this set
# while parsing files, and parse_data() iterates it to group words per user.
users_name = set()
def get_data(url):
    """Extract sender/text pairs from one exported chat HTML file.

    Every ``div`` whose class attribute matches ``message default`` is
    treated as a message. A message carrying the ``joined`` class is
    attributed to the most recent sender seen earlier in the same file.
    Messages that contain no ``div.text`` element are skipped.

    Side effects: adds each sender name to the module-level ``users_name``
    set and prints how many messages were collected from the file.

    Args:
        url: Path of the HTML file to read (opened as UTF-8).

    Returns:
        A list of ``{'name': sender, 'text': lowercased_text}`` dicts in
        document order.
    """
    with open(url, 'r', encoding='utf-8') as file:
        soup = BeautifulSoup(file.read(), features='html.parser')

    messages_data = []
    last_user = ''
    for msg in soup.find_all('div', class_=re.compile('message default')):
        # Search the already-parsed tag directly; serialising it and building
        # a second BeautifulSoup document per message only repeats work.
        name_div = None
        if 'joined' not in msg.get('class', []):
            name_div = msg.find('div', class_='from_name')

        if name_div is None:
            # "joined" message, or no sender block found: reuse the previous
            # sender instead of failing on a missing element.
            msg_user = last_user
        else:
            msg_user = name_div.text.strip()
            last_user = msg_user
            users_name.add(msg_user)

        text_div = msg.find('div', class_='text')
        if text_div is None:
            # No text block in this message: nothing to record.
            continue
        messages_data.append({
            'name': msg_user,
            'text': text_div.text.strip().lower(),
        })

    print(f'{len(messages_data)} messages in {os.path.split(url)[1]}')
    return messages_data
def get_data_from_path(path):
    """Collect messages from every ``.html`` file directly inside *path*.

    Files are visited in the order returned by ``os.listdir``; entries whose
    name does not end in ``.html`` are ignored. Prints the total number of
    collected messages.

    Args:
        path: Directory containing the exported chat pages.

    Returns:
        A single list with the message dicts returned by ``get_data`` for
        each HTML file, concatenated in visiting order.
    """
    data = []
    for filename in os.listdir(path):
        if filename.endswith('.html'):
            # extend() appends in place; rebuilding the list through
            # itertools.chain for each file re-copied all earlier messages.
            data.extend(get_data(os.path.join(path, filename)))
    print('Общее количество сообщений - ' + str(len(data)))
    return data
def parse_data(data):
    """Group the words of all messages by the user who wrote them.

    Only names present in the module-level ``users_name`` set get an entry;
    a known user without any messages maps to an empty list. Words are the
    ``\\w+`` runs of each message text, kept in message order. Prints the set
    of chat participants.

    Args:
        data: Iterable of ``{'name': ..., 'text': ...}`` message dicts.

    Returns:
        Dict mapping each user name to the list of words they wrote.
    """
    words_by_name = {name: [] for name in users_name}
    # One pass over the messages instead of rescanning all of them per user.
    for el in data:
        user_words = words_by_name.get(el['name'])
        if user_words is not None:
            user_words.extend(re.findall(r'\w+', el['text']))
    print('Участники чата: ' + str(users_name))
    return words_by_name
def show_wordCloud(wordCloud, name):
    """Display a word cloud in a matplotlib figure.

    Args:
        wordCloud: Image-like object accepted by ``plt.imshow``.
        name: Passed as ``num`` to ``plt.figure`` to identify the figure.
    """
    plt.figure(num=name)
    plt.imshow(wordCloud, interpolation="bilinear")
    # Hide the axes so only the image itself is drawn.
    plt.axis("off")
    plt.show()
def create_wordCloud(words):
    """Render one word-cloud PNG per user into the current directory.

    Each image is written as ``<unix-time>_<index>_<first-name>.png``. The
    loop index and the user's first name keep files distinct when several
    clouds are produced within the same second (a bare timestamp made them
    overwrite each other). Users with no words are skipped.

    Args:
        words: Mapping of user name -> list of words written by that user.
    """
    for index, (user_name, user_words) in enumerate(words.items()):
        if not user_words:
            # WordCloud.generate() raises ValueError when it has no words.
            continue
        print('Создается облако слов...')
        wordCloud = WordCloud(width=500,
                              height=500,
                              max_words=1000,
                              min_font_size=3,
                              background_color="white").generate(' '.join(user_words))
        # First token of the display name, reduced to filename-safe
        # characters (\w is Unicode-aware, so non-Latin names survive).
        correct_name = re.sub(r'[^\w.-]', '_', user_name.split(' ')[0]) or 'user'
        wordCloud.to_file(f"{int(time.time())}_{index}_{correct_name}.png")
        print('Успешно!')
def main(path=r'C:\Users\Ivan\Downloads\Telegram Desktop\ChatExport_2021-03-29'):
    """Build per-user word clouds from an exported chat directory.

    Reads every HTML page in *path*, groups the words by sender and writes
    one word-cloud image per sender.

    Args:
        path: Directory holding the exported chat HTML files. Defaults to
            the export location that was previously hard-coded here.
    """
    data = get_data_from_path(path)
    parsed_words = parse_data(data)
    create_wordCloud(parsed_words)
# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()