# create_wordcloud.py
# %%
from wordcloud import WordCloud
from spacy.lang.de.stop_words import STOP_WORDS as STOP_WORDS_DE
from spacy.lang.en.stop_words import STOP_WORDS as STOP_WORDS_EN
#from spacy.lang.pt.stop_words import STOP_WORDS as STOP_WORDS_PT
#from spacy.lang.fr.stop_words import STOP_WORDS as STOP_WORDS_FR
import pandas as pd
import numpy as np
import spacy
import csv
import json
nlp = spacy.load('de_core_news_sm')
#nlp = spacy.load('pt_core_news_sm')
#nlp = spacy.load('en_core_web_sm')
#nlp = spacy.load("fr_core_news_sm")
# Polish Spacy Model: https://github.com/ipipan/spacy-pl
#nlp = spacy.load('pl_spacy_model') # or spacy.load('pl_spacy_model_morfeusz')
# Polish Stop Words: https://github.com/Alir3z4/python-stop-words
#from stop_words import get_stop_words
#STOP_WORDS_PL = get_stop_words('pl')
base_path = '/Users/philippbolte/Documents/FollowerNetworkTwitter'
def get_dict_from_texts(texts, filter_words, min_word_length):
    """Count lemma frequencies in texts, after dropping punctuation, stop words, short words and filter words."""
    text_dict = {}
    for text in texts:
        # Lower-case, then parse
        text_lowercase = nlp(text.lower())
        # Lemmatization: join the lemmas and re-parse so we get Token objects again
        text_lemma = nlp(' '.join(word.lemma_ for word in text_lowercase))
        # Filter punctuation
        word_list = [tok.text for tok in text_lemma if tok.pos_ != 'PUNCT']
        # Filter stop words (German and English)
        word_list_cleaned = [tok for tok in word_list if tok not in STOP_WORDS_DE]
        word_list_cleaned = [tok for tok in word_list_cleaned if tok not in STOP_WORDS_EN]
        # Filter short words
        word_list_cleaned = [tok for tok in word_list_cleaned if len(tok) >= min_word_length]
        # Filter words that contain any of the explicit filter words
        word_list_cleaned = [tok for tok in word_list_cleaned if not any(fword in tok for fword in filter_words)]
        # Count term frequencies
        for term in word_list_cleaned:
            text_dict[term] = text_dict.get(term, 0) + 1
    return text_dict
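# %%
# A minimal smoke test for get_dict_from_texts, on two hypothetical German
# sentences (not part of the Twitter data) -- useful for sanity-checking the
# lemmatization and filtering before running over a full tweet dump:
sample_texts = ['Die Katzen laufen durch den Garten.', 'Eine Katze schläft im Garten.']
print(get_dict_from_texts(sample_texts, filter_words=['http'], min_word_length=3))
# Prints a {lemma: count} dict; stop words like 'die' and 'durch' should be gone.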
def save_wordcloud(text_dict, max_words, path):
    wc = WordCloud(height=1080, width=1920, background_color="white", max_words=max_words)
    wc.generate_from_frequencies(text_dict)
    wc.to_file(path)
    # Optional inline preview (needs: import matplotlib.pyplot as plt)
    #plt.figure()
    #plt.imshow(wc)
    #plt.axis("off")
    #plt.pause(1)
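# %%
# Quick sketch: render a word cloud from hand-made frequencies (hypothetical
# values and output path), to verify the PNG export independently of spaCy:
demo_frequencies = {'klimaschutz': 40, 'europa': 25, 'digitalisierung': 15}
save_wordcloud(demo_frequencies, 50, base_path + '/data/demo_wordcloud.png')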
def load_csv(_filename):
    with open(base_path + '/data/tweets' + _filename, newline='') as f:
        reader = csv.reader(f)
        data = list(reader)
    return data
def save_dict_as_json(_text_dict, _path):
    with open(_path, 'w', encoding='utf8') as fp:
        json.dump(_text_dict, fp, indent=4, ensure_ascii=False)
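# %%
# Round-trip sketch (hypothetical demo path): save a small dict and read it
# back, confirming umlauts survive thanks to ensure_ascii=False:
demo_path = base_path + '/data/demo_text_dict.json'
save_dict_as_json({'grüne': 3, 'wählen': 1}, demo_path)
with open(demo_path, encoding='utf8') as fp:
    print(json.load(fp))  # {'grüne': 3, 'wählen': 1}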
# %%
influential_users_by_party = [{'Die_Gruenen': load_csv('/Die_Gruenen_cluster/influential_users.csv')}]
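# Each row of influential_users.csv is a list of strings; the loop below
# assumes the layout [screen_name, cluster_id, ...], so for a hypothetical
# row ['ABaerbock', '2'], user[0] is the handle and user[1] the cluster.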
# %%
for influential_user in influential_users_by_party:
    party = list(influential_user.keys())[0]
    users = list(influential_user.values())[0]
    for user in users:
        print('Do ' + user[0])
        cluster = user[1]
        name = user[0]
        print('\\_____ Get tweets')
        df_tweets = pd.read_csv(base_path + '/data/tweets/' + party + '_cluster/' + str(cluster) + '/' + name + '_tweets.csv')
        texts = df_tweets['text']
        filter_words = ['https', 'http', '@', 'amp', 'all', 'jed', 'link', 'mal', 'prof', 'welch', 'ander', '-PRON-']
        print('\\_____ Build Dict')
        text_dict = get_dict_from_texts(texts, filter_words, 3)
        print('\\_____ Save Dict')
        save_dict_as_json(text_dict, base_path + '/data/tweets/' + party + '_cluster/' + str(cluster) + '/' + name + '_text_dict.json')
        print('\\_____ Generate WC')
        save_wordcloud(text_dict, 50, base_path + '/data/tweets/' + party + '_cluster/' + str(cluster) + '/' + name + '_wordcloud.png')
# %%