-
Notifications
You must be signed in to change notification settings - Fork 0
/
utils.py
61 lines (45 loc) · 1.51 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import spacy
from wordcloud import WordCloud
def preprocess_sentence():
    """Build a sentence-cleaning function backed by a spaCy pipeline.

    The 'en_core_web_md' pipeline is loaded once, when this factory is
    called, and captured by the returned closure so that repeated calls to
    the closure do not reload it.

    Returns:
        A callable taking one sentence and returning its cleaned form.
    """
    pipeline = spacy.load('en_core_web_md')

    def preprocess_sentence_(sentence):
        """Return the space-joined lemmas of the tokens that are kept.

        Tokens flagged by spaCy as stop words or as punctuation are
        skipped; the result is an empty string when nothing is kept.
        """
        lemmas = []
        for tok in pipeline(sentence):
            if tok.is_stop or tok.is_punct:
                continue
            lemmas.append(tok.lemma_)
        return ' '.join(lemmas)

    return preprocess_sentence_
def process_data(df, key, path=None):
    """Preprocess one column of a DataFrame and optionally save the result.

    Args:
        df: Source DataFrame.
        key: Label of the column in ``df`` holding the sentences.
        path: Optional CSV destination. When truthy, the result is written
            there without the index column.

    Returns:
        A new DataFrame with two columns: ``key`` (the original sentences)
        and ``processed_<key>`` (their preprocessed form). Rows whose
        preprocessed text is the empty string are left out of both columns.
    """
    processed_key = f'processed_{key}'

    # Run the preprocessing function over every entry of the column.
    raw = df[key]
    cleaned = raw.apply(preprocess_sentence())

    # Preprocessing can reduce a sentence to '', so compute the keep-mask
    # once and apply it to both series to keep them row-aligned.
    keep = cleaned != ''

    # Assemble the output frame column by column.
    result = pd.DataFrame()
    result[key] = raw[keep]
    result[processed_key] = cleaned[keep]

    if path:
        result.to_csv(path, index=False)
    return result
def print_wordcloud(top_words, scores, topic_index, topic_name, num_topics):
    """Draw a word cloud of a topic's top words in a two-column subplot grid.

    Args:
        top_words: Iterable of words to display.
        scores: Iterable of scores paired positionally with ``top_words``;
            each score is truncated to an integer and used as the word's
            frequency.
        topic_index: Zero-based index of the topic; selects subplot
            ``topic_index + 1``.
        topic_name: Title shown above the subplot.
        num_topics: Total number of topics; the grid has
            ``ceil(num_topics / 2)`` rows and 2 columns.

    The function draws on the current matplotlib figure and returns nothing;
    it does not call ``plt.show()``.
    """
    # Map each word to its integer frequency (scores are truncated).
    freq = {word: int(score) for word, score in zip(top_words, scores)}

    # Create and generate a word cloud image.
    wordcloud = WordCloud(background_color='white').generate_from_frequencies(freq)

    # The `np.int` alias was removed from NumPy (deprecated in 1.20, removed
    # in 1.24), so convert the row count with the builtin int instead.
    num_rows = int(np.ceil(num_topics / 2.))

    # Display the generated image.
    plt.subplot(num_rows, 2, topic_index + 1)
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.title(topic_name)
    plt.axis("off")