-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathLDA.py
121 lines (90 loc) · 4.99 KB
/
LDA.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
import pandas as pd
import matplotlib.pyplot as plt
import gensim.corpora as corpora
from gensim.corpora import Dictionary
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import LatentDirichletAllocation
import pyLDAvis
import pyLDAvis.sklearn
import pickle
from pathlib import Path
#Create a function to build the optimal LDA model
def ideal_model(dataframe, column_name):
'''
INPUTS:
dataframe - dataframe that contains the reviews
column_name: name of column that contains reviews
OUTPUTS:
lda_tfidf - Latent Dirichlet Allocation (LDA) model
dtm_tfidf - document-term matrix in the tfidf format
tfidf_vectorizer - word frequency in the reviews
A graph comparing LDA Model Performance Scores with different params
'''
text_raw = dataframe[column_name].tolist()
for n in range(len(text_raw)):
if type(text_raw[n])==float:
text_raw[n]=str(text_raw[n])
else:
continue
#*********** Step 1: Convert to document-term matrix ************#
#Transform text to vector form using the vectorizer object
text_vectorizer = CountVectorizer(strip_accents = 'unicode',
stop_words = 'english',
lowercase = True,
token_pattern = r'\b[a-zA-Z]{3,}\b', # num chars > 3 to avoid some meaningless words
max_df = 0.9, # discard words that appear in > 90% of the reviews
min_df = 10) # discard words that appear in < 10 reviews
#apply transformation
text_matrix_tfidf = TfidfVectorizer(**text_vectorizer.get_params())
#convert to document-term matrix
dtm_text_matrix = text_matrix_tfidf.fit_transform(text_raw)
print("The shape of the tfidf is {}, meaning that there are {} {} and {} tokens made through the filtering process.".\
format(dtm_text_matrix.shape,dtm_text_matrix.shape[0], column_name, dtm_text_matrix.shape[1]))
#******* Step 2: GridSearch & parameter tuning to find the optimal LDA model *******#
# Define the range of values for each parameter in which to determine optimal parameters from
test_params = {'n_components': [4, 8, 12, 16, 20, 24, 28],
'learning_decay': [.5, .7, .9]}
# Initialize the Model
lda = LatentDirichletAllocation()
# Init Grid Search Class
model = GridSearchCV(lda, param_grid=test_params)
# Do the Grid Search
model.fit(dtm_text_matrix)
#***** Step 3: Output the optimal lda model and its parameters *****#
# Best Model
optimal_lda_model = model.best_estimator_
# Model Parameters
print("Optimal Model's Params: ", model.best_params_)
# Log Likelihood Score: Higher the better
print("Model Log Likelihood Score: ", model.best_score_)
# Perplexity: Lower the better. Perplexity = exp(-1. * log-likelihood per word)
print("Model Perplexity: ", optimal_lda_model.perplexity(dtm_text_matrix))
#*********** Step 4: Compare LDA Model Performance Scores ***********#
#Get Log Likelyhoods from Grid Search Output
gscore=model.fit(dtm_text_matrix).cv_results_
n_topics = [4, 8, 12, 16, 20, 24, 28]
log_likelihood_dec5 = [gscore['mean_test_score'][gscore['params'].index(v)] for v in gscore['params'] if v['learning_decay']==0.5]
log_likelihood_dec7 = [gscore['mean_test_score'][gscore['params'].index(v)] for v in gscore['params'] if v['learning_decay']==0.7]
log_likelihood_dec9 = [gscore['mean_test_score'][gscore['params'].index(v)] for v in gscore['params'] if v['learning_decay']==0.9]
# Show graph
plt.figure(figsize=(20, 12))
plt.plot(n_topics, log_likelihood_dec5, label='0.5')
plt.plot(n_topics, log_likelihood_dec7, label='0.7')
plt.plot(n_topics, log_likelihood_dec9, label='0.9')
plt.title("Choosing Optimal LDA Model")
plt.xlabel("Num Topics")
plt.ylabel("Log Likelihood Scores")
plt.legend(title='Learning decay', loc='best')
plt.show()
return optimal_lda_model, dtm_text_matrix, text_matrix_tfidf
def lda(data):
# Combine data of all customer reviews into a single column for easier data extraction
rev = data[["TITLE","REVIEW"]]
rev["feedback"]=rev["TITLE"]+" "+rev["REVIEW"]
# Obtain Optimal Model & Proccessed Data
optimal_lda_model, dtm_text_matrix, text_matrix_tfidf = ideal_model(rev, "feedback")
# Visaulise Topics of Data and top 30 words of each topic
optimal_model_vis = pyLDAvis.sklearn.prepare(optimal_lda_model, dtm_text_matrix, text_matrix_tfidf)
# Save topics to html file to be loaded onto website
pyLDAvis.save_html(optimal_model_vis, 'templates/pickle_lda_model_sklearn.html')