import numpy as np
import graphlab as gl


def predict(document_bow, word_topic_counts, topic_counts, vocab,
            alpha=0.1, beta=0.01, num_burnin=5):
    """
    Make predictions for a single document.

    Parameters
    ----------
    document_bow : dict
        Dictionary with words as keys and word counts as values.
    word_topic_counts : numpy array, num_vocab x num_topics
        Number of times a given word has ever been assigned to a topic.
    topic_counts : numpy vector of length num_topics
        Number of times any word has been assigned to a topic.
    vocab : dict
        Words are keys and a unique integer id is the value.
    alpha : float
        Hyperparameter. See topic_model docs.
    beta : float
        Hyperparameter. See topic_model docs.
    num_burnin : int
        Number of iterations of Gibbs sampling to perform at predict time.

    Returns
    -------
    out : numpy array of length num_topics
        Probabilities that the document belongs to each topic.
    """
    num_vocab, num_topics = word_topic_counts.shape

    # Keep only words seen in the training vocabulary so that topic
    # assignments stay aligned with the words they belong to.
    in_vocab = [(word, freq) for word, freq in document_bow.items()
                if word in vocab]

    # Proportion of each topic in this test doc
    doc_topic_counts = np.zeros(num_topics)

    # Topic assignment of each unique word
    doc_topic_assignments = []

    # Initialize assignments and counts
    # NB: we are assuming document_bow doesn't change.
    for word, freq in in_vocab:
        topic = np.random.randint(num_topics)  # upper bound is exclusive
        doc_topic_assignments.append(topic)
        doc_topic_counts[topic] += freq
    # Sample topic assignments for the test document
    for burnin in range(num_burnin):
        for i, (word, freq) in enumerate(in_vocab):
            word_id = vocab[word]

            # Get old topic and decrement counts
            topic = doc_topic_assignments[i]
            doc_topic_counts[topic] -= freq

            # Sample a new topic from the collapsed Gibbs conditional:
            # p(k) proportional to (n_dk + alpha) * (n_wk + beta) / (n_k + V * beta)
            gamma = np.zeros(num_topics)  # unnormalized probabilities
            for k in range(num_topics):
                gamma[k] = (doc_topic_counts[k] + alpha) * \
                           (word_topic_counts[word_id, k] + beta) / \
                           (topic_counts[k] + num_vocab * beta)
            gamma = gamma / gamma.sum()  # normalize to probabilities
            topic = np.random.choice(num_topics, p=gamma)  # scalar draw

            # Use new topic to increment counts
            doc_topic_assignments[i] = topic
            doc_topic_counts[topic] += freq
    # Create predictions
    predictions = np.zeros(num_topics)
    total_doc_topic_counts = doc_topic_counts.sum()
    for k in range(num_topics):
        predictions[k] = (doc_topic_counts[k] + alpha) / \
                         (total_doc_topic_counts + num_topics * alpha)
    return predictions / predictions.sum()


if __name__ == '__main__':
    docs = gl.SFrame({'text': [{'first': 5, 'doc': 1}, {'second': 3, 'doc': 5}]})
    m = gl.topic_model.create(docs)

    # Get test document in bag-of-words format
    document_bow = docs['text'][0]

    # Input: global parameters from the trained model.
    # word_topic_counts is the number of times each word in the vocabulary has
    # ever been assigned to topic k (in any document). You can build an
    # approximate version of this by multiplying m['topics'] by some large
    # number (e.g. the number of tokens in the corpus) that indicates how
    # strongly you "believe" in these topics, then flooring it to integer counts.
    prior_strength = 1000000
    word_topic_counts = np.array(m['topics']['topic_probabilities'])
    word_topic_counts = np.floor(prior_strength * word_topic_counts)
    # Number of times any word has been assigned to each topic.
    topic_counts = word_topic_counts.sum(0)

    # Get vocabulary lookup: word -> integer id
    num_topics = m['num_topics']
    vocab = {}
    for i, w in enumerate(m['topics']['vocabulary']):
        vocab[w] = i
    num_vocab = len(vocab)

    # Make prediction on test document
    probs = predict(document_bow, word_topic_counts, topic_counts, vocab)
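    # Quick sanity check of the result: probs has length num_topics and sums
    # to 1, so the argmax is the most likely topic for the test document.
    print('Predicted topic distribution: %s' % probs)
    print('Most likely topic: %d' % probs.argmax())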