'''
HOMEWORK: Yelp Review Text
'''
# TASK 1: read yelp.csv into a DataFrame
import pandas as pd
yelp = pd.read_csv('yelp.csv')

# TASK 2: create a new DataFrame that only contains the 5-star and 1-star reviews
yelp_best_worst = yelp[(yelp.stars==5) | (yelp.stars==1)]

# TASK 3: split the new DataFrame into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(yelp_best_worst.text, yelp_best_worst.stars, random_state=1)
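
# Optional sanity check: confirm the default 75/25 train/test split
print(X_train.shape, X_test.shape)
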
# TASK 4: use CountVectorizer to create document-term matrices from X_train and X_test
from sklearn.feature_extraction.text import CountVectorizer
vect = CountVectorizer()
train_dtm = vect.fit_transform(X_train)
test_dtm = vect.transform(X_test)
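
# Optional sanity check: both matrices share the vocabulary learned from X_train,
# so they have the same number of columns
print(train_dtm.shape, test_dtm.shape)
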
# TASK 5: use Naive Bayes to predict the star rating for the testing set, and calculate accuracy
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()
nb.fit(train_dtm, y_train)
y_pred_class = nb.predict(test_dtm)
from sklearn import metrics
print(metrics.accuracy_score(y_test, y_pred_class))
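
# Optional: compare against the null accuracy (always predicting the most frequent class)
print(y_test.value_counts(normalize=True).max())
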
# TASK 6: calculate the AUC
y_pred_prob = nb.predict_proba(test_dtm)[:, 1]    # predicted probability of the 5-star class
import numpy as np
y_test_binary = np.where(y_test==5, 1, 0)         # encode 5 stars as the positive class
print(metrics.roc_auc_score(y_test_binary, y_pred_prob))
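
# Optional: confirm the class ordering, so that column 1 of predict_proba is indeed the 5-star class
print(nb.classes_)
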
# TASK 7: plot the ROC curve
import matplotlib.pyplot as plt
fpr, tpr, thresholds = metrics.roc_curve(y_test_binary, y_pred_prob)
plt.plot(fpr, tpr)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.xlabel('False Positive Rate (1 - Specificity)')
plt.ylabel('True Positive Rate (Sensitivity)')
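
# Optional: add a diagonal chance line for reference and display the figure
plt.plot([0, 1], [0, 1], linestyle='--')
plt.show()
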
# TASK 8: print the confusion matrix, and calculate sensitivity and specificity
print(metrics.confusion_matrix(y_test, y_pred_class))
print(813 / float(25 + 813))    # sensitivity (values read from the confusion matrix above)
print(126 / float(126 + 58))    # specificity
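
# Optional: the same numbers computed straight from the confusion matrix
# (rows/columns are ordered 1-star, 5-star)
cm = metrics.confusion_matrix(y_test, y_pred_class)
print(cm[1, 1] / float(cm[1, 0] + cm[1, 1]))    # sensitivity
print(cm[0, 0] / float(cm[0, 0] + cm[0, 1]))    # specificity
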
# TASK 9: browse the review text for the false positives and false negatives
false_positives = X_test[y_test < y_pred_class]    # 1-star reviews incorrectly predicted as 5-star
false_negatives = X_test[y_test > y_pred_class]    # 5-star reviews incorrectly predicted as 1-star
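
# Optional: read one misclassified review of each type in full
print(false_positives.iloc[0])
print(false_negatives.iloc[0])
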
# TASK 10: change the threshold to balance sensitivity and specificity
y_pred_class = np.where(y_pred_prob > 0.999, 5, 1)
print(metrics.confusion_matrix(y_test, y_pred_class))
print(723 / float(115 + 723))    # sensitivity (values read from the new confusion matrix)
print(162 / float(162 + 22))     # specificity
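
# Optional: a rough look at how sensitivity and specificity trade off at a few candidate thresholds
for threshold in [0.9, 0.99, 0.999]:
    pred = np.where(y_pred_prob > threshold, 5, 1)
    cm = metrics.confusion_matrix(y_test, pred)
    print(threshold, cm[1, 1] / float(cm[1, 0] + cm[1, 1]), cm[0, 0] / float(cm[0, 0] + cm[0, 1]))
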
# TASK 11 (BONUS): 5-class classification on the original DataFrame
X_train, X_test, y_train, y_test = train_test_split(yelp.text, yelp.stars, random_state=1)
train_dtm = vect.fit_transform(X_train)
test_dtm = vect.transform(X_test)
nb.fit(train_dtm, y_train)
y_pred_class = nb.predict(test_dtm)
print(metrics.accuracy_score(y_test, y_pred_class))
print(metrics.confusion_matrix(y_test, y_pred_class))
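
# Optional: per-class precision and recall, since overall accuracy hides which star ratings get confused
print(metrics.classification_report(y_test, y_pred_class))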