-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathAT&T NLP mock_test data preprocessing.py
65 lines (45 loc) · 1.74 KB
/
AT&T NLP mock_test data preprocessing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import re

# Shared stemmer used to reduce every kept token to its Porter stem.
ps = PorterStemmer()

# Load the review corpus. This section reads the 'Reviews' column; the
# 'Label' column is read later in the script.
dataset = pd.read_csv('AT&T_Data.csv')

# Build the stop-word set once. Calling stopwords.words('english') inside the
# token loop repeats that call for every token and tests membership against a
# list instead of a hashed set.
_STOP_WORDS = frozenset(stopwords.words('english'))
_NON_LETTERS = re.compile(r'[^a-zA-Z]')


def _clean_review(text):
    """Return *text* normalised to a space-joined string of stemmed tokens.

    Apostrophes are removed first (so "don't" becomes "dont"), every other
    non-letter character is replaced by a space, the text is lower-cased and
    split on whitespace, tokens found in the English stop-word set are
    discarded, and the remaining tokens are Porter-stemmed.
    """
    letters_only = _NON_LETTERS.sub(' ', text.replace("'", ''))
    return ' '.join(
        ps.stem(token)
        for token in letters_only.lower().split()
        if token not in _STOP_WORDS
    )


# Clean every row instead of a hard-coded 113, so the cleaned corpus always
# has the same number of entries as the label column of the same CSV.
processed_review = [_clean_review(text) for text in dataset['Reviews']]
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder

# Bag-of-words features: one column per vocabulary term, capped at 7000 terms.
cv = CountVectorizer(max_features=7000)
# fit_transform returns a sparse matrix; convert it to a dense array because
# the rest of the script works on X as a plain array.
X = cv.fit_transform(processed_review).toarray()

# Encode the 'Label' column as integer class ids.
lab = LabelEncoder()
y = lab.fit_transform(dataset['Label'].values)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
# .tolist() keeps the printed output a plain Python list, as before.
try:
    feature_names = cv.get_feature_names_out().tolist()
except AttributeError:
    feature_names = cv.get_feature_names()
print(feature_names)
from sklearn.metrics import classification_report,precision_score,recall_score,f1_score,confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

# Hold out 20% of the rows for testing. No random_state is passed, so the
# split -- and therefore every score printed below -- changes from run to run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Fit a Gaussian naive Bayes classifier on the training portion.
n_b = GaussianNB()
n_b.fit(X_train, y_train)

# Print the classifier's score on the training rows, the held-out rows and
# the full data set, in that order.
for features, labels in ((X_train, y_train), (X_test, y_test), (X, y)):
    print(n_b.score(features, labels))

y_pred = n_b.predict(X_test)

# Confusion matrix for the held-out rows (stored in cm, not printed).
cm = confusion_matrix(y_test, y_pred)

# Micro-averaged precision, recall and F1 (one per line), followed by the
# per-class classification report.
for metric in (precision_score, recall_score, f1_score):
    print(metric(y_test, y_pred, average='micro'))
print(classification_report(y_test, y_pred))