-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmovie_reviews2.py
92 lines (72 loc) · 3.54 KB
/
movie_reviews2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
import math
import string
from collections import Counter

import numpy as np
import pandas as pd
import tensorflow as tf
# Fetch the IMDB reviews through Keras, restricted to the 10,000 most
# frequent words (num_words=10000).
(train_data, train_labels), (test_data, test_labels) = tf.keras.datasets.imdb.load_data(
    num_words=10000
)

# Build the word -> id table. Every id is shifted up by 3 so that ids
# 0, 1 and 2 stay free for the special tokens registered right after.
word_index = {
    word: idx + 3
    for word, idx in tf.keras.datasets.imdb.get_word_index().items()
}
word_index.update({
    '<PAD>': 0,    # Padding
    '<START>': 1,  # Start token
    '<UNK>': 2,    # Unknown token
})

# Inverse table (id -> word) used when decoding reviews back to text.
reverse_word_index = dict(zip(word_index.values(), word_index.keys()))
# Function to decode reviews
def decode_review(encoded_review):
    """Convert a sequence of word ids back into a space-separated string.

    Each id is looked up in the module-level ``reverse_word_index``; ids
    with no entry there are rendered as ``'?'``.
    """
    tokens = (reverse_word_index.get(word_id, '?') for word_id in encoded_review)
    return ' '.join(tokens)
# Preprocess the text data
# Translation table that deletes every ASCII punctuation character; built
# once so each call is a single C-level pass over the text.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def preprocess_text(text):
    """Strip ASCII punctuation from *text* and split it into words.

    Args:
        text: The raw review string.

    Returns:
        A list of whitespace-separated tokens with every character from
        ``string.punctuation`` removed. Case is left untouched. An empty or
        punctuation-only string yields an empty list.
    """
    # Punctuation is deleted (not replaced by a space), so "it's" -> "its".
    return text.translate(_PUNCTUATION_TABLE).split()
# Decode every encoded review back to text, then strip punctuation and
# tokenize it, leaving one list of words per review.
train_reviews = [preprocess_text(decode_review(encoded)) for encoded in train_data]
test_reviews = [preprocess_text(decode_review(encoded)) for encoded in test_data]

# Pair the token lists with their sentiment labels in DataFrames.
train_df = pd.DataFrame({'review': train_reviews, 'sentiment': train_labels})
test_df = pd.DataFrame({'review': test_reviews, 'sentiment': test_labels})
# Build a simple Naive Bayes classifier
class NaiveBayesClassifier:
    """Multinomial Naive Bayes classifier over tokenized reviews.

    Likelihoods use additive (Laplace) smoothing and prediction is done in
    log space, so long reviews do not underflow to zero and a word that was
    never seen under one class counts against that class instead of being
    silently ignored.

    Attributes:
        alpha: Additive smoothing constant (must be positive).
        class_probabilities: Maps each sentiment label to its prior.
        word_probabilities: Maps each label to ``{word: P(word | label)}``
            (smoothed) for the words observed under that label.
        unseen_word_probabilities: Maps each label to the smoothed
            probability given to a vocabulary word that was never observed
            under that label.
        vocabulary: Set of every word seen during training, in any class.
    """

    def __init__(self, alpha=1.0):
        """Create an untrained classifier.

        Args:
            alpha: Additive smoothing constant; 1.0 is Laplace smoothing.

        Raises:
            ValueError: If ``alpha`` is not positive.
        """
        if alpha <= 0:
            raise ValueError("alpha must be positive")
        self.alpha = alpha
        self.class_probabilities = {}
        self.word_probabilities = {}
        self.unseen_word_probabilities = {}
        self.vocabulary = set()

    def train(self, data):
        """Estimate class priors and smoothed word likelihoods from *data*.

        Any state from a previous call is discarded first.

        Args:
            data: Table with a ``'review'`` column (each entry an iterable
                of word tokens) and a parallel ``'sentiment'`` column of
                hashable labels, e.g. a pandas DataFrame or a dict of
                equal-length lists.
        """
        doc_counts = Counter()
        word_counts = {}
        for review, sentiment in zip(data['review'], data['sentiment']):
            doc_counts[sentiment] += 1
            word_counts.setdefault(sentiment, Counter()).update(review)

        self.class_probabilities = {}
        self.word_probabilities = {}
        self.unseen_word_probabilities = {}
        self.vocabulary = set().union(*word_counts.values())

        total_docs = sum(doc_counts.values())
        vocabulary_size = len(self.vocabulary)
        for sentiment, doc_count in doc_counts.items():
            counts = word_counts[sentiment]
            self.class_probabilities[sentiment] = doc_count / total_docs

            # Smoothed estimate: (count + alpha) / (total + alpha * |V|).
            denominator = sum(counts.values()) + self.alpha * vocabulary_size
            if not denominator:
                # Only possible when no review contained any word at all;
                # predict() then has no vocabulary word to look up.
                self.word_probabilities[sentiment] = {}
                self.unseen_word_probabilities[sentiment] = 0.0
                continue
            self.word_probabilities[sentiment] = {
                word: (count + self.alpha) / denominator
                for word, count in counts.items()
            }
            self.unseen_word_probabilities[sentiment] = self.alpha / denominator

    def predict(self, review):
        """Return the most probable sentiment label for *review*.

        Args:
            review: Iterable of word tokens.

        Returns:
            The label with the highest log posterior score. Ties go to the
            label that appeared first in the training data.

        Raises:
            ValueError: If the classifier has not been trained.
        """
        if not self.class_probabilities:
            raise ValueError("classifier has not been trained")

        # Words never seen in training carry no evidence for either class,
        # so only vocabulary words contribute to the score.
        known_counts = Counter(
            word for word in review if word in self.vocabulary
        )

        scores = {}
        for sentiment, prior in self.class_probabilities.items():
            likelihoods = self.word_probabilities[sentiment]
            unseen = self.unseen_word_probabilities[sentiment]
            # Sum logs rather than multiplying probabilities: a product over
            # hundreds of words underflows to 0.0 for every class.
            score = math.log(prior)
            for word, occurrences in known_counts.items():
                score += occurrences * math.log(likelihoods.get(word, unseen))
            scores[sentiment] = score
        return max(scores, key=scores.get)
# Train the Naive Bayes classifier
classifier = NaiveBayesClassifier()
classifier.train(train_df)

# Make predictions
predictions = [classifier.predict(review) for review in test_df['review']]

# Evaluate the model. Predictions are paired with the label column directly;
# a per-row test_df.iloc[i] lookup builds a new Series for every review.
correct_predictions = sum(
    predicted == actual
    for predicted, actual in zip(predictions, test_df['sentiment'])
)
accuracy = correct_predictions / len(test_df)
print(f'Accuracy: {accuracy:.4f}')

# Predict sentiment for a new review
new_review = input("Enter movie review: ")
# NOTE(review): the IMDB vocabulary appears to be all lower-case, so the
# typed text is lower-cased first; otherwise capitalised words ("Great")
# would never match a training token. Confirm against the word index.
new_review = preprocess_text(new_review.lower())
prediction = classifier.predict(new_review)

# Map predictions to human-readable labels
sentiment_label = "Positive" if prediction == 1 else "Negative"
print(f'Predicted sentiment for the new review: {sentiment_label}')