movie_reviews3.py
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Flatten, Dense
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.datasets import imdb
# Set vocabulary size and maximum sequence length
vocab_size = 10000 # Only consider the top 10,000 words
max_length = 200 # Maximum length of reviews
# Load the IMDB dataset using Keras
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=vocab_size)
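# The loaded reviews are already integer-encoded: each review is a list of word
# indices, with 0, 1 and 2 reserved for padding, start-of-sequence and unknown
# words, so real word indices are shifted by 3. Optional sanity check (a small
# illustrative sketch): decode the first training review back to text.
word_index = imdb.get_word_index()
reverse_word_index = {index + 3: word for word, index in word_index.items()}
decoded_sample = ' '.join(reverse_word_index.get(i, '?') for i in X_train[0])
print(f'First review (decoded, truncated): {decoded_sample[:200]}')
print(f'First review label: {y_train[0]}')  # 1 = positive, 0 = negative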
# Function to preprocess the data by padding sequences
def preprocess_data(X):
    return pad_sequences(X, maxlen=max_length, padding='post', truncating='post')
# Preprocess the training and testing data
X_train_padded = preprocess_data(X_train)
X_test_padded = preprocess_data(X_test)
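# Optional check: after padding, every review is a fixed-length vector of
# max_length word indices, so the shapes should be (25000, 200) with the
# standard IMDB split of 25,000 training and 25,000 test reviews.
print(f'Training data shape: {X_train_padded.shape}')
print(f'Test data shape: {X_test_padded.shape}')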
# Function to train the deep learning model
def train_model(X_train, y_train):
    # Build model
    model = Sequential([
        Embedding(vocab_size, 32, input_length=max_length),  # Embedding layer
        Flatten(),                                            # Flatten layer
        Dense(1, activation='sigmoid')                        # Dense layer, sigmoid for binary classification
    ])
    # Compile model
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    # Train the model
    model.fit(X_train, y_train, epochs=5, validation_split=0.2)  # Validation split of 20%
    return model
# Train the deep learning model
model = train_model(X_train_padded, y_train)
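# Optional: print the layer structure and parameter counts of the trained model.
model.summary()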
# Evaluate the deep learning model
def evaluate_model(model, X_test, y_test):
    _, accuracy = model.evaluate(X_test, y_test)
    return accuracy
# Evaluate the model's accuracy
accuracy = evaluate_model(model, X_test_padded, y_test)
print(f'Accuracy: {accuracy}')
# Predict sentiment for a new review
def predict_sentiment(model, review):
    # The model was trained on integer-encoded reviews, so the new review must be
    # encoded with the same word index used by imdb.load_data(). Keras reserves
    # indices 0 (padding), 1 (start of sequence) and 2 (unknown), so every raw
    # index from imdb.get_word_index() is shifted by 3.
    word_index = imdb.get_word_index()
    encoded_review = [1]  # start-of-sequence token
    for word in review.lower().split():
        word = word.strip(".,!?;:()\"'")
        index = word_index.get(word, -1) + 3
        # Words missing from the index or outside the top vocab_size words map to the unknown token
        encoded_review.append(index if 2 < index < vocab_size else 2)
    # Pad/truncate the encoded review to the same length used during training
    new_review_padded = pad_sequences([encoded_review], maxlen=max_length, padding='post', truncating='post')
    # Make the prediction
    predicted_prob = model.predict(new_review_padded)[0][0]
    predicted_sentiment = 'positive' if predicted_prob >= 0.5 else 'negative'
    return predicted_sentiment
# Get user input for a new review
new_review = input("Enter movie review: ")
# Print the predicted sentiment for the new review
predicted_sentiment = predict_sentiment(model, new_review)
print(f'Predicted sentiment for the new review: {predicted_sentiment}')
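# Optional sanity check with a couple of hard-coded reviews (made-up example text,
# purely to illustrate the expected behaviour of predict_sentiment).
for sample in ["this movie was wonderful and the acting was brilliant",
               "a boring waste of time with terrible acting"]:
    print(f'"{sample}" -> {predict_sentiment(model, sample)}')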