-
Notifications
You must be signed in to change notification settings - Fork 0
/
model.py
35 lines (30 loc) · 1.58 KB
/
model.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
import math
import re

from nltk.tokenize import word_tokenize
# Function to preprocess and tokenize a given message
def clean_message(message):
    """Normalize a raw message and split it into tokens.

    The text is lowercased, every character that is not an ASCII letter,
    a digit, or whitespace is removed, and the remainder is handed to
    NLTK's ``word_tokenize``.

    Args:
        message: The raw message text (a ``str``).

    Returns:
        The tokens produced by ``word_tokenize`` for the cleaned text.
    """
    lowered = message.lower()
    # Strip punctuation and symbols so only word characters and spacing
    # reach the tokenizer.
    alphanumeric_only = re.sub(r'[^a-zA-Z0-9\s]', '', lowered)
    tokens = word_tokenize(alphanumeric_only)
    return tokens
# Calculate the smoothed probability of a word in a given category
def calculate_word_probability(word, word_count, total_words, vocabulary):
    """Return the Laplace-smoothed probability of ``word`` in one category.

    Computed as ``(count(word) + 1) / (len(total_words) + len(vocabulary))``.

    Args:
        word: The token to score.
        word_count: Mapping from token to its number of occurrences in the
            category. Tokens missing from the mapping count as zero.
        total_words: Sized collection of the category's tokens; only its
            length is used.
        vocabulary: Sized collection of the distinct known tokens; only
            its length is used.

    Returns:
        The smoothed probability as a float.

    Raises:
        ZeroDivisionError: If ``total_words`` and ``vocabulary`` are both
            empty.
    """
    laplace_smoothing = 1
    # Use .get so a word never seen in this category counts as 0 instead of
    # raising KeyError on a plain dict (or inserting a key in a defaultdict).
    # Unseen words are exactly the case the smoothing term is there for.
    word_occurrences = word_count.get(word, 0)
    return (word_occurrences + laplace_smoothing) / (len(total_words) + len(vocabulary))
# Function to classify messages in the test dataset as ham or spam
def classify_messages(test_data, ham_word_count, spam_word_count, ham_words, spam_words, vocabulary):
    """Label each message in ``test_data`` as ``'spam'`` or ``'ham'``.

    Each message is tokenized with ``clean_message`` and scored per class as
    the class prior combined with the smoothed probability of every token
    (via ``calculate_word_probability``). Scores are accumulated as sums of
    logarithms rather than as a running product: multiplying many small
    probabilities underflows to 0.0 on long messages, which would make both
    scores tie and every such message fall through to ``'ham'``.

    Args:
        test_data: pandas DataFrame with a ``"message"`` column. It is
            modified in place: a ``predicted_label`` column is inserted at
            position 0.
        ham_word_count: Mapping from token to occurrence count in ham.
        spam_word_count: Mapping from token to occurrence count in spam.
        ham_words: Sized collection of ham tokens.
        spam_words: Sized collection of spam tokens.
        vocabulary: Sized collection of distinct known tokens.

    Returns:
        None. The predictions are written into ``test_data``.

    Raises:
        ZeroDivisionError: If ``ham_words`` and ``spam_words`` are both
            empty.
        ValueError: Raised by ``DataFrame.insert`` when ``test_data``
            already has a ``predicted_label`` column.
    """
    # NOTE(review): the priors are taken from the sizes of the word
    # collections, presumably token counts rather than message counts;
    # confirm this is the intended prior.
    total_word_count = len(spam_words) + len(ham_words)
    p_spam = len(spam_words) / total_word_count
    p_ham = len(ham_words) / total_word_count

    # A class with no training words has prior 0; log(0) is undefined, so
    # map it to -inf, which can never win the comparison below.
    log_prior_spam = math.log(p_spam) if p_spam > 0 else -math.inf
    log_prior_ham = math.log(p_ham) if p_ham > 0 else -math.inf

    message_prediction = []
    for message in test_data["message"]:
        log_score_spam = log_prior_spam
        log_score_ham = log_prior_ham
        for word in clean_message(message):
            # Adding logs is the underflow-safe equivalent of multiplying
            # the word probabilities.
            log_score_spam += math.log(
                calculate_word_probability(word, spam_word_count, spam_words, vocabulary)
            )
            log_score_ham += math.log(
                calculate_word_probability(word, ham_word_count, ham_words, vocabulary)
            )
        # Ties go to 'ham', as with a strict '>' on the raw products.
        message_prediction.append('spam' if log_score_spam > log_score_ham else 'ham')

    test_data.insert(0, 'predicted_label', message_prediction)