email-spam-classifier
May 4, 2024
[1]: import pandas as pd
import numpy as np
import [Link] as plt
import seaborn as sns
%matplotlib inline
import string
from [Link] import stopwords
import os
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from PIL import Image
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from [Link] import classification_report, confusion_matrix
from sklearn.naive_bayes import MultinomialNB
from [Link] import roc_curve, auc
from sklearn import metrics
from sklearn import model_selection
from sklearn import svm
from nltk import word_tokenize
from [Link] import roc_auc_score
from matplotlib import pyplot
from [Link] import plot_confusion_matrix
In C:\ProgramData\Anaconda3\lib\site-packages\matplotlib\mpl-
data\stylelib\_classic_test.mplstyle:
The [Link] rcparam was deprecated in Matplotlib 3.3 and will be
removed two minor releases later.
In C:\ProgramData\Anaconda3\lib\site-packages\matplotlib\mpl-
data\stylelib\_classic_test.mplstyle:
The mathtext.fallback_to_cm rcparam was deprecated in Matplotlib 3.3 and will be
removed two minor releases later.
In C:\ProgramData\Anaconda3\lib\site-packages\matplotlib\mpl-
data\stylelib\_classic_test.mplstyle: Support for setting the
'mathtext.fallback_to_cm' rcParam is deprecated since 3.3 and will be removed
two minor releases later; use '[Link] : 'cm' instead.
In C:\ProgramData\Anaconda3\lib\site-packages\matplotlib\mpl-
data\stylelib\_classic_test.mplstyle:
The validate_bool_maybe_none function was deprecated in Matplotlib 3.3 and will
1
be removed two minor releases later.
In C:\ProgramData\Anaconda3\lib\site-packages\matplotlib\mpl-
data\stylelib\_classic_test.mplstyle:
The savefig.jpeg_quality rcparam was deprecated in Matplotlib 3.3 and will be
removed two minor releases later.
In C:\ProgramData\Anaconda3\lib\site-packages\matplotlib\mpl-
data\stylelib\_classic_test.mplstyle:
The keymap.all_axes rcparam was deprecated in Matplotlib 3.3 and will be removed
two minor releases later.
In C:\ProgramData\Anaconda3\lib\site-packages\matplotlib\mpl-
data\stylelib\_classic_test.mplstyle:
The animation.avconv_path rcparam was deprecated in Matplotlib 3.3 and will be
removed two minor releases later.
In C:\ProgramData\Anaconda3\lib\site-packages\matplotlib\mpl-
data\stylelib\_classic_test.mplstyle:
The animation.avconv_args rcparam was deprecated in Matplotlib 3.3 and will be
removed two minor releases later.
[2]: #Parent Class for Data
class data_read_write(object):
    """Hold a pandas DataFrame loaded from a CSV file and write it back out.

    Python keeps only the last ``__init__`` defined in a class body, so the
    original zero-argument initializer was dead code. A single initializer
    with an optional argument now covers both forms.
    """

    def __init__(self, file_link=None):
        """Create the holder, reading ``file_link`` immediately when given.

        Args:
            file_link: Path or URL passed to ``pandas.read_csv``. When
                omitted, nothing is read and ``data_frame`` is ``None``
                until ``read_csv_file`` is called.
        """
        self.data_frame = None if file_link is None else pd.read_csv(file_link)

    def read_csv_file(self, file_link):
        """Return the held DataFrame, reading ``file_link`` only if none is held.

        The frame loaded by ``__init__`` is returned as the same object (not
        a copy and not re-read), so columns a caller adds to the returned
        frame are visible through ``self.data_frame`` and end up in
        ``write_to_csvfile``.

        Args:
            file_link: Path or URL passed to ``pandas.read_csv``. Used only
                when no frame has been loaded yet.

        Returns:
            The DataFrame stored on this instance.
        """
        # ``getattr`` with a default: subclasses in this file override
        # ``__init__`` without calling the parent, so the attribute may be
        # missing entirely rather than just ``None``.
        if getattr(self, "data_frame", None) is None:
            self.data_frame = pd.read_csv(file_link)
        return self.data_frame

    def write_to_csvfile(self, file_link):
        """Write the held DataFrame to ``file_link`` as UTF-8 CSV.

        The header row is written and the index is omitted.

        Args:
            file_link: Destination path passed to ``DataFrame.to_csv``.
        """
        self.data_frame.to_csv(
            file_link, encoding='utf-8', index=False, header=True
        )
[3]: #Child Class for Data_read_write
class generate_word_cloud(data_read_write):
    """Render word-cloud images from a column of text."""

    def __init__(self):
        # Deliberately does not call the parent initializer: no CSV is read
        # when a word-cloud generator is created.
        pass

    def variance_column(self, data):
        """Return the sample variance of ``data``.

        The original called a bare ``variance`` that is never imported in
        this file, so every call raised ``NameError``. The standard-library
        ``statistics.variance`` (n - 1 denominator) is used here.
        NOTE(review): confirm sample rather than population variance was
        intended.

        Args:
            data: Iterable of at least two numbers.

        Raises:
            statistics.StatisticsError: If ``data`` has fewer than two values.
        """
        from statistics import variance

        return variance(data)

    def word_cloud(self, data_frame_column, output_image_file):
        """Draw a word cloud for the given texts, show it, and save it.

        This method is specific to this class; it does not override anything
        on the parent.

        Args:
            data_frame_column: Iterable of strings; they are joined with
                single spaces into one corpus.
            output_image_file: Path the rendered image is written to.
        """
        text = " ".join(review for review in data_frame_column)
        # Named ``excluded_words`` so the local does not shadow the
        # ``stopwords`` module imported at the top of the file.
        excluded_words = set(STOPWORDS)
        # "subject" is added to the excluded words on top of the wordcloud defaults.
        excluded_words.update(["subject"])
        wordcloud = WordCloud(
            width=1200,
            height=800,
            stopwords=excluded_words,
            max_font_size=50,
            margin=0,
            background_color="white",
        ).generate(text)
        plt.imshow(wordcloud, interpolation='bilinear')
        plt.axis("off")
        plt.show()
        wordcloud.to_file(output_image_file)
        return
[4]: #Child Class for Data_read_write
class data_cleaning(data_read_write):
    """Strip punctuation and English stop words from message text."""

    # Built once; set membership replaces a scan of ``string.punctuation``
    # for every character.
    _PUNCTUATION = frozenset(string.punctuation)

    def __init__(self):
        # Deliberately does not call the parent initializer: the cleaner
        # works on columns handed to it and loads no CSV itself.
        pass

    def _english_stop_words(self):
        """Return the NLTK English stop words as a set, built once per instance."""
        cached = getattr(self, "_stop_word_cache", None)
        if cached is None:
            cached = set(stopwords.words('english'))
            self._stop_word_cache = cached
        return cached

    def message_cleaning(self, message):
        """Return ``message`` without punctuation and English stop words.

        Punctuation characters are deleted (not replaced by spaces), the
        result is split on whitespace, and words whose lower-cased form is a
        stop word are dropped. Kept words retain their original case.

        The original called ``stopwords.words('english')`` inside the
        comprehension, rebuilding and linearly scanning the list for every
        word; the lookup is now one cached set.

        Args:
            message: Text to clean.

        Returns:
            The surviving words joined by single spaces.
        """
        stop_word_set = self._english_stop_words()
        punctuation = self._PUNCTUATION
        without_punctuation = ''.join(
            char for char in message if char not in punctuation
        )
        kept_words = [
            word
            for word in without_punctuation.split()
            if word.lower() not in stop_word_set
        ]
        return ' '.join(kept_words)

    def apply_to_column(self, data_column_text):
        """Apply ``message_cleaning`` to every element via ``.apply``.

        Args:
            data_column_text: Object with an ``apply`` method (a pandas
                Series in this notebook).

        Returns:
            Whatever ``data_column_text.apply`` returns.
        """
        return data_column_text.apply(self.message_cleaning)
[5]: #Child Class for Data_read_write
class apply_embeddding_and_model(data_read_write):
    """Vectorize text with bag-of-words counts and train/evaluate classifiers.

    The Naive Bayes and SVM paths previously carried two copies of the same
    reporting and plotting code; both now share ``_evaluate_classifier``.
    """

    def __init__(self):
        # Deliberately does not call the parent initializer: no CSV is read
        # when the modelling helper is created.
        pass

    def apply_count_vector(self, v_data_column):
        """Fit a ``CountVectorizer`` on the texts and return the count matrix.

        Terms appearing in fewer than two documents are dropped
        (``min_df=2``). The fitted vectorizer is kept on ``self.vectorizer``
        so the same vocabulary can later be applied to unseen text; the
        original discarded it.

        Args:
            v_data_column: Iterable of documents (strings).

        Returns:
            The document-term matrix from ``fit_transform``.
        """
        vectorizer = CountVectorizer(
            min_df=2,
            analyzer="word",
            tokenizer=None,
            preprocessor=None,
            stop_words=None,
        )
        self.vectorizer = vectorizer
        return vectorizer.fit_transform(v_data_column)

    def apply_naive_bayes(self, X, y, test_size=0.2, random_state=None):
        """Train a multinomial Naive Bayes model and report held-out metrics.

        Args:
            X: Feature matrix.
            y: Binary target labels.
            test_size: Fraction held out for testing (previously hard-coded
                to 0.2).
            random_state: Seed forwarded to ``train_test_split``; ``None``
                keeps the original unseeded behaviour.
        """
        # Split before training so evaluation uses unseen rows.
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=random_state
        )
        NB_classifier = MultinomialNB()
        NB_classifier.fit(X_train, y_train)
        self._evaluate_classifier(NB_classifier, X_test, y_test, 'Naive Bayes')
        return

    def apply_svm(self, X, y, test_size=0.2, random_state=None):
        """Train a linear-kernel SVM and report held-out metrics.

        Args:
            X: Feature matrix.
            y: Binary target labels.
            test_size: Fraction held out for testing (previously hard-coded
                to 0.2).
            random_state: Seed forwarded to ``train_test_split``; ``None``
                keeps the original unseeded behaviour.
        """
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=random_state
        )
        # Other kernels noted by the original author: 'poly', 'rbf'.
        # NOTE(review): gamma is presumably ignored by the linear kernel;
        # kept so the constructor call matches the original.
        params = {'kernel': 'linear', 'C': 2, 'gamma': 1}
        # probability=True is required for predict_proba in the ROC step.
        svm_cv = svm.SVC(
            C=params['C'],
            kernel=params['kernel'],
            gamma=params['gamma'],
            probability=True,
        )
        svm_cv.fit(X_train, y_train)
        self._evaluate_classifier(svm_cv, X_test, y_test, 'SVM')
        return

    def _evaluate_classifier(self, classifier, X_test, y_test, model_label):
        """Print test metrics and draw confusion matrices and the ROC curve.

        Args:
            classifier: Fitted estimator exposing ``predict`` and
                ``predict_proba``.
            X_test: Held-out feature matrix.
            y_test: Held-out labels.
            model_label: Name used in the printed AUC line and ROC legend.
        """
        y_predict_test = classifier.predict(X_test)

        print(classification_report(y_test, y_predict_test))
        print("test set")
        print("\nAccuracy Score: " + str(metrics.accuracy_score(y_test, y_predict_test)))
        print("F1 Score: " + str(metrics.f1_score(y_test, y_predict_test)))
        print("Recall: " + str(metrics.recall_score(y_test, y_predict_test)))
        print("Precision: " + str(metrics.precision_score(y_test, y_predict_test)))

        class_names = ['ham', 'spam']
        titles_options = [
            ("Confusion matrix, without normalization", None),
            ("Normalized confusion matrix", 'true'),
        ]
        # NOTE(review): plot_confusion_matrix was deprecated in scikit-learn
        # 1.0 and removed in 1.2; on newer versions this call and the
        # file-level import need ConfusionMatrixDisplay.from_estimator.
        for title, normalize in titles_options:
            disp = plot_confusion_matrix(
                classifier,
                X_test,
                y_test,
                display_labels=class_names,
                cmap=plt.cm.Blues,
                normalize=normalize,
            )
            disp.ax_.set_title(title)
            print(title)
            print(disp.confusion_matrix)
        plt.show()

        # Baseline: a constant score of 0 for every sample ("no skill").
        ns_probs = [0 for _ in range(len(y_test))]
        # Keep only the probability of the positive class (column 1).
        lr_probs = classifier.predict_proba(X_test)[:, 1]

        ns_auc = roc_auc_score(y_test, ns_probs)
        lr_auc = roc_auc_score(y_test, lr_probs)
        print('No Skill: ROC AUC=%.3f' % (ns_auc))
        print('%s: ROC AUC=%.3f' % (model_label, lr_auc))

        ns_fpr, ns_tpr, _ = roc_curve(y_test, ns_probs)
        lr_fpr, lr_tpr, _ = roc_curve(y_test, lr_probs)
        pyplot.plot(ns_fpr, ns_tpr, linestyle='--', label='No Skill')
        pyplot.plot(lr_fpr, lr_tpr, marker='.', label=model_label)
        pyplot.xlabel('False Positive Rate')
        pyplot.ylabel('True Positive Rate')
        pyplot.legend()
        pyplot.show()
[6]: data_obj = data_read_write("[Link]")
[7]: data_frame = data_obj.read_csv_file("[Link]")
data_frame.head()
data_frame.tail()
data_frame.describe()
data_frame.info()
<class '[Link]'>
RangeIndex: 5728 entries, 0 to 5727
Data columns (total 2 columns):
text 5728 non-null object
spam 5728 non-null int64
dtypes: int64(1), object(1)
memory usage: 89.6+ KB
[8]: data_frame.head()
[8]: text spam
0 Subject: naturally irresistible your corporate… 1
1 Subject: the stock trading gunslinger fanny i… 1
2 Subject: unbelievable new homes made easy im … 1
3 Subject: 4 color printing special request add… 1
4 Subject: do not have money , get software cds … 1
[9]: #Visualize dataset
# Let's see which message is the most popular ham/spam message
6
data_frame.groupby('spam').describe()
[9]: text
count unique top freq
spam
0 4360 4327 Subject: tiger evals - attachment tiger hosts… 2
1 1368 1368 Subject: localized software , all languages av… 1
[10]: # Let's get the length of the messages
data_frame['length'] = data_frame['text'].apply(len)
data_frame['length'].max()
[10]: 43952
# Cell [11]
# Overlaid histograms of character length per class; ham e-mails tend to be
# longer than spam e-mails.
sns.set(rc={'figure.figsize': (11.7, 8.27)})
ham_messages_length = data_frame.loc[data_frame['spam'] == 0]
spam_messages_length = data_frame.loc[data_frame['spam'] == 1]
# Ham is drawn first, then spam, on the same axes.
for class_rows, legend_label in (
    (ham_messages_length, 'Ham'),
    (spam_messages_length, 'Spam'),
):
    class_rows['length'].plot(bins=100, kind='hist', label=legend_label)
plt.title('Distribution of Length of Email Text')
plt.xlabel('Length of Email Text')
plt.legend()
[11]: <[Link] at 0x2e158719f88>
7
# Cell [12]
# Token counts per message for each class, then the largest count in each.
# The original first evaluated the ham filter as a bare expression and threw
# the result away; that dead statement is removed.
ham_texts = data_frame[data_frame['spam'] == 0].text.values
spam_texts = data_frame[data_frame['spam'] == 1].text.values
ham_words_length = [len(word_tokenize(title)) for title in ham_texts]
spam_words_length = [len(word_tokenize(title)) for title in spam_texts]
print(max(ham_words_length))
print(max(spam_words_length))
8479
6131
# Cell [13]
# Spam e-mails show a spike at low word counts even though spam makes up only
# about 24 percent of the dataset; spam appears to use fewer words than ham.
sns.set(rc={'figure.figsize': (11.7, 8.27)})
# Ham is drawn first, then spam; ``ax`` ends up bound to the last call's axes.
for word_counts, legend_label in (
    (ham_words_length, 'Ham'),
    (spam_words_length, 'Spam'),
):
    ax = sns.distplot(word_counts, norm_hist=True, bins=30, label=legend_label)
plt.title('Distribution of Number of Words')
plt.xlabel('Number of Words')
plt.legend()
plt.show()
# Cell [14]
def mean_word_length(x):
    """Return the mean length, in characters, of the tokens in ``x``.

    Tokens come from ``word_tokenize``. The original grew a NumPy array with
    ``np.append`` once per token, which copies the whole array each time
    (quadratic in the token count); lengths are now collected in one pass.

    Args:
        x: Text to tokenize.

    Returns:
        The mean token length, or ``nan`` when ``x`` yields no tokens (the
        same value as before, without NumPy's empty-slice RuntimeWarning).
    """
    token_lengths = [len(word) for word in word_tokenize(x)]
    if not token_lengths:
        return np.nan
    return np.mean(token_lengths)
# Mean token length per message, computed separately for ham and spam.
ham_rows = data_frame[data_frame['spam'] == 0]
spam_rows = data_frame[data_frame['spam'] == 1]
ham_meanword_length = ham_rows.text.apply(mean_word_length)
spam_meanword_length = spam_rows.text.apply(mean_word_length)
# Ham is drawn first, then spam, on the same axes.
for mean_lengths, legend_label in (
    (ham_meanword_length, 'Ham'),
    (spam_meanword_length, 'Spam'),
):
    sns.distplot(mean_lengths, norm_hist=True, bins=30, label=legend_label)
plt.title('Distribution of Mean Word Length')
plt.xlabel('Mean Word Length')
plt.legend()
plt.show()
# There is no significant difference in the length of the words used by ham
# and spam e-mails.
[15]: # Checking the ratio of stop words.
# Both spam and ham e-mails contain stop words.
# Every spam e-mail contains stop words; the mean stop-word ratio is 0.281.
# Every ham e-mail contains stop words; the mean stop-word ratio is 0.278.
# The means are close, but the graph shows that spam e-mails reach higher
# stop-word ratios than ham e-mails.
from [Link] import stopwords
stop_words = set([Link]('english'))
def stop_words_ratio(x):
    """Return the fraction of tokens in ``x`` that are stop words.

    Tokens come from ``word_tokenize`` and are compared as-is (no case
    folding) against the module-level ``stop_words`` set.

    Args:
        x: Text to tokenize.

    Returns:
        Stop-word count divided by total token count, or ``0.0`` when ``x``
        yields no tokens (the original raised ``ZeroDivisionError`` there).
    """
    tokens = word_tokenize(x)
    if not tokens:
        return 0.0
    num_stop_words = sum(1 for word in tokens if word in stop_words)
    return num_stop_words / len(tokens)
# Stop-word ratio per message, computed separately for ham and spam.
ham_text = data_frame[data_frame['spam'] == 0].text
spam_text = data_frame[data_frame['spam'] == 1].text
ham_stopwords = ham_text.apply(stop_words_ratio)
spam_stopwords = spam_text.apply(stop_words_ratio)
# Only the ham call passes norm_hist explicitly, as in the original.
sns.distplot(ham_stopwords, norm_hist=True, label='Ham')
sns.distplot(spam_stopwords, label='Spam')
ham_mean_ratio = ham_stopwords.values.mean()
spam_mean_ratio = spam_stopwords.values.mean()
print('Ham Mean: {:.3f}'.format(ham_mean_ratio))
print('Spam Mean: {:.3f}'.format(spam_mean_ratio))
plt.title('Distribution of Stop-word Ratio')
plt.xlabel('Stop Word Ratio')
plt.legend()
Ham Mean: 0.278
Spam Mean: 0.281
[15]: <[Link] at 0x2e15f4d2848>
11
[16]: spam_stopwords
[16]: 0 0.230769
1 0.277778
2 0.397727
3 0.191919
4 0.396226
…
1363 0.342105
1364 0.365854
1365 0.437500
1366 0.446809
1367 0.320024
Name: text, Length: 1368, dtype: float64
# Cell [33]
# Split the messages into ham and spam subsets. Both subsets are taken
# before the display column is added below, so they do not contain it.
ham = data_frame[data_frame['spam'] == 0]
spam = data_frame[data_frame['spam'] == 1]
# Spam is drawn first, then ham, on the same axes.
for class_rows in (spam, ham):
    class_rows['length'].plot(bins=60, kind='hist')
# Copy of the label column under a self-describing name for the count plot.
data_frame['Ham(0) and Spam(1)'] = data_frame['spam']
total_messages = len(data_frame)
print( 'Spam percentage =', (len(spam) / total_messages )*100,"%")
print( 'Ham percentage =', (len(ham) / total_messages )*100,"%")
sns.countplot(data_frame['Ham(0) and Spam(1)'], label = "Count")
Spam percentage = 23.88268156424581 %
Ham percentage = 76.11731843575419 %
[33]: <AxesSubplot:xlabel='Ham(0) and Spam(1)', ylabel='count'>
[18]: word_cloud_obj = generate_word_cloud()
word_cloud_obj.word_cloud(ham["text"], "ham_word_cloud.png")
word_cloud_obj.word_cloud(spam["text"], "spam_word_cloud.png")
13
14
[19]: data_clean_obj = data_cleaning()
# Let's test the newly added function
#data_frame['clean_text'] = data_frame['text'].apply(message_cleaning)
#data_frame['clean_text'] = data_frame['text'].apply(data_clean_obj.
↪message_cleaning)
data_frame['clean_text'] = data_clean_obj.apply_to_column(data_frame['text'])
[20]: data_frame.head()
[20]: text spam length \
0 Subject: naturally irresistible your corporate… 1 1484
1 Subject: the stock trading gunslinger fanny i… 1 598
2 Subject: unbelievable new homes made easy im … 1 448
3 Subject: 4 color printing special request add… 1 500
4 Subject: do not have money , get software cds … 1 235
clean_text
0 Subject naturally irresistible corporate ident…
1 Subject stock trading gunslinger fanny merrill…
2 Subject unbelievable new homes made easy im wa…
3 Subject 4 color printing special request addit…
4 Subject money get software cds software compat…
[21]: data_obj.data_frame.head()
[21]: text spam length \
0 Subject: naturally irresistible your corporate… 1 1484
1 Subject: the stock trading gunslinger fanny i… 1 598
2 Subject: unbelievable new homes made easy im … 1 448
3 Subject: 4 color printing special request add… 1 500
4 Subject: do not have money , get software cds … 1 235
clean_text
0 Subject naturally irresistible corporate ident…
1 Subject stock trading gunslinger fanny merrill…
2 Subject unbelievable new homes made easy im wa…
3 Subject 4 color printing special request addit…
4 Subject money get software cds software compat…
[22]: data_obj.write_to_csvfile("processed_file.csv")
[23]: #APPLY COUNT VECTORIZER TO OUR MESSAGES LIST
# Define the cleaning pipeline we defined earlier
#vectorizer = CountVectorizer()
cv_object = apply_embeddding_and_model()
spamham_countvectorizer = cv_object.apply_count_vector(data_frame['clean_text'])
15
[24]: #Separating Descriptive and Target Feature
X = spamham_countvectorizer
label = data_frame['spam'].values
y = label
[25]: cv_object.apply_naive_bayes(X,y)
precision recall f1-score support
0 1.00 0.99 0.99 901
1 0.98 0.99 0.98 245
accuracy 0.99 1146
macro avg 0.99 0.99 0.99 1146
weighted avg 0.99 0.99 0.99 1146
test set
Accuracy Score: 0.9921465968586387
F1 Score: 0.9817444219066936
Recall: 0.9877551020408163
Precision: 0.9758064516129032
Confusion matrix, without normalization
[[895 6]
[ 3 242]]
Normalized confusion matrix
[[0.99334073 0.00665927]
[0.0122449 0.9877551 ]]
16
17
No Skill: ROC AUC=0.500
Naive Bayes: ROC AUC=0.998
18
[26]: cv_object.apply_svm(X,y)
precision recall f1-score support
0 0.99 0.99 0.99 901
1 0.98 0.98 0.98 245
accuracy 0.99 1146
macro avg 0.99 0.98 0.99 1146
weighted avg 0.99 0.99 0.99 1146
test set
Accuracy Score: 0.9904013961605584
F1 Score: 0.9775051124744377
Recall: 0.9755102040816327
Precision: 0.9795081967213115
Confusion matrix, without normalization
[[896 5]
[ 6 239]]
Normalized confusion matrix
[[0.99445061 0.00554939]
19
[0.0244898 0.9755102 ]]
20
No Skill: ROC AUC=0.500
SVM: ROC AUC=0.998
21
[ ]:
[ ]:
22