0% found this document useful (0 votes)

67 views22 pages

Email Spam Classification System

The document discusses building models to classify emails as spam or ham. It imports various libraries and defines classes for reading data, cleaning text, vectorizing text with a count vectorizer, and building classification models using Naive Bayes and SVM. Methods are used to preprocess text, vectorize data, train and evaluate models, and generate classification metrics, confusion matrices, and ROC curves.

Uploaded by

phenomenal beast

We take content rights seriously. If you suspect this is your content, claim it here.

Available Formats

Download as PDF, TXT or read online on Scribd

0% found this document useful (0 votes)

67 views22 pages

Email Spam Classification System

Uploaded by

phenomenal beast

We take content rights seriously. If you suspect this is your content, claim it here.

Available Formats

Download as PDF, TXT or read online on Scribd

email-spam-classifier

May 4, 2024

[1]: import pandas as pd

import numpy as np
import [Link] as plt
import seaborn as sns
%matplotlib inline
import string
from [Link] import stopwords
import os
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from PIL import Image
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from [Link] import classification_report, confusion_matrix
from sklearn.naive_bayes import MultinomialNB
from [Link] import roc_curve, auc
from sklearn import metrics
from sklearn import model_selection
from sklearn import svm
from nltk import word_tokenize
from [Link] import roc_auc_score
from matplotlib import pyplot
from [Link] import plot_confusion_matrix

In C:\ProgramData\Anaconda3\lib\site-packages\matplotlib\mpl-
data\stylelib\_classic_test.mplstyle:
The text.latex.preview rcparam was deprecated in Matplotlib 3.3 and will be
removed two minor releases later.
In C:\ProgramData\Anaconda3\lib\site-packages\matplotlib\mpl-
data\stylelib\_classic_test.mplstyle:
The mathtext.fallback_to_cm rcparam was deprecated in Matplotlib 3.3 and will be
removed two minor releases later.
In C:\ProgramData\Anaconda3\lib\site-packages\matplotlib\mpl-
data\stylelib\_classic_test.mplstyle: Support for setting the
'mathtext.fallback_to_cm' rcParam is deprecated since 3.3 and will be removed
two minor releases later; use 'mathtext.fallback : 'cm' instead.
In C:\ProgramData\Anaconda3\lib\site-packages\matplotlib\mpl-
data\stylelib\_classic_test.mplstyle:
The validate_bool_maybe_none function was deprecated in Matplotlib 3.3 and will

1
be removed two minor releases later.
In C:\ProgramData\Anaconda3\lib\site-packages\matplotlib\mpl-
data\stylelib\_classic_test.mplstyle:
The savefig.jpeg_quality rcparam was deprecated in Matplotlib 3.3 and will be
removed two minor releases later.
In C:\ProgramData\Anaconda3\lib\site-packages\matplotlib\mpl-
data\stylelib\_classic_test.mplstyle:
The keymap.all_axes rcparam was deprecated in Matplotlib 3.3 and will be removed
two minor releases later.
In C:\ProgramData\Anaconda3\lib\site-packages\matplotlib\mpl-
data\stylelib\_classic_test.mplstyle:
The animation.avconv_path rcparam was deprecated in Matplotlib 3.3 and will be
removed two minor releases later.
In C:\ProgramData\Anaconda3\lib\site-packages\matplotlib\mpl-
data\stylelib\_classic_test.mplstyle:
The animation.avconv_args rcparam was deprecated in Matplotlib 3.3 and will be
removed two minor releases later.

[2]: #Parent Class for Data

class data_read_write(object):
    """Hold a pandas DataFrame loaded from a CSV file and write it back out.

    The loaded frame is stored on ``self.data_frame``. Subclasses in this
    notebook override ``__init__`` without calling this one, so the methods
    below tolerate the attribute being absent.
    """

    def __init__(self, file_link=None):
        """Load ``file_link`` with ``pd.read_csv`` when one is given.

        Python keeps only the last ``__init__`` defined in a class body, so
        the former zero-argument and one-argument initialisers are merged
        into a single one with an optional argument.

        Args:
            file_link: Path or buffer accepted by ``pd.read_csv``; ``None``
                leaves ``self.data_frame`` unset (``None``).
        """
        self.data_frame = None if file_link is None else pd.read_csv(file_link)

    def read_csv_file(self, file_link):
        """Return the held DataFrame, loading it from ``file_link`` if needed.

        A frame that was already loaded is returned as-is and ``file_link``
        is not re-read, so existing callers keep getting the same object.

        Args:
            file_link: Path or buffer accepted by ``pd.read_csv``; only used
                when no frame has been loaded yet.

        Returns:
            The DataFrame stored on ``self.data_frame``.
        """
        if getattr(self, "data_frame", None) is None:
            self.data_frame = pd.read_csv(file_link)
        return self.data_frame

    def write_to_csvfile(self, file_link):
        """Write the held DataFrame to ``file_link`` as UTF-8 CSV.

        The header row is written and the index column is omitted.
        """
        self.data_frame.to_csv(file_link, encoding='utf-8', index=False,
                               header=True)
        return

[3]: #Child Class for Data_read_write

class generate_word_cloud(data_read_write):
    """Render and save word-cloud images for a column of text."""

    def __init__(self):
        # Intentionally does not call the parent initialiser: instances are
        # created without a CSV file and only use the methods below.
        pass

    def variance_column(self, data):
        """Return the variance of ``data``.

        NOTE(review): the original called a bare ``variance`` that is not
        imported anywhere in the visible notebook. ``statistics.variance``
        (sample variance) is assumed to be the intended function; confirm.
        """
        from statistics import variance

        return variance(data)

    def word_cloud(self, data_frame_column, output_image_file):
        """Display a word cloud of the given texts and save it as an image.

        Args:
            data_frame_column: Iterable of strings; they are joined with
                single spaces into one text.
            output_image_file: Path the rendered image is written to.
        """
        text = " ".join(data_frame_column)

        # Named so it does not shadow the module-level nltk ``stopwords``.
        # "subject" is excluded in addition to the wordcloud defaults.
        cloud_stopwords = set(STOPWORDS)
        cloud_stopwords.update(["subject"])

        wordcloud = WordCloud(width=1200, height=800,
                              stopwords=cloud_stopwords,
                              max_font_size=50, margin=0,
                              background_color="white").generate(text)
        plt.imshow(wordcloud, interpolation='bilinear')
        plt.axis("off")
        plt.show()
        wordcloud.to_file(output_image_file)
        return

[4]: #Child Class for Data_read_write

class data_cleaning(data_read_write):
    """Strip punctuation and English stop words from message text."""

    def __init__(self):
        # Intentionally does not call the parent initialiser: instances are
        # created without a CSV file.
        pass

    def _english_stopwords(self):
        """Return the NLTK English stop words as a set, built once per instance."""
        cached = getattr(self, "_stopword_set", None)
        if cached is None:
            cached = set(stopwords.words('english'))
            self._stopword_set = cached
        return cached

    def message_cleaning(self, message):
        """Return ``message`` without punctuation and English stop words.

        Every character in ``string.punctuation`` is removed, the result is
        split on whitespace, and words whose lower-cased form is an NLTK
        English stop word are dropped. Kept words retain their original case
        and are joined with single spaces.

        Args:
            message: The text to clean.

        Returns:
            The cleaned text as a single string.
        """
        without_punctuation = message.translate(
            str.maketrans('', '', string.punctuation))
        # Set membership replaces rebuilding and scanning the stop-word list
        # for every single word.
        stop_set = self._english_stopwords()
        kept_words = [word for word in without_punctuation.split()
                      if word.lower() not in stop_set]
        return ' '.join(kept_words)

    def apply_to_column(self, data_column_text):
        """Apply ``message_cleaning`` to each entry via ``.apply`` and return the result."""
        data_processed = data_column_text.apply(self.message_cleaning)
        return data_processed

[5]: #Child Class for Data_read_write

class apply_embeddding_and_model(data_read_write):
    """Vectorize text with word counts and train/evaluate spam classifiers."""

    def __init__(self):
        # Intentionally does not call the parent initialiser: instances are
        # created without a CSV file.
        pass

    def apply_count_vector(self, v_data_column):
        """Fit a ``CountVectorizer`` on the texts and return the count matrix.

        Words that appear in fewer than two documents are ignored
        (``min_df=2``). The fitted vectorizer is kept on ``self.vectorizer``
        so the same vocabulary can be reused to transform further text.

        Args:
            v_data_column: Iterable of text documents.

        Returns:
            The document-term matrix returned by ``fit_transform``.
        """
        vectorizer = CountVectorizer(min_df=2, analyzer="word",
                                     tokenizer=None, preprocessor=None,
                                     stop_words=None)
        self.vectorizer = vectorizer
        return vectorizer.fit_transform(v_data_column)

    def apply_naive_bayes(self, X, y, test_size=0.2, random_state=None):
        """Train a multinomial Naive Bayes model and report test-set results.

        Args:
            X: Feature matrix.
            y: Target labels.
            test_size: Fraction of the data held out for testing.
            random_state: Seed forwarded to ``train_test_split``; the default
                ``None`` keeps the original unseeded split.
        """
        # Split before training so evaluation uses unseen rows.
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=random_state)

        NB_classifier = MultinomialNB()
        NB_classifier.fit(X_train, y_train)

        self._evaluate(NB_classifier, X_test, y_test, 'Naive Bayes')
        return

    def apply_svm(self, X, y, test_size=0.2, random_state=None):
        """Train a linear-kernel SVM and report test-set results.

        Args:
            X: Feature matrix.
            y: Target labels.
            test_size: Fraction of the data held out for testing.
            random_state: Seed forwarded to ``train_test_split``; the default
                ``None`` keeps the original unseeded split.
        """
        # Split before training so evaluation uses unseen rows.
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=random_state)

        # Other kernels that could be tried here: 'poly', 'rbf'.
        # NOTE(review): gamma is passed through unchanged; per the
        # scikit-learn docs it should have no effect with the linear kernel.
        params = {'kernel': 'linear', 'C': 2, 'gamma': 1}
        # probability=True is required for predict_proba in the ROC step.
        svm_cv = svm.SVC(C=params['C'], kernel=params['kernel'],
                         gamma=params['gamma'], probability=True)
        svm_cv.fit(X_train, y_train)

        self._evaluate(svm_cv, X_test, y_test, 'SVM')
        return

    def _evaluate(self, classifier, X_test, y_test, model_label):
        """Print metrics and plot confusion matrices and the ROC curve.

        Shared by ``apply_naive_bayes`` and ``apply_svm`` so both models
        report exactly the same set of results.

        Args:
            classifier: Fitted estimator exposing ``predict`` and
                ``predict_proba``.
            X_test: Held-out feature matrix.
            y_test: Held-out true labels.
            model_label: Name used in the printed AUC line and ROC legend.
        """
        y_predict_test = classifier.predict(X_test)

        print(classification_report(y_test, y_predict_test))
        print("test set")
        print("\nAccuracy Score: "
              + str(metrics.accuracy_score(y_test, y_predict_test)))
        print("F1 Score: " + str(metrics.f1_score(y_test, y_predict_test)))
        print("Recall: " + str(metrics.recall_score(y_test, y_predict_test)))
        print("Precision: "
              + str(metrics.precision_score(y_test, y_predict_test)))

        # Display labels assume label 0 is ham and label 1 is spam.
        class_names = ['ham', 'spam']
        titles_options = [("Confusion matrix, without normalization", None),
                          ("Normalized confusion matrix", 'true')]
        # NOTE(review): plot_confusion_matrix is what this notebook imports;
        # newer scikit-learn releases provide
        # ConfusionMatrixDisplay.from_estimator instead.
        for title, normalize in titles_options:
            disp = plot_confusion_matrix(classifier, X_test, y_test,
                                         display_labels=class_names,
                                         cmap=plt.cm.Blues,
                                         normalize=normalize)
            disp.ax_.set_title(title)
            print(title)
            print(disp.confusion_matrix)
        plt.show()

        # "No skill" baseline: a constant score of 0 for every test sample.
        ns_probs = [0] * len(y_test)
        # Keep the predicted probability of the positive class only.
        lr_probs = classifier.predict_proba(X_test)[:, 1]

        ns_auc = roc_auc_score(y_test, ns_probs)
        lr_auc = roc_auc_score(y_test, lr_probs)
        print('No Skill: ROC AUC=%.3f' % (ns_auc))
        print('%s: ROC AUC=%.3f' % (model_label, lr_auc))

        ns_fpr, ns_tpr, _ = roc_curve(y_test, ns_probs)
        lr_fpr, lr_tpr, _ = roc_curve(y_test, lr_probs)
        pyplot.plot(ns_fpr, ns_tpr, linestyle='--', label='No Skill')
        pyplot.plot(lr_fpr, lr_tpr, marker='.', label=model_label)
        pyplot.xlabel('False Positive Rate')
        pyplot.ylabel('True Positive Rate')
        pyplot.legend()
        pyplot.show()

[6]: data_obj = data_read_write("[Link]")

[7]: data_frame = data_obj.read_csv_file("[Link]")

data_frame.head()
data_frame.tail()
data_frame.describe()
data_frame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5728 entries, 0 to 5727
Data columns (total 2 columns):
text 5728 non-null object
spam 5728 non-null int64
dtypes: int64(1), object(1)
memory usage: 89.6+ KB

[8]: data_frame.head()

[8]: text spam

0 Subject: naturally irresistible your corporate… 1
1 Subject: the stock trading gunslinger fanny i… 1
2 Subject: unbelievable new homes made easy im … 1
3 Subject: 4 color printing special request add… 1
4 Subject: do not have money , get software cds … 1

[9]: #Visualize dataset

# Let's see which message is the most popular ham/spam message

6
data_frame.groupby('spam').describe()

[9]: text
count unique top freq
spam
0 4360 4327 Subject: tiger evals - attachment tiger hosts… 2
1 1368 1368 Subject: localized software , all languages av… 1

[10]: # Let's get the length of the messages

data_frame['length'] = data_frame['text'].apply(len)
data_frame['length'].max()

[10]: 43952

[11]: #data_frame['length'].plot(bins=100, kind='hist')

# Ham emails tend to be longer (in characters) than spam emails.
sns.set(rc={'figure.figsize': (11.7, 8.27)})
ham_messages_length = data_frame[data_frame['spam'] == 0]
spam_messages_length = data_frame[data_frame['spam'] == 1]

# Both histograms are drawn on the same axes so they can be compared.
ham_messages_length['length'].plot(bins=100, kind='hist', label='Ham')
spam_messages_length['length'].plot(bins=100, kind='hist', label='Spam')

plt.title('Distribution of Length of Email Text')
plt.xlabel('Length of Email Text')
plt.legend()

[11]: <matplotlib.legend.Legend at 0x2e158719f88>

7
[12]: #data_frame['spam']==0
# Number of tokens per email, computed separately for ham (0) and spam (1).
ham_words_length = [len(word_tokenize(title))
                    for title in data_frame[data_frame['spam'] == 0].text.values]
spam_words_length = [len(word_tokenize(title))
                     for title in data_frame[data_frame['spam'] == 1].text.values]

# Longest email, in tokens, for each class.
print(max(ham_words_length))
print(max(spam_words_length))

8479
6131

[13]: #There is spike in spam emails with less number of words

#Even when our dataset include 24 percent of spam emails out of total emails-
#Looks like Spam emails have less words as compared to ham emails
sns.set(rc={'figure.figsize': (11.7, 8.27)})
# Overlay the per-class word-count distributions on the same axes.
ax = sns.distplot(ham_words_length, norm_hist=True, bins=30, label='Ham')
ax = sns.distplot(spam_words_length, norm_hist=True, bins=30, label='Spam')

plt.title('Distribution of Number of Words')
plt.xlabel('Number of Words')
plt.legend()
plt.show()

# In[14]:
def mean_word_length(x):
    """Return the mean token length of the text ``x``.

    The text is tokenized with ``word_tokenize`` and the lengths of the
    resulting tokens are averaged.

    Args:
        x: The text to measure.

    Returns:
        The mean token length as a float, or ``nan`` when the text yields no
        tokens (the same value the original produced, without the NumPy
        empty-mean warning).
    """
    # One list build instead of re-allocating an array with np.append
    # for every token.
    token_lengths = [len(word) for word in word_tokenize(x)]
    if not token_lengths:
        return np.nan
    return np.mean(token_lengths)

ham_meanword_length = data_frame[data_frame['spam'] == 0].text.apply(
    mean_word_length)
spam_meanword_length = data_frame[data_frame['spam'] == 1].text.apply(
    mean_word_length)

# Overlay the per-class mean-word-length distributions on the same axes.
sns.distplot(ham_meanword_length, norm_hist=True, bins=30, label='Ham')
sns.distplot(spam_meanword_length, norm_hist=True, bins=30, label='Spam')
plt.title('Distribution of Mean Word Length')
plt.xlabel('Mean Word Length')
plt.legend()
plt.show()

# There is no significant difference in the length of the words used by
# ham and spam emails.

[15]: #Checking ratio of stop words

#Both spam and ham email contain stopwords
#All Spam emails contain stop words with a mean of 0.281
#All Ham emails contain stop words with a mean of 0.278
#But we can see from the graph, spam email contain high stop words ratio as␣
↪compared to ham emails.

from nltk.corpus import stopwords

# Built once as a set so the per-token membership test in
# stop_words_ratio is a constant-time lookup.
stop_words = set(stopwords.words('english'))

def stop_words_ratio(x):
    """Return the fraction of tokens in ``x`` that are stop words.

    The text is tokenized with ``word_tokenize`` and each token is looked up
    in the module-level ``stop_words`` set exactly as tokenized (no case
    folding).

    Args:
        x: The text to analyse.

    Returns:
        Number of stop-word tokens divided by the total number of tokens, or
        ``0.0`` when the text yields no tokens (the original raised
        ``ZeroDivisionError`` in that case).
    """
    tokens = word_tokenize(x)
    if not tokens:
        return 0.0
    num_stop_words = sum(1 for word in tokens if word in stop_words)
    return num_stop_words / len(tokens)

ham_stopwords = data_frame[data_frame['spam'] == 0].text.apply(stop_words_ratio)
spam_stopwords = data_frame[data_frame['spam'] == 1].text.apply(stop_words_ratio)

# Overlay the per-class stop-word-ratio distributions on the same axes.
sns.distplot(ham_stopwords, norm_hist=True, label='Ham')
sns.distplot(spam_stopwords, label='Spam')

print('Ham Mean: {:.3f}'.format(ham_stopwords.values.mean()))
print('Spam Mean: {:.3f}'.format(spam_stopwords.values.mean()))

plt.title('Distribution of Stop-word Ratio')
plt.xlabel('Stop Word Ratio')
plt.legend()

Ham Mean: 0.278

Spam Mean: 0.281

[15]: <matplotlib.legend.Legend at 0x2e15f4d2848>

11
[16]: spam_stopwords

[16]: 0 0.230769
1 0.277778
2 0.397727
3 0.191919
4 0.396226
…
1363 0.342105
1364 0.365854
1365 0.437500
1366 0.446809
1367 0.320024
Name: text, Length: 1368, dtype: float64

[33]: # Let's divide the messages into spam and ham

ham = data_frame[data_frame['spam'] == 0]
spam = data_frame[data_frame['spam'] == 1]
spam['length'].plot(bins=60, kind='hist')
ham['length'].plot(bins=60, kind='hist')

# Copy of the label column under a self-describing name; it becomes the
# x-axis label of the count plot below.
data_frame['Ham(0) and Spam(1)'] = data_frame['spam']

print( 'Spam percentage =', (len(spam) / len(data_frame) )*100,"%")
print( 'Ham percentage =', (len(ham) / len(data_frame) )*100,"%")
sns.countplot(data_frame['Ham(0) and Spam(1)'], label="Count")

Spam percentage = 23.88268156424581 %

Ham percentage = 76.11731843575419 %

[33]: <AxesSubplot:xlabel='Ham(0) and Spam(1)', ylabel='count'>

[18]: word_cloud_obj = generate_word_cloud()

word_cloud_obj.word_cloud(ham["text"], "ham_word_cloud.png")
word_cloud_obj.word_cloud(spam["text"], "spam_word_cloud.png")

13
14
[19]: data_clean_obj = data_cleaning()
# Let's test the newly added function
#data_frame['clean_text'] = data_frame['text'].apply(message_cleaning)
#data_frame['clean_text'] = data_frame['text'].apply(data_clean_obj.
↪message_cleaning)

data_frame['clean_text'] = data_clean_obj.apply_to_column(data_frame['text'])

[20]: data_frame.head()

[20]: text spam length \

0 Subject: naturally irresistible your corporate… 1 1484
1 Subject: the stock trading gunslinger fanny i… 1 598
2 Subject: unbelievable new homes made easy im … 1 448
3 Subject: 4 color printing special request add… 1 500
4 Subject: do not have money , get software cds … 1 235

clean_text
0 Subject naturally irresistible corporate ident…
1 Subject stock trading gunslinger fanny merrill…
2 Subject unbelievable new homes made easy im wa…
3 Subject 4 color printing special request addit…
4 Subject money get software cds software compat…

[21]: data_obj.data_frame.head()

[21]: text spam length \

[22]: data_obj.write_to_csvfile("processed_file.csv")

[23]: #APPLY COUNT VECTORIZER TO OUR MESSAGES LIST

# Define the cleaning pipeline we defined earlier

#vectorizer = CountVectorizer()
cv_object = apply_embeddding_and_model()
spamham_countvectorizer = cv_object.apply_count_vector(data_frame['clean_text'])

15
[24]: #Separating Descriptive and Target Feature
X = spamham_countvectorizer
label = data_frame['spam'].values
y = label

[25]: cv_object.apply_naive_bayes(X,y)

precision recall f1-score support

0 1.00 0.99 0.99 901

1 0.98 0.99 0.98 245

accuracy 0.99 1146

macro avg 0.99 0.99 0.99 1146
weighted avg 0.99 0.99 0.99 1146

test set

Accuracy Score: 0.9921465968586387

F1 Score: 0.9817444219066936
Recall: 0.9877551020408163
Precision: 0.9758064516129032
Confusion matrix, without normalization
[[895 6]
[ 3 242]]
Normalized confusion matrix
[[0.99334073 0.00665927]
[0.0122449 0.9877551 ]]

16
17
No Skill: ROC AUC=0.500
Naive Bayes: ROC AUC=0.998

18
[26]: cv_object.apply_svm(X,y)

precision recall f1-score support

0 0.99 0.99 0.99 901

1 0.98 0.98 0.98 245

accuracy 0.99 1146

macro avg 0.99 0.98 0.99 1146
weighted avg 0.99 0.99 0.99 1146

test set

Accuracy Score: 0.9904013961605584

F1 Score: 0.9775051124744377
Recall: 0.9755102040816327
Precision: 0.9795081967213115
Confusion matrix, without normalization
[[896 5]
[ 6 239]]
Normalized confusion matrix
[[0.99445061 0.00554939]

19
[0.0244898 0.9755102 ]]

20
No Skill: ROC AUC=0.500
SVM: ROC AUC=0.998

21
[ ]:

[ ]:

Python CA 4
No ratings yet
Python CA 4
9 pages
Spam Detection with NLP Techniques
No ratings yet
Spam Detection with NLP Techniques
6 pages
Intelligent Cyber Security Lab Exercises
No ratings yet
Intelligent Cyber Security Lab Exercises
49 pages
Email Spam Detection with SVM
No ratings yet
Email Spam Detection with SVM
4 pages
SMS Spam Filter Implementation Guide
No ratings yet
SMS Spam Filter Implementation Guide
27 pages
Python Foundations for Generative AI
No ratings yet
Python Foundations for Generative AI
67 pages
Naïve Bayes Classifier Implementation
No ratings yet
Naïve Bayes Classifier Implementation
5 pages
MultiOutputClassifier for Text Classification
No ratings yet
MultiOutputClassifier for Text Classification
26 pages
SMS and Email Spam Classifier Analysis
No ratings yet
SMS and Email Spam Classifier Analysis
10 pages
Email Spam Detection with ML Techniques
No ratings yet
Email Spam Detection with ML Techniques
17 pages
AI-Powered SMS Spam Classifier Guide
No ratings yet
AI-Powered SMS Spam Classifier Guide
11 pages
SVM Spam Classifier Experiment Guide
No ratings yet
SVM Spam Classifier Experiment Guide
7 pages
Spam Detection with KNN and TF-IDF
No ratings yet
Spam Detection with KNN and TF-IDF
3 pages
AI Spam Classifier Using ELM & SVM
No ratings yet
AI Spam Classifier Using ELM & SVM
11 pages
NLP Implementation in Python Lab Report
No ratings yet
NLP Implementation in Python Lab Report
9 pages
AI Spam Detection Proposal Summary
No ratings yet
AI Spam Detection Proposal Summary
8 pages
Build Regression, Decision Trees, SVMs
No ratings yet
Build Regression, Decision Trees, SVMs
19 pages
Calculating Accuracy in Data Mining
No ratings yet
Calculating Accuracy in Data Mining
38 pages
Machine Learning Code Breakdown Guide
No ratings yet
Machine Learning Code Breakdown Guide
33 pages
S3 Data Processing and Classification
No ratings yet
S3 Data Processing and Classification
25 pages
Spam Detection with Naive Bayes Model
No ratings yet
Spam Detection with Naive Bayes Model
2 pages
Python Graphing Techniques Overview
No ratings yet
Python Graphing Techniques Overview
48 pages
Naïve Bayes Classifier Implementation
No ratings yet
Naïve Bayes Classifier Implementation
8 pages
Spam Filtering with Naive Bayes
No ratings yet
Spam Filtering with Naive Bayes
2 pages
Naive Bayes Email Spam Classification
No ratings yet
Naive Bayes Email Spam Classification
4 pages
Email Spam Classification Model Guide
No ratings yet
Email Spam Classification Model Guide
7 pages
Machine Learning Lab Manual 2021-22
No ratings yet
Machine Learning Lab Manual 2021-22
23 pages
Naive Bayes on 20 Newsgroups Data
No ratings yet
Naive Bayes on 20 Newsgroups Data
4 pages
PySpark Text Classification Pipeline
No ratings yet
PySpark Text Classification Pipeline
11 pages
Text Classification and Clustering Techniques
No ratings yet
Text Classification and Clustering Techniques
24 pages
SMS Spam Classification with ML
No ratings yet
SMS Spam Classification with ML
42 pages
Neural Network for Spam Detection
No ratings yet
Neural Network for Spam Detection
5 pages
Spam Detection with Python and NLP
No ratings yet
Spam Detection with Python and NLP
3 pages
Spam Email Classification with Naive Bayes
No ratings yet
Spam Email Classification with Naive Bayes
7 pages
Naïve Bayes and Random Forest Implementation
No ratings yet
Naïve Bayes and Random Forest Implementation
32 pages
Text Vectorization and Classification Lab
No ratings yet
Text Vectorization and Classification Lab
12 pages
Text Classification in Python Guide
No ratings yet
Text Classification in Python Guide
34 pages
AI Project: Toxic Comment Classification
No ratings yet
AI Project: Toxic Comment Classification
11 pages
Developing a Spam Filter with ML
No ratings yet
Developing a Spam Filter with ML
5 pages
Supervised Learning with Scikit-Learn
No ratings yet
Supervised Learning with Scikit-Learn
67 pages
Blue Doodle Project Overview
No ratings yet
Blue Doodle Project Overview
15 pages
Sentiment Analysis with Naive Bayes
No ratings yet
Sentiment Analysis with Naive Bayes
47 pages
NLP Sentiment Analysis with Python
No ratings yet
NLP Sentiment Analysis with Python
5 pages
NLP Tushar
No ratings yet
NLP Tushar
21 pages
Email Spam Filtering with ML
No ratings yet
Email Spam Filtering with ML
5 pages
Machine Learning: Feature Engineering & Models
No ratings yet
Machine Learning: Feature Engineering & Models
84 pages
Django Cyber Threat Prediction App
No ratings yet
Django Cyber Threat Prediction App
9 pages
SMS Spam Detection with Machine Learning
No ratings yet
SMS Spam Detection with Machine Learning
23 pages
TensorFlow Spam Detection in Python
No ratings yet
TensorFlow Spam Detection in Python
13 pages
SVM Analysis of Amazon Reviews Data
No ratings yet
SVM Analysis of Amazon Reviews Data
8 pages
AI Email Spam Filter Implementation
No ratings yet
AI Email Spam Filter Implementation
8 pages
Naive Bayes & SVM Classifiers in Python
No ratings yet
Naive Bayes & SVM Classifiers in Python
4 pages
Email Spam Classifier with Random Forest
No ratings yet
Email Spam Classifier with Random Forest
4 pages
Sample
No ratings yet
Sample
6 pages
Logistic Regression on Amazon Reviews
No ratings yet
Logistic Regression on Amazon Reviews
11 pages
Machine Learning Program Examples
No ratings yet
Machine Learning Program Examples
64 pages
Sentiment Analysis with Python Code
No ratings yet
Sentiment Analysis with Python Code
7 pages
Top News Categories Analysis and Model
No ratings yet
Top News Categories Analysis and Model
4 pages
Movie Review Sentiment Analysis NLP
No ratings yet
Movie Review Sentiment Analysis NLP
10 pages
How To Fill in The Logbook: © Architects Accreditation Council of Australia 2021
No ratings yet
How To Fill in The Logbook: © Architects Accreditation Council of Australia 2021
8 pages
1.3 Instrument Terminology and Performance : B. G. Lipták
No ratings yet
1.3 Instrument Terminology and Performance : B. G. Lipták
32 pages
MBA Distance Programs at Shivaji University
No ratings yet
MBA Distance Programs at Shivaji University
32 pages
Understanding the Sharing Economy
No ratings yet
Understanding the Sharing Economy
3 pages
MAPEH 6 First Quarter Test Guide
No ratings yet
MAPEH 6 First Quarter Test Guide
7 pages
Digitrans R.F. Probe Calibration Guide
No ratings yet
Digitrans R.F. Probe Calibration Guide
16 pages
NV4000 Range User Manual-V1
No ratings yet
NV4000 Range User Manual-V1
76 pages
Arp 194: Merging Galaxy Dynamics
No ratings yet
Arp 194: Merging Galaxy Dynamics
3 pages
Technology and Innovation Strategy Syllabus
No ratings yet
Technology and Innovation Strategy Syllabus
19 pages
Dragon Puppet Design Inspirations
No ratings yet
Dragon Puppet Design Inspirations
1 page
Step-by-Step SAP Router Setup on Windows
No ratings yet
Step-by-Step SAP Router Setup on Windows
13 pages
Manual de Uso Controlador M168 Friotec
No ratings yet
Manual de Uso Controlador M168 Friotec
106 pages
Www.techsofttamil.com
No ratings yet
Www.techsofttamil.com
5 pages
MADM System Configuration Overview
100% (1)
MADM System Configuration Overview
44 pages
C Functions: Types and Examples
No ratings yet
C Functions: Types and Examples
11 pages
Overview of Alpha Testing in Software
No ratings yet
Overview of Alpha Testing in Software
8 pages
Python File Handling Modes Explained
No ratings yet
Python File Handling Modes Explained
3 pages
Etap 16 Install Guide
100% (1)
Etap 16 Install Guide
4 pages
Digital Technology in 21st Century Higher Education
No ratings yet
Digital Technology in 21st Century Higher Education
7 pages
Aditya Infotech Q3FY26 concall
No ratings yet
Aditya Infotech Q3FY26 concall
16 pages
Introduction to Fiber Optic Communication
No ratings yet
Introduction to Fiber Optic Communication
28 pages
Histogram Guide: Definition & Excel Steps
No ratings yet
Histogram Guide: Definition & Excel Steps
19 pages
PT9 C-Proof Beacon Label Update
No ratings yet
PT9 C-Proof Beacon Label Update
1 page
SCR Power Control Techniques
No ratings yet
SCR Power Control Techniques
40 pages
Key Risk Indicators for IT Security
No ratings yet
Key Risk Indicators for IT Security
48 pages
Filipino Entrepreneurs in ICT & OSH Standards
No ratings yet
Filipino Entrepreneurs in ICT & OSH Standards
28 pages
Harith Danish's Educational Profile
No ratings yet
Harith Danish's Educational Profile
2 pages
Computer System Organization Overview
No ratings yet
Computer System Organization Overview
34 pages
Geyser Beading Pattern Guide
100% (2)
Geyser Beading Pattern Guide
11 pages
Infrared Photography Conversion Guide
No ratings yet
Infrared Photography Conversion Guide
112 pages

Email Spam Classification System

Uploaded by

Email Spam Classification System

Uploaded by

email-spam-classifier

[1]: import pandas as pd

[2]: #Parent Class for Data

[3]: #Child Class for Data_read_write

[4]: #Child Class for Data_read_write

final_join = ' '.join(Test_punc_removed_join_clean)

def apply_to_column(self, data_column_text):

[5]: #Child Class for Data_read_write

def apply_naive_bayes(self, X, y):

print("\nAccuracy Score: " + str(metrics.accuracy_score(y_test,␣

class_names = ['ham', 'spam']

# generate a no skill prediction (majority class)

print("\nAccuracy Score: " + str(metrics.accuracy_score(y_test,␣

class_names = ['ham', 'spam']

# generate a no skill prediction (majority class)

[6]: data_obj = data_read_write("[Link]")

[7]: data_frame = data_obj.read_csv_file("[Link]")

[8]: text spam

[9]: #Visualize dataset

[10]: # Let's get the length of the messages

[11]: #data_frame['length'].plot(bins=100, kind='hist')

ham_messages_length['length'].plot(bins=100, kind='hist',label = 'Ham')

#[Link](spam_messages_length['length'], bins=10, norm_hist = True, label␣

[Link]('Distribution of Length of Email Text')

#ax = [Link](ham_words_length, norm_hist = True, bins = 30, label = 'Ham')

[11]: <[Link] at 0x2e158719f88>

ham_words_length = [len(word_tokenize(title)) for title in␣

spam_words_length = [len(word_tokenize(title)) for title in␣

[13]: #There is spike in spam emails with less number of words

#ham_words_length.plot(bins=100, kind='hist',label = 'Ham')

[14]: def mean_word_length(x):

[Link](ham_meanword_length, norm_hist = True, bins = 30, label = 'Ham')

[15]: #Checking ratio of stop words

from [Link] import stopwords

[Link](ham_stopwords, norm_hist = True, label = 'Ham')

print('Ham Mean: {:.3f}'.format(ham_stopwords.[Link]()))

Ham Mean: 0.278

[15]: <[Link] at 0x2e15f4d2848>

[33]: # Let's divide the messages into spam and ham

Spam percentage = 23.88268156424581 %

[33]: <AxesSubplot:xlabel='Ham(0) and Spam(1)', ylabel='count'>

[18]: word_cloud_obj = generate_word_cloud()

[20]: text spam length \

[21]: text spam length \

[23]: #APPLY COUNT VECTORIZER TO OUR MESSAGES LIST

# Define the cleaning pipeline we defined earlier

precision recall f1-score support

0 1.00 0.99 0.99 901

accuracy 0.99 1146

Accuracy Score: 0.9921465968586387

precision recall f1-score support

0 0.99 0.99 0.99 901

accuracy 0.99 1146

Accuracy Score: 0.9904013961605584

You might also like