Python CA 4

Name: Subham Patra

REG NO: 12215814

This project builds an SMS spam detector: it loads and cleans an SMS text dataset, applies natural language processing steps such as tokenization, stop-word removal and stemming, then trains and compares several classifiers including Naive Bayes, Logistic Regression, Random Forest and XGBoost. The best-performing models are combined with voting and stacking classifiers to further improve the accuracy and precision of spam detection, and the fitted vectorizer and model are saved with pickle for future use.

# SMS SPAM DETECTION

import numpy as np

import pandas as pd

import warnings

warnings.filterwarnings('ignore')

# dataset filename assumed
df = pd.read_csv('spam.csv',encoding='latin1')

df.head(5)

df.shape

## Data Cleaning

df.info()

# drop last 3 columns

df.drop(columns=['Unnamed: 2','Unnamed: 3','Unnamed: 4'],inplace=True)

df

# rename columns

df.rename(columns={'v1':'target','v2':'text'},inplace=True)

df.head()

# change target into binary

from sklearn.preprocessing import LabelEncoder as LE

encoder=LE()

df['target']=encoder.fit_transform(df['target'])

df.head()

# null values

df.isnull().sum()

# check duplicates

df.duplicated().sum()

# drop duplicates

df=df.drop_duplicates(keep='first')

df.duplicated().sum()

df.shape

# EDA

df['target'].value_counts()

import matplotlib.pyplot as plt

plt.pie(df['target'].value_counts(),labels=['ham','spam'],autopct="%0.2f")

plt.show()

# make new columns (number of chars, words and sentences) for better analysis

import nltk

!pip install nltk

nltk.download('punkt')

df['num_chars']=df['text'].apply(len)

df.head(3)

# number of words and sentences

df['num_words']=df['text'].apply(lambda x:len(nltk.word_tokenize(x)))

df['num_sentences']=df['text'].apply(lambda x:len(nltk.sent_tokenize(x)))

df.head()

df[['num_chars','num_words','num_sentences']].describe()

#hams

df[df['target']==0][['num_chars','num_words','num_sentences']].describe()

#spams

df[df['target']==1][['num_chars','num_words','num_sentences']].describe()

import seaborn as sns

sns.histplot(df[df['target']==0]['num_chars'])

sns.histplot(df[df['target']==1]['num_chars'],color='red')

sns.histplot(df[df['target']==0]['num_words'])

sns.histplot(df[df['target']==1]['num_words'],color='red')

sns.pairplot(df,hue='target')

sns.heatmap(df.corr(),annot=True)

# Data Preprocessing

### Lower case

### Tokenization

### Removing special characters

### Removing stop words and punctuation

### Stemming

import nltk

from nltk.corpus import stopwords

nltk.download('stopwords')

stopwords=stopwords.words('english')

import string

puncs=string.punctuation

from nltk.stem.porter import PorterStemmer

ps=PorterStemmer()

def transform_text(text):
    # lower case and tokenize
    text = text.lower()
    text = nltk.word_tokenize(text)

    # keep only alphanumeric tokens (removes special characters)
    y = []
    for i in text:
        if i.isalnum():
            y.append(i)

    # remove stop words and punctuation
    text = y[:]
    y.clear()
    for i in text:
        if i not in stopwords + list(puncs):
            y.append(i)

    # stemming
    text = y[:]
    y.clear()
    for i in text:
        y.append(ps.stem(i))

    return " ".join(y)

df['transformed_text']=df['text'].apply(transform_text)

df.head(5)
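
As a quick sanity check, transform_text can be applied to a single message; the sample string below is illustrative only and the exact stemmed output depends on the NLTK version and stop-word list.

# expected: a lower-cased, stop-word-free, stemmed string
transform_text("Did you WIN a FREE prize?? Call now to claim it!")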

# !pip install wordcloud

from wordcloud import WordCloud

wc=WordCloud(width=500,height=500,min_font_size=10,background_color='white')

spam_wc=wc.generate(df[df['target']==1]['transformed_text'].str.cat(sep=" "))

# plt.figure(figsize=(15,6))

plt.imshow(spam_wc)

ham_wc=wc.generate(df[df['target']==0]['transformed_text'].str.cat(sep=" "))

# plt.figure(figsize=(15,6))

plt.imshow(ham_wc)

spam_words=[]

for msg in df[df['target']==1]['transformed_text'].tolist():
    for word in msg.split():
        spam_words.append(word)

len(spam_words)

from collections import Counter

sns.barplot(x=pd.DataFrame(Counter(spam_words).most_common(30))[0],y=pd.DataFrame(Counter(spam_words).most_common(30))[1])

plt.xticks(rotation='vertical')

plt.show()
ham_words=[]

for msg in df[df['target']==0]['transformed_text'].tolist():
    for word in msg.split():
        ham_words.append(word)

len(ham_words)

from collections import Counter

sns.barplot(x=pd.DataFrame(Counter(ham_words).most_common(30))[0],y=pd.DataFrame(Counter(ham_words).most_common(30))[1])

plt.xticks(rotation='vertical')

plt.show()

# MODEL BUILDING -> starting with Naive Bayes

from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer

cv = CountVectorizer()

tfidf = TfidfVectorizer(max_features=3000)

X = tfidf.fit_transform(df['transformed_text']).toarray()

X.shape

y = df['target'].values

from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=2)

from sklearn.naive_bayes import GaussianNB,MultinomialNB,BernoulliNB

from sklearn.metrics import accuracy_score,confusion_matrix,precision_score

gnb = GaussianNB()

mnb = MultinomialNB()

bnb = BernoulliNB()

gnb.fit(X_train,y_train)

y_pred1 = gnb.predict(X_test)

print(accuracy_score(y_test,y_pred1))

print(confusion_matrix(y_test,y_pred1))

print(precision_score(y_test,y_pred1))

mnb.fit(X_train,y_train)

y_pred2 = mnb.predict(X_test)

print(accuracy_score(y_test,y_pred2))

print(confusion_matrix(y_test,y_pred2))

print(precision_score(y_test,y_pred2))

bnb.fit(X_train,y_train)

y_pred3 = bnb.predict(X_test)

print(accuracy_score(y_test,y_pred3))

print(confusion_matrix(y_test,y_pred3))

print(precision_score(y_test,y_pred3))

from sklearn.linear_model import LogisticRegression

from sklearn.svm import SVC

from sklearn.naive_bayes import MultinomialNB

from sklearn.tree import DecisionTreeClassifier

from sklearn.neighbors import KNeighborsClassifier

from sklearn.ensemble import RandomForestClassifier

from sklearn.ensemble import AdaBoostClassifier

from sklearn.ensemble import BaggingClassifier

from sklearn.ensemble import ExtraTreesClassifier

from sklearn.ensemble import GradientBoostingClassifier

!pip install xgboost

from xgboost import XGBClassifier

svc = SVC(kernel='sigmoid', gamma=1.0)

knc = KNeighborsClassifier()

mnb = MultinomialNB()

dtc = DecisionTreeClassifier(max_depth=5)
lrc = LogisticRegression(solver='liblinear', penalty='l1')

rfc = RandomForestClassifier(n_estimators=50, random_state=2)

abc = AdaBoostClassifier(n_estimators=50, random_state=2)

bc = BaggingClassifier(n_estimators=50, random_state=2)

etc = ExtraTreesClassifier(n_estimators=50, random_state=2)

gbdt = GradientBoostingClassifier(n_estimators=50,random_state=2)

xgb = XGBClassifier(n_estimators=50,random_state=2)

def train_classifier(clf,X_train,y_train,X_test,y_test):
    # fit the given classifier and return its test-set accuracy and precision
    clf.fit(X_train,y_train)
    y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test,y_pred)
    precision = precision_score(y_test,y_pred)
    return accuracy,precision

train_classifier(svc,X_train,y_train,X_test,y_test)

clfs = {
    'SVC': svc,
    'KN': knc,
    'NB': mnb,
    'DT': dtc,
    'LR': lrc,
    'RF': rfc,
    'AdaBoost': abc,
    'BgC': bc,
    'ETC': etc,
    'GBDT': gbdt,
    'xgb': xgb
}

# accuracy_scores = []

# precision_scores = []
# for name,clf in clfs.items():

#     current_accuracy,current_precision = train_classifier(clf,X_train,y_train,X_test,y_test)

#     print("For ",name)

#     print("Accuracy - ",current_accuracy)

#     print("Precision - ",current_precision)

#     accuracy_scores.append(current_accuracy)

#     precision_scores.append(current_precision)

# performance_df = pd.DataFrame({'Algorithm':clfs.keys(),'Accuracy':accuracy_scores,'Precision':precision_scores}).sort_values('Precision',ascending=False)

# performance_df

# performance_df1 = pd.melt(performance_df, id_vars = "Algorithm")

# performance_df1

# sns.catplot(x = 'Algorithm', y='value', hue = 'variable',data=performance_df1, kind='bar',height=5)

# plt.ylim(0.5,1.0)

# plt.xticks(rotation='vertical')

# plt.show()
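
The comparison loop above is left commented out, but performance_df and the score lists are reused in the next step. A runnable version of the same comparison, as a minimal sketch built on the clfs dict and train_classifier defined earlier:

accuracy_scores = []
precision_scores = []

for name, clf in clfs.items():
    # train each candidate model and record test-set accuracy and precision
    current_accuracy, current_precision = train_classifier(clf, X_train, y_train, X_test, y_test)
    print("For", name, "- Accuracy:", current_accuracy, "Precision:", current_precision)
    accuracy_scores.append(current_accuracy)
    precision_scores.append(current_precision)

performance_df = pd.DataFrame({'Algorithm': list(clfs.keys()),
                               'Accuracy': accuracy_scores,
                               'Precision': precision_scores}).sort_values('Precision', ascending=False)

performance_df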

# Model improvement

# 1. Change the max_features parameter of TfIdf

temp_df = pd.DataFrame({'Algorithm':clfs.keys(),'Accuracy_max_ft_3000':accuracy_scores,'Precision_max_ft_3000':precision_scores}).sort_values('Precision_max_ft_3000',ascending=False)

temp_df = pd.DataFrame({'Algorithm':clfs.keys(),'Accuracy_scaling':accuracy_scores,'Precision_scaling':precision_scores}).sort_values('Precision_scaling',ascending=False)

new_df = performance_df.merge(temp_df,on='Algorithm')

new_df_scaled = new_df.merge(temp_df,on='Algorithm')

temp_df = pd.DataFrame({'Algorithm':clfs.keys(),'Accuracy_num_chars':accuracy_scores,'Precision_num_chars':precision_scores}).sort_values('Precision_num_chars',ascending=False)

# new_df_scaled.merge(temp_df,on='Algorithm')
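
The Accuracy_scaling and Accuracy_num_chars columns above imply two further experiments that are not shown here: rescaling the TF-IDF features and appending the num_chars column before re-running the comparison loop. A minimal sketch of what that could look like (MinMaxScaler and the hstack step are assumptions inferred from those column names):

from sklearn.preprocessing import MinMaxScaler

# variant: scale the TF-IDF features into [0, 1]
scaler = MinMaxScaler()
X = scaler.fit_transform(tfidf.fit_transform(df['transformed_text']).toarray())

# variant: additionally append num_chars as an extra feature
X = np.hstack((X, df['num_chars'].values.reshape(-1, 1)))

# re-split and re-run the loop to refill accuracy_scores / precision_scores
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2)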

# Voting Classifier

svc = SVC(kernel='sigmoid', gamma=1.0,probability=True)

mnb = MultinomialNB()

etc = ExtraTreesClassifier(n_estimators=50, random_state=2)

from sklearn.ensemble import VotingClassifier

voting = VotingClassifier(estimators=[('svm', svc), ('nb', mnb), ('et', etc)],voting='soft')

voting.fit(X_train,y_train)

y_pred = voting.predict(X_test)

print("Accuracy",accuracy_score(y_test,y_pred))

print("Precision",precision_score(y_test,y_pred))

# Applying stacking

estimators=[('svm', svc), ('nb', mnb), ('et', etc)]

final_estimator=RandomForestClassifier()

# from sklearn.ensemble import StackingClassifier

# clf = StackingClassifier(estimators=estimators, final_estimator=final_estimator)

# clf.fit(X_train,y_train)

# y_pred = clf.predict(X_test)

# print("Accuracy",accuracy_score(y_test,y_pred))

# print("Precision",precision_score(y_test,y_pred))

import pickle

# output filenames assumed
pickle.dump(tfidf,open('vectorizer.pkl','wb'))

pickle.dump(mnb,open('model.pkl','wb'))
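
To reuse the saved artifacts later, they can be loaded back and applied to a new message (a minimal sketch; the filenames match the assumption above, and transform_text must be available in the loading script):

import pickle

tfidf = pickle.load(open('vectorizer.pkl', 'rb'))
model = pickle.load(open('model.pkl', 'rb'))

# 1 = spam, 0 = ham (as produced by the LabelEncoder earlier)
msg = "Congratulations! You have won a free prize, call now"
vector = tfidf.transform([transform_text(msg)]).toarray()
print(model.predict(vector)[0])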
