# Python CA 4
# Name : Subham Patra
# REG NO : 12215814
# SMS SPAM DETECTION
import numpy as np
import pandas as pd
import warnings

# Silence pandas/sklearn deprecation chatter in the notebook output.
warnings.filterwarnings('ignore')

# The Kaggle SMS spam dataset is not valid UTF-8, hence latin1.
# NOTE(review): original filename was garbled in the export; 'spam.csv' is the
# conventional name for this dataset -- confirm against the project files.
df = pd.read_csv('spam.csv', encoding='latin1')
df.head(5)
df.shape
## Data Cleaning
df.info()

# Drop the last 3 columns: they are mostly-empty junk produced by the CSV export.
df.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], inplace=True)
df

# Rename columns to meaningful names.
df.rename(columns={'v1': 'target', 'v2': 'text'}, inplace=True)
df.sample(5)

# Encode the target as binary: ham -> 0, spam -> 1 (alphabetical label order).
from sklearn.preprocessing import LabelEncoder as LE
encoder = LE()
df['target'] = encoder.fit_transform(df['target'])
df.head()

# Null values
df.isnull().sum()

# Check duplicates
df.duplicated().sum()

# Drop duplicates, keeping the first occurrence of each message.
df = df.drop_duplicates(keep='first')
df.duplicated().sum()
df.shape
# EDA
df['target'].value_counts()
import matplotlib.pyplot as plt

# Class balance: the dataset is heavily skewed towards ham.
plt.pie(df['target'].value_counts(), labels=['ham', 'spam'], autopct="%0.2f")
plt.show()

# Making new columns (no. of chars, words and sentences) for better analyzing.
import nltk
# !pip install nltk   -- notebook shell magic; run once in the environment, not valid Python.
nltk.download('punkt')
df['num_chars'] = df['text'].apply(len)
df.head(3)

# Num of words / sentences per message via NLTK tokenizers.
df['num_words'] = df['text'].apply(lambda x: len(nltk.word_tokenize(x)))
df['num_sentences'] = df['text'].apply(lambda x: len(nltk.sent_tokenize(x)))
df.head()
df[['num_chars', 'num_words', 'num_sentences']].describe()

# Hams
df[df['target'] == 0][['num_chars', 'num_words', 'num_sentences']].describe()
# Spams
df[df['target'] == 1][['num_chars', 'num_words', 'num_sentences']].describe()

import seaborn as sns
# Overlayed distributions: spam messages tend to be longer (red = spam).
sns.histplot(df[df['target'] == 0]['num_chars'])
sns.histplot(df[df['target'] == 1]['num_chars'], color='red')
sns.histplot(df[df['target'] == 0]['num_words'])
sns.histplot(df[df['target'] == 1]['num_words'], color='red')
sns.pairplot(df, hue='target')
# numeric_only=True: df still contains the raw 'text' column, which .corr()
# would choke on in modern pandas.
sns.heatmap(df.corr(numeric_only=True), annot=True)
# Data Preprocessing
### Lower case
### Tokenization
### Removing special characters
### Removing stop words and punctuation
### Stemming
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
# NOTE(review): this rebinds the name 'stopwords' from the nltk module to a
# plain list; transform_text below depends on this exact name, so it is kept.
stopwords = stopwords.words('english')
import string
puncs = string.punctuation
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()
def transform_text(text):
    """Normalize a raw SMS message for vectorization.

    Pipeline: lower-case -> NLTK word tokenization -> keep alphanumeric
    tokens only -> drop English stopwords and punctuation -> Porter-stem
    each remaining token. Returns the processed tokens joined by spaces.

    Relies on module-level names: stopwords (list of English stopwords),
    puncs (string.punctuation) and ps (PorterStemmer instance).
    """
    text = text.lower()
    tokens = nltk.word_tokenize(text)

    # Keep only purely alphanumeric tokens (drops punctuation-bearing tokens).
    tokens = [tok for tok in tokens if tok.isalnum()]

    # Build the rejection set once instead of scanning a list per token
    # (same membership result, just O(1) lookups).
    rejected = set(stopwords) | set(puncs)
    tokens = [tok for tok in tokens if tok not in rejected]

    # Porter stemming collapses inflected forms ("winning" -> "win").
    return " ".join(ps.stem(tok) for tok in tokens)
df['transformed_text'] = df['text'].apply(transform_text)
df.head(5)

# !pip install wordcloud
from wordcloud import WordCloud
wc = WordCloud(width=500, height=500, min_font_size=10, background_color='white')

# Word cloud over all spam messages concatenated into one string.
spam_wc = wc.generate(df[df['target'] == 1]['transformed_text'].str.cat(sep=" "))
# plt.figure(figsize=(15,6))
plt.imshow(spam_wc)

# Word cloud over all ham messages.
ham_wc = wc.generate(df[df['target'] == 0]['transformed_text'].str.cat(sep=" "))
# plt.figure(figsize=(15,6))
plt.imshow(ham_wc)

# Flatten all spam messages into a single token list.
spam_words = []
for msg in df[df['target'] == 1]['transformed_text'].tolist():
    for word in msg.split():
        spam_words.append(word)
len(spam_words)

from collections import Counter
# Bar chart of the 30 most common spam tokens; build the frame once and reuse it.
spam_common = pd.DataFrame(Counter(spam_words).most_common(30))
plt.bar(spam_common[0], spam_common[1])
plt.xticks(rotation='vertical')
plt.show()

# Same flattening for ham messages.
ham_words = []
for msg in df[df['target'] == 0]['transformed_text'].tolist():
    for word in msg.split():
        ham_words.append(word)
len(ham_words)

from collections import Counter
ham_common = pd.DataFrame(Counter(ham_words).most_common(30))
plt.bar(ham_common[0], ham_common[1])
plt.xticks(rotation='vertical')
plt.show()
# MODEL BUILDING -> naive bayes start
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
cv = CountVectorizer()
# max_features=3000 keeps only the most frequent terms; this same vectorizer
# is pickled at the end of the script for serving.
tfidf = TfidfVectorizer(max_features=3000)
X = tfidf.fit_transform(df['transformed_text']).toarray()
X.shape
y = df['target'].values

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2)

from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score
gnb = GaussianNB()
mnb = MultinomialNB()
bnb = BernoulliNB()

# Compare the three NB variants on accuracy / confusion matrix / precision.
# Precision matters most here: a spam filter should not flag ham as spam.
gnb.fit(X_train, y_train)
y_pred1 = gnb.predict(X_test)
print(accuracy_score(y_test, y_pred1))
print(confusion_matrix(y_test, y_pred1))
print(precision_score(y_test, y_pred1))

mnb.fit(X_train, y_train)
y_pred2 = mnb.predict(X_test)
print(accuracy_score(y_test, y_pred2))
print(confusion_matrix(y_test, y_pred2))
print(precision_score(y_test, y_pred2))

bnb.fit(X_train, y_train)
y_pred3 = bnb.predict(X_test)
print(accuracy_score(y_test, y_pred3))
print(confusion_matrix(y_test, y_pred3))
print(precision_score(y_test, y_pred3))
# Model zoo: one instance of each classifier family to benchmark.
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
# !pip install xgboost   -- notebook shell magic; install once outside this script.
from xgboost import XGBClassifier

svc = SVC(kernel='sigmoid', gamma=1.0)
knc = KNeighborsClassifier()
mnb = MultinomialNB()
dtc = DecisionTreeClassifier(max_depth=5)
lrc = LogisticRegression(solver='liblinear', penalty='l1')
rfc = RandomForestClassifier(n_estimators=50, random_state=2)
abc = AdaBoostClassifier(n_estimators=50, random_state=2)
bc = BaggingClassifier(n_estimators=50, random_state=2)
etc = ExtraTreesClassifier(n_estimators=50, random_state=2)
gbdt = GradientBoostingClassifier(n_estimators=50, random_state=2)
xgb = XGBClassifier(n_estimators=50, random_state=2)
def train_classifier(clf, X_train, y_train, X_test, y_test):
    """Fit clf on the train split and score it on the test split.

    Returns a (accuracy, precision) tuple. Uses accuracy_score and
    precision_score imported at module level from sklearn.metrics.
    """
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    return accuracy, precision
train_classifier(svc, X_train, y_train, X_test, y_test)

# FIX: the exported dict literal was never closed, and the evaluation loop that
# defines accuracy_scores / precision_scores / performance_df was commented out
# even though performance_df is used further below -- restored here.
clfs = {
    'SVC': svc,
    'KN': knc,
    'NB': mnb,
    'DT': dtc,
    'LR': lrc,
    'RF': rfc,
    'AdaBoost': abc,
    'BgC': bc,
    'ETC': etc,
    'GBDT': gbdt,
    'xgb': xgb,
}

accuracy_scores = []
precision_scores = []
for name, clf in clfs.items():
    current_accuracy, current_precision = train_classifier(clf, X_train, y_train, X_test, y_test)
    print("For ", name)
    print("Accuracy - ", current_accuracy)
    print("Precision - ", current_precision)
    accuracy_scores.append(current_accuracy)
    precision_scores.append(current_precision)

# Rank all models by precision (the metric that matters for a spam filter).
performance_df = pd.DataFrame(
    {'Algorithm': list(clfs.keys()),
     'Accuracy': accuracy_scores,
     'Precision': precision_scores}
).sort_values('Precision', ascending=False)
performance_df

# Long format for the grouped bar chart below.
performance_df1 = pd.melt(performance_df, id_vars="Algorithm")
performance_df1
sns.catplot(x='Algorithm', y='value', hue='variable', data=performance_df1, kind='bar', height=5)
plt.ylim(0.5, 1.0)
plt.xticks(rotation='vertical')
plt.show()
# Model improve
# 1. Change the max_features parameter of TfIdf.
# Each temp_df snapshots the current scores under a column name describing the
# experiment (re-run the benchmark loop between snapshots in the notebook).
temp_df = pd.DataFrame(
    {'Algorithm': list(clfs.keys()),
     'Accuracy_max_ft_3000': accuracy_scores,
     'Precision_max_ft_3000': precision_scores}
).sort_values('Precision_max_ft_3000', ascending=False)

temp_df = pd.DataFrame(
    {'Algorithm': list(clfs.keys()),
     'Accuracy_scaling': accuracy_scores,
     'Precision_scaling': precision_scores}
).sort_values('Precision_scaling', ascending=False)

new_df = performance_df.merge(temp_df, on='Algorithm')
new_df_scaled = new_df.merge(temp_df, on='Algorithm')

temp_df = pd.DataFrame(
    {'Algorithm': list(clfs.keys()),
     'Accuracy_num_chars': accuracy_scores,
     'Precision_num_chars': precision_scores}
).sort_values('Precision_num_chars', ascending=False)
# new_df_scaled.merge(temp_df, on='Algorithm')
# Voting Classifier
# probability=True is required so SVC can contribute to soft voting.
svc = SVC(kernel='sigmoid', gamma=1.0, probability=True)
mnb = MultinomialNB()
etc = ExtraTreesClassifier(n_estimators=50, random_state=2)

from sklearn.ensemble import VotingClassifier
voting = VotingClassifier(estimators=[('svm', svc), ('nb', mnb), ('et', etc)], voting='soft')
voting.fit(X_train, y_train)
y_pred = voting.predict(X_test)
print("Accuracy", accuracy_score(y_test, y_pred))
print("Precision", precision_score(y_test, y_pred))

# Applying stacking
estimators = [('svm', svc), ('nb', mnb), ('et', etc)]
final_estimator = RandomForestClassifier()
# from sklearn.ensemble import StackingClassifier
# clf = StackingClassifier(estimators=estimators, final_estimator=final_estimator)
# clf.fit(X_train, y_train)
# y_pred = clf.predict(X_test)
# print("Accuracy", accuracy_score(y_test, y_pred))
# print("Precision", precision_score(y_test, y_pred))

# Persist the fitted vectorizer and the MultinomialNB model for serving.
# 'with' blocks close the file handles (the original leaked bare open() calls).
# NOTE(review): output filenames were garbled in the export; these are the
# conventional names for this project -- confirm against the serving app.
import pickle
with open('vectorizer.pkl', 'wb') as f:
    pickle.dump(tfidf, f)
with open('model.pkl', 'wb') as f:
    pickle.dump(mnb, f)