In [3]:
# Mount Google Drive so the dataset CSV is readable under /content/drive.
# (Colab prompts for authorization on first run.)
from google.colab import drive
drive.mount('/content/drive')
Mounted at /content/drive
In [4]:
import pandas as pd
In [5]:
# Load the preprocessed newsgroup dataset (5 text columns + 'class' target) from Drive.
# NOTE(review): the real CSV filename was lost in export ("[Link]") — restore it before running.
data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/[Link]')
In [6]:
# Features: everything except the target column and the CSV's leftover
# unnamed index column.
x_data = data.drop(['class', 'Unnamed: 0'], axis=1)
# Target: the newsgroup class label.
y_data = data['class']
In [7]:
# Splitting the data into train and test (75/25). stratify=y_data keeps the
# class proportions identical in both splits.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    x_data, y_data, random_state=42, stratify=y_data, test_size=0.25)
X_train
Out[7]:
text preprocessed_text preprocessed_subject preprocessed_email preprocessed_data
From: silver ucs indiana edu
article writes ATF BURNS RANCH article writes
16428 wwarf@[Link] psuvm psu edu
article says from ... ETC ETC article says from ...
(Wayne J. W... psuvm psu...
erau edu philabs
From: drozinst@[Link] writes article writes article
10459 Ulf and all philips alpha erau edu
(Drozinski Tim)\nSu... writes fan an... writes fan an...
erau e...
Jokes and urartu sdpa org
From: dbd@[Link] article wrote article wrote
16656 International freenet carleton
(David Davidian)\nSu... bugunlerde je... bugunlerde je...
Relations Freenet carle...
From: Formal Rebuttal to hairball ecst csuchico
article amazing article amazing
17484 zippy@[Link] the Presumption of edu cae prds cdx mot
illustration dis... illustration dis...
(The Pi... Jurisdi... ca...
article Do we need the news intercon article
From: cmsph02@[Link] (Steven
11142 amanda_walker clipper for cheap intercon aol amanda_walker
Holton)\nSubject...
wrote one thin... security compuserve wrote one thin...
... ... ... ... ... ...
have read just have read just
From: nabil@[Link] (Nabil
16850 today two articles Dear Mr Ajami ariel yorku today two articles
Gangi)\nSubj...
dripping ... dripping ...
From: article writes troy bellcore news article writes
Welcome to Police
17873 bobh@[Link] critisism too easy cso uiuc edu uxa cso critisism too easy
State USA
(hettmansperge... wha... uiuc edu wha...
wrote when wrote when
From: revdak@[Link] (D. netcom central sun
15495 greeted said Marys assumption greeted said
Andrew Kille)\nSub... netcom
something t... something t...
From: article internet laosinh stgt sub org article internet
6551A and 6551
12102 gerrit@[Link] surfer writes does world std laosinh stgt surfer writes does
compatibility
(Gerrit Heit... any ... su... any ...
From: Clinton-
Presidents Press
18152 HQ@[Link] (The White white_house... Campaign92 Org white_house...
Conference 42393
Hou...
Hou...
text preprocessed_text preprocessed_subject preprocessed_email preprocessed_data
14121 rows × 5 columns
In [8]:
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder
import numpy as np

# It is used to transform non-numerical labels to numerical labels.
label_enc_train = LabelEncoder()
vec_train = label_enc_train.fit_transform(y_train)
# Convert the vector of class ids to a binary class matrix with 20 columns.
y_train_label = tf.keras.utils.to_categorical(vec_train, 20)
# BUG FIX: reuse the encoder fitted on the training labels instead of fitting
# a second one — an independent fit could map labels to different integer ids
# if the test split ever misses a class, silently corrupting y_test_label.
label_enc_test = label_enc_train
vec_test = label_enc_test.transform(y_test)
y_test_label = tf.keras.utils.to_categorical(vec_test, 20)
In [9]:
# Character-level tokenizer used below (char_level=True).
from tensorflow.keras.preprocessing.text import Tokenizer
In [10]:
# Pads/truncates integer sequences to a fixed length.
from tensorflow.keras.preprocessing.sequence import pad_sequences
In [11]:
!wget [Link]
--2023-03-27 [Link]-- [Link]
Resolving [Link] ([Link])... [Link]
Connecting to [Link] ([Link])|[Link]|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: [Link] [following]
--2023-03-27 [Link]-- [Link]
Connecting to [Link] ([Link])|[Link]|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: [Link] [following]
--2023-03-27 [Link]-- [Link]
Resolving [Link] ([Link])... [Link]
Connecting to [Link] ([Link])|[Link]|:443...
connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘[Link]’
[Link] 100%[===================>] 822.24M 5.00MB/s in 2m 43s
2023-03-27 [Link] (5.06 MB/s) - ‘[Link]’ saved [862182613/862182613]
In [12]:
# Extract the archive (four glove.6B.*.txt files per the log); only the 100-d file is used below.
!unzip glove*.zip
Archive: [Link]
inflating: [Link]
inflating: [Link]
inflating: [Link]
inflating: [Link]
In [13]:
class F1score(tf.keras.callbacks.Callback):
    """Keras callback that prints the micro-averaged F1 score on a held-out
    set at the end of every epoch.

    NOTE(review): np and f1_score are imported in a *later* cell — move those
    imports above this cell so Restart & Run All works.

    Args:
        x_test: padded input sequences to evaluate on.
        y_test: one-hot (n_samples, 20) ground-truth label matrix.
    """

    def __init__(self, x_test, y_test):
        super(F1score, self).__init__()
        self.x_test = x_test
        self.y_test = y_test
        # Kept for interface compatibility; never appended to below.
        self.history = {'auc': [], 'F1_score': []}

    def on_epoch_end(self, epoch, logs={}):
        # Predict class probabilities, then turn each row into a one-hot
        # vector via argmax so it is comparable with the one-hot y_test.
        y_pred = self.model.predict(self.x_test)
        metrix1 = []
        for val in y_pred:
            arr = np.zeros(20, dtype=int)
            arr[np.argmax(val)] = 1
            metrix1.append(arr)
        f1 = f1_score(self.y_test, np.array(metrix1), average='micro')
        print("The F1 score for this epoch is: ", f1)
In [14]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers import (Dense, Input, Activation, BatchNormalization,
                                     Dropout, Embedding, LSTM, Flatten, Conv1D,
                                     MaxPooling1D, concatenate)
from tensorflow.keras.regularizers import l2
from tensorflow.keras.models import Model
import random as rn
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
from tensorflow.keras.callbacks import (ModelCheckpoint, TensorBoard,
                                        EarlyStopping, LearningRateScheduler)
from tensorflow.keras.preprocessing import sequence
In [15]:
import datetime
## Model 2
In [16]:
# NOTE(review): the seeded RNG was lost in export ("[Link](7)"); np.random
# matches the numpy import above — confirm against the original notebook.
np.random.seed(7)
# Character-level tokenizer; characters unseen during fit map to 'UNK'.
tc = Tokenizer(num_words=None, char_level=True, oov_token='UNK',
               filters='!"#$%&()*+,-./:;<=>?@[\\ ]^`{|}~\t\n')
tc.fit_on_texts(X_train['preprocessed_data'])
# In order to keep a special character just remove it from the string of
# filters. For example to keep "_" you can remove it from the above string
# of punctuations.
# oov_token is for some word or character that the fit vectorizer has not
# seen in the training data.
# Replace the fitted vocabulary with a fixed a-z index (1..26) so the mapping
# is stable across runs; the OOV token gets index 27.
alphabet = "abcdefghijklmnopqrstuvwxyz"
char_dict = {}
for i, char in enumerate(alphabet):
    char_dict[char] = i + 1
tc.word_index = char_dict.copy()
tc.word_index[tc.oov_token] = max(char_dict.values()) + 1
In [17]:
# 26 letters + OOV + 1 for the reserved padding index 0 -> 28.
vocab_size=len(tc.word_index)+1
vocab_size
Out[17]:
28
In [18]:
# Encode each document as a sequence of per-character integer indices.
sequencing_chardocs_train= tc.texts_to_sequences(X_train['preprocessed_data'])
sequencing_chardocs_test= tc.texts_to_sequences(X_test['preprocessed_data'])
#print(sequencing_chardocs_train[1])
In [19]:
# Pad/truncate every document to exactly 10,000 characters, zero-padding at
# the end ('post') so real content stays at the start of the sequence.
padded_chardocs_train = pad_sequences(sequencing_chardocs_train, maxlen=10000,
                                      padding='post')
padded_chardocs_test = pad_sequences(sequencing_chardocs_test, maxlen=10000,
                                     padding='post')
In [20]:
from numpy import array
from numpy import asarray
from numpy import zeros
In [21]:
# Load the whole embedding into memory: word -> 100-d float32 vector.
# NOTE(review): filename lost in export — 100d matches the (vocab_size, 100)
# embedding matrix built below; confirm against the original notebook.
embeddings_index = dict()
# 'with' guarantees the file is closed even if a line fails to parse.
with open('glove.6B.100d.txt') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
print('Loaded %s word vectors.' % len(embeddings_index))
Loaded 400000 word vectors.
In [22]:
# Create a weight matrix for the tokens in the training docs: row i holds the
# GloVe vector for the character with index i; rows with no pre-trained
# vector (e.g. the OOV token) stay all-zero.
embedding_matrix = zeros((vocab_size, 100))
for word, i in tc.word_index.items():
    vec = embeddings_index.get(word)
    if vec is not None:
        embedding_matrix[i] = vec
In [23]:
# Timestamped TensorBoard run directory; histogram_freq=1 logs weight
# histograms every epoch.
log_dir = "logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = TensorBoard(log_dir=log_dir, histogram_freq=1)
In [24]:
# Seed numpy and TensorFlow for (best-effort) reproducibility, and clear any
# previous Keras graph state.
from numpy.random import seed
seed(1)
tf.random.set_seed(2)
tf.keras.backend.clear_session()

# Character-level CNN: frozen GloVe embedding, two conv+pool stages, dense head.
inputs = Input(shape=(10000,), dtype='int32')
# Embedding is initialised from the GloVe matrix and frozen (trainable=False).
e = Embedding(vocab_size, 100, weights=[embedding_matrix], input_length=10000,
              trainable=False)(inputs)
x1 = Conv1D(3, 7, kernel_initializer='glorot_uniform', activation='relu')(e)
x2 = Conv1D(3, 8, kernel_initializer='glorot_uniform', activation='relu')(x1)
max_pool1 = MaxPooling1D(3)(x2)
y1 = Conv1D(3, 9, kernel_initializer='glorot_uniform', activation='relu')(max_pool1)
y2 = Conv1D(3, 4, kernel_initializer='glorot_uniform', activation='relu')(y1)
max_pool2 = MaxPooling1D(3)(y2)
flatten = Flatten()(max_pool2)
drop_out1 = Dropout(0.5)(flatten)
dense_layer1 = Dense(100, activation='relu')(drop_out1)
# 20 newsgroup classes -> softmax probabilities.
output_layer = Dense(20, activation='softmax')(dense_layer1)

# SGD with momentum, per the original momentum=0.9 argument.
optimizer1 = tf.keras.optimizers.SGD(learning_rate=0.01, momentum=0.9)
earlystop = EarlyStopping(monitor='accuracy', min_delta=0.00, patience=2, verbose=1)
f1score = F1score(x_test=padded_chardocs_test, y_test=y_test_label)
model2 = Model(inputs=inputs, outputs=output_layer)
callback_list = [tensorboard_callback, earlystop, f1score]
model2.compile(optimizer=optimizer1, loss='categorical_crossentropy',
               metrics=['accuracy'])
# FIX: summary() already prints and returns None — the original
# print(model2.summary()) emitted a stray "None" line after the table.
model2.summary()
Model: "model"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
input_1 (InputLayer) [(None, 10000)] 0
embedding (Embedding) (None, 10000, 100) 2800
conv1d (Conv1D) (None, 9994, 3) 2103
conv1d_1 (Conv1D) (None, 9987, 3) 75
max_pooling1d (MaxPooling1D (None, 3329, 3) 0
)
conv1d_2 (Conv1D) (None, 3321, 3) 84
conv1d_3 (Conv1D) (None, 3318, 3) 39
max_pooling1d_1 (MaxPooling (None, 1106, 3) 0
1D)
flatten (Flatten) (None, 3318) 0
dropout (Dropout) (None, 3318) 0
dense (Dense) (None, 100) 331900
dense_1 (Dense) (None, 20) 2020
=================================================================
Total params: 339,021
Trainable params: 336,221
Non-trainable params: 2,800
_________________________________________________________________
None
In [25]:
# Train for 10 epochs; callbacks: TensorBoard logging, early stopping on
# training accuracy, and the per-epoch F1 printout on the test set.
history2 = model2.fit(padded_chardocs_train, y_train_label, epochs=10,
                      validation_data=(padded_chardocs_test, y_test_label),
                      verbose=1, callbacks=callback_list)
Epoch 1/10
148/148 [==============================] - 1s 7ms/step
The F1 score for this epoch is: 0.08328022094752496
442/442 [==============================] - 21s 24ms/step - loss: 2.9589 - accuracy: 0.072
4 - val_loss: 2.9405 - val_accuracy: 0.0833
Epoch 2/10
148/148 [==============================] - 1s 7ms/step
The F1 score for this epoch is: 0.08582961546632675
442/442 [==============================] - 10s 22ms/step - loss: 2.9288 - accuracy: 0.084
1 - val_loss: 2.9381 - val_accuracy: 0.0858
Epoch 3/10
148/148 [==============================] - 1s 7ms/step
The F1 score for this epoch is: 0.08880390907159548
442/442 [==============================] - 10s 23ms/step - loss: 2.9176 - accuracy: 0.088
9 - val_loss: 2.9139 - val_accuracy: 0.0888
Epoch 4/10
148/148 [==============================] - 1s 7ms/step
The F1 score for this epoch is: 0.0922031017633312
442/442 [==============================] - 10s 22ms/step - loss: 2.9080 - accuracy: 0.091
7 - val_loss: 2.9188 - val_accuracy: 0.0922
Epoch 5/10
148/148 [==============================] - 1s 7ms/step
The F1 score for this epoch is: 0.09135330359039727
442/442 [==============================] - 10s 22ms/step - loss: 2.8906 - accuracy: 0.102
5 - val_loss: 2.8933 - val_accuracy: 0.0914
Epoch 6/10
148/148 [==============================] - 1s 7ms/step
The F1 score for this epoch is: 0.0815806246016571
442/442 [==============================] - 10s 23ms/step - loss: 2.8717 - accuracy: 0.103
3 - val_loss: 2.9425 - val_accuracy: 0.0816
Epoch 7/10
148/148 [==============================] - 1s 7ms/step
The F1 score for this epoch is: 0.09985128531973657
442/442 [==============================] - 10s 23ms/step - loss: 2.8499 - accuracy: 0.110
8 - val_loss: 2.8598 - val_accuracy: 0.0999
Epoch 8/10
148/148 [==============================] - 1s 8ms/step
The F1 score for this epoch is: 0.09772678988740174
442/442 [==============================] - 10s 22ms/step - loss: 2.8210 - accuracy: 0.111
8 - val_loss: 2.8454 - val_accuracy: 0.0977
Epoch 9/10
148/148 [==============================] - 1s 7ms/step
The F1 score for this epoch is: 0.11089866156787763
442/442 [==============================] - 10s 22ms/step - loss: 2.7962 - accuracy: 0.123
6 - val_loss: 2.8333 - val_accuracy: 0.1109
Epoch 10/10
148/148 [==============================] - 1s 7ms/step
The F1 score for this epoch is: 0.1077119184193754
442/442 [==============================] - 10s 22ms/step - loss: 2.7557 - accuracy: 0.132
9 - val_loss: 2.8152 - val_accuracy: 0.1077
In [26]:
# Persist the trained weights (architecture is not saved — rebuild the model
# before loading them back).
model2.save_weights('model2weights.h5')
In [27]:
# Final-epoch *training* accuracy as a percentage.
# NOTE(review): val_accuracy would be the fairer number to report.
print("Accuracy->", (history2.history['accuracy'][-1]) * 100)
Accuracy-> 13.292260468006134
In [28]:
# Render the architecture diagram (top-to-bottom) with layer shapes and names.
# NOTE(review): output filename lost in export — 'model2.png' assumed.
tf.keras.utils.plot_model(model2, to_file='model2.png', show_shapes=True,
                          show_layer_names=True, rankdir='TB',
                          expand_nested=False, dpi=96)
Out[28]: