forked from singh-shreya6/Spam-Detection
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathspamdetect.py
103 lines (76 loc) · 3.25 KB
/
spamdetect.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import f1_score,accuracy_score
# Read the data set in CSV format; change the path to wherever spam.csv
# lives on your machine.
data = pd.read_csv("C://Users//Shreya//Desktop//Sem 7//Group7A Assignment 2//spam.csv",encoding='latin') #Change here
data.rename(columns={'v1':'Class','v2':'Text'},inplace=True)
# Numeric target column: 0 for 'ham', 1 for 'spam'.
data['numClass'] = data['Class'].map({'ham':0, 'spam':1})
# Length of every message, computed in one vectorised pass instead of a
# row-by-row .loc assignment loop.
data['Count'] = data['Text'].str.len()

# Unique values in target set
print("Unique values in the Class set: ", data.Class.unique())

# Per-class frequency table of message lengths, ordered by length.
# Series.value_counts replaces the deprecated top-level pd.value_counts.
ham = data[data.numClass == 0]
ham_count = pd.DataFrame(ham['Count'].value_counts(sort=True).sort_index())
print("Number of ham messages in data set:", ham['Class'].count())

spam = data[data.numClass == 1]
spam_count = pd.DataFrame(spam['Count'].value_counts(sort=True).sort_index())
print("Number of spam messages in data set:", spam['Class'].count())

# Removing stopwords of English
stopset = set(stopwords.words("english"))

# Initialising Count Vectorizer.
# The stop-word/binary configuration used to be overwritten straight away by
# a bare CountVectorizer(), so the stop words were never actually removed.
# stop_words is passed as a list because scikit-learn does not accept a set.
vectorizer = CountVectorizer(stop_words=sorted(stopset), binary=True)
X = vectorizer.fit_transform(data.Text)

# Extract target column 'Class'
y = data.numClass

# Performing test train Split (70% train / 30% test; random_state=None means
# the split differs from run to run).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, train_size=0.70, random_state=None)

# Show the results of the split
print("\n")
print("Training set has {} samples.".format(X_train.shape[0]))
print("Testing set has {} samples.".format(X_test.shape[0]))
print("\n")

# Display names of the models, in the order they are evaluated and plotted.
objects = ('Multi-NB','SVM','KNN', 'RF', 'AdaBoost')
def train_classifier(clf, X_train, y_train):
    """Fit ``clf`` in place on the given training data.

    Args:
        clf: Any object exposing a ``fit(X, y)`` method.
        X_train: Training features, passed to ``clf.fit`` unchanged.
        y_train: Training labels, passed to ``clf.fit`` unchanged.

    Returns:
        None. Whatever ``clf.fit`` returns is discarded.
    """
    clf.fit(X_train, y_train)
# Function to predict labels for a set of features
def predict_labels(clf, features):
    """Return the predictions of ``clf`` for ``features``.

    Args:
        clf: Any object exposing a ``predict(features)`` method.
        features: Input passed to ``clf.predict`` unchanged.

    Returns:
        Whatever ``clf.predict(features)`` returns.
    """
    return clf.predict(features)
# Initialize the five models
A = MultinomialNB(alpha=1.0,fit_prior=True)
B = LinearSVC()
C = KNeighborsClassifier(n_neighbors=1)
D = RandomForestClassifier(n_estimators=10, max_depth=None, min_samples_split=2, random_state=None)
E = AdaBoostClassifier(n_estimators=100)

# Models in the same order as their display names in `objects`.
clf = [A,B,C,D,E]
if len(clf) != len(objects):
    raise ValueError("objects and clf must have the same length")

# One accuracy slot per model; sized from the list rather than hard-coded.
acc_score = [0] * len(clf)

# Train each model, then report its accuracy and F1 score on the test split.
for a, (label, model) in enumerate(zip(objects, clf)):
    print(label)
    train_classifier(model, X_train, y_train)
    y_pred = predict_labels(model, X_test)
    pred_val = f1_score(y_test, y_pred)
    acc_score[a] = accuracy_score(y_test, y_pred, normalize=True, sample_weight=None)
    print("Accuracy in %:")
    print(acc_score[a]*100)
    print("F1 Score")
    print(pred_val)
    print("\n")

# Bar chart of the accuracy of each model (as a fraction, not a percentage).
y_pos = np.arange(len(objects))
y_val = list(acc_score)
plt.bar(y_pos,y_val, align='center', alpha=0.7)
plt.xticks(y_pos, objects)
plt.ylabel('Accuracy Score')
plt.title('Accuracy of Models')
plt.show()