naive_bayes.py
import numpy as np, pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import roc_auc_score
import pickle
from project_utils import tokenize
path = "~/Google Drive/" #directory on emily's laptop
KAGGLE_TRAIN = pd.read_csv(path + "train.csv")
COMMENT = 'comment_text'
LABELS = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

def split_data(KAGGLE_TRAIN):
    '''
    Splits Kaggle's train data into a 60-20-20 train / dev / test ratio.
    Can optionally add a 'none' label (currently commented out).
    There are a few NAs in the comments; these are replaced with "unknown",
    which is usually a little happier for sklearn models.
    '''
    X = KAGGLE_TRAIN.iloc[:, :2]
    y = KAGGLE_TRAIN.iloc[:, 2:]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4,
                                                        random_state=94110)
    X_dev, X_test, y_dev, y_test = train_test_split(X_test, y_test, test_size=0.5,
                                                    random_state=94110)
    train = pd.concat([X_train, y_train], axis=1)
    dev = pd.concat([X_dev, y_dev], axis=1)
    test = pd.concat([X_test, y_test], axis=1)
    for d in [train, dev, test]:
        # d['none'] = 1 - d[LABELS].max(axis=1)
        d[COMMENT] = d[COMMENT].fillna("unknown")
    return train, dev, test

def createDocTermMatrices(train, dev, test):
    '''
    Creates TF-IDF document-term matrices from pandas dataframes whose
    COMMENT column contains the comment text.
    The tokenizer removes punctuation from sentences and converts them to lowercase.
    '''
    vec = TfidfVectorizer(ngram_range=(1, 2), tokenizer=tokenize,
                          min_df=3, max_df=0.9, strip_accents='unicode',
                          use_idf=1, smooth_idf=1, sublinear_tf=1)
    train_dtm = vec.fit_transform(train[COMMENT])
    dev_dtm = vec.transform(dev[COMMENT])
    test_dtm = vec.transform(test[COMMENT])
    return train_dtm, dev_dtm, test_dtm

def model_one_label(dtm, label):
    clf = MultinomialNB()  # also test other NB variants and see which works best
    return clf.fit(dtm, label)

def naive_bayes(dtm_train, dtm_test, y_train):
    '''
    Returns prediction probabilities for each class.
    '''
    pred_mat = np.zeros((dtm_test.shape[0], y_train.shape[1]))
    for i, j in enumerate(LABELS):
        print('fit', j)
        mod = model_one_label(dtm_train, y_train[j])
        pred_mat[:, i] = mod.predict_proba(dtm_test)[:, 1]
    return pred_mat

if __name__ == "__main__":
    train, dev, test = split_data(KAGGLE_TRAIN)
    print("creating doc term matrices...")
    train_dtm, dev_dtm, test_dtm = createDocTermMatrices(train, dev, test)
    print("successfully created doc term matrices")
    y_train = train.iloc[:, 2:]
    y_dev = dev.iloc[:, 2:]
    print("running naive bayes model...")
    preds = naive_bayes(train_dtm, dev_dtm, y_train)
    auc = roc_auc_score(y_dev, preds)
    print("auc-roc: " + str(auc))  # dev AUC score: 0.836970136328