-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrf_baseline_sklearn.py
53 lines (42 loc) · 1.71 KB
/
rf_baseline_sklearn.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
import numpy as np
import pandas as pd
from project_utils import *
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, accuracy_score
from scipy.sparse import hstack
# Indices into CLASS_NAMES selecting which classes get a model trained and
# scored by fit_logistic_ngram_sklearn.
RUN_CLASSES = range(6)
# The three labels below are passed to save_auc_scores to tag this run's
# saved results.
APPROACH = "ngram"
CLASSIFIER = "rf"
# NOTE(review): "SAG" looks like a leftover from a logistic-regression variant
# of this script (the classifier used here is a random forest) — confirm
# before renaming, since the value is handed to save_auc_scores.
FLAVOR = "sklearn-SAG"
def fit_logistic_ngram_sklearn(train, dev, test, n_estimators=100, n_jobs=4):
    """Fit and evaluate one-vs-all random forests with the sklearn package.

    NOTE: despite the "logistic" in the name, the model trained here is a
    ``RandomForestClassifier``; the name is kept so existing callers work.

    The three splits are vectorized with ``vectorize_corpus_tf_idf``. Then,
    for every class selected by ``RUN_CLASSES``, a separate forest is fitted
    on the training vectors and scored by ROC-AUC on the test vectors.

    Args:
        train: a pd.DataFrame of the training data.
        dev: a pd.DataFrame of the dev data. It is handed to the vectorizer,
            but its vectors are not used for training or scoring here.
        test: a pd.DataFrame of the test data.
        n_estimators: number of trees in each forest (default 100).
        n_jobs: number of parallel jobs used by each forest (default 4).

    Returns:
        A list of ROC-AUC scores, one per evaluated class, in the order
        given by ``RUN_CLASSES``. Callers average it themselves if needed.
    """
    # The helper returns dev vectors as well; they are not needed below.
    train_vecs, _dev_vecs, test_vecs = vectorize_corpus_tf_idf(
        train, dev, test, sparse=True)

    # Doing one-vs-all training
    auc_scores = []
    for class_name in [CLASS_NAMES[x] for x in RUN_CLASSES]:
        print('doing class {}'.format(class_name))

        # Training model (seeded with RUN_SEED)
        classifier = RandomForestClassifier(
            n_estimators=n_estimators, n_jobs=n_jobs, random_state=RUN_SEED)
        model = classifier.fit(train_vecs, train[class_name])

        # Computing ROC
        # predict_proba yields one column per class seen during fit; this
        # assumes get_onehots_from_labels produces matching columns for the
        # test labels — TODO confirm against project_utils.
        test_pred = model.predict_proba(test_vecs)
        test_target = get_onehots_from_labels(test[class_name].values)
        roc_auc = roc_auc_score(test_target, test_pred)
        auc_scores.append(roc_auc)
        print('--AUC score is {}'.format(roc_auc))

    return auc_scores
if __name__ == '__main__':
    # Read the raw CSV and replace missing cells with a single space before
    # splitting it into the train / dev / test frames.
    raw_frame = pd.read_csv('train.csv')
    filled_frame = raw_frame.fillna(' ')
    train_df, dev_df, test_df = get_TDT_split(filled_frame)

    # Train and score one model per class, then save the per-class scores
    # tagged with this run's approach / classifier / flavor labels.
    per_class_auc = fit_logistic_ngram_sklearn(train_df, dev_df, test_df)
    save_auc_scores(per_class_auc, APPROACH, CLASSIFIER, FLAVOR)

    # Report the mean over the evaluated classes.
    mean_auc = np.mean(per_class_auc)
    print('Avg ROC score is {}'.format(mean_auc))