Predict_location_RRS.py
import statistics

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, precision_score, recall_score
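# This script predicts flood (1) / non-flood (0) labels for all samples from one held-out event date:
# it trains an ensemble of random forests on balanced random subsamples of the non-flood class,
# predicts the held-out event with every sub-model, and takes a per-sample majority vote.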
# Load the full dataset; df keeps only the feature columns plus the f_nf label
df_csv = pd.read_csv(r'F:\RandomForest\for_RF.csv', delimiter=',')
df = df_csv.drop(['Long', 'Lat', 'event_date'], axis=1)

# Hold out a single event date as the test set and train on all other dates
date = 'Aug18_21_18'
test_df = df_csv[df_csv['event_date'] == date]
print(test_df)
print(len(test_df))
train_df = df_csv[df_csv['event_date'] != date]
print(len(train_df))
test_indices = df_csv.index[df_csv['event_date'] == date].tolist()
print(len(test_indices))

Test_Y = test_df['f_nf']  # test labels (flood = 1, non-flood = 0)
Test_X = test_df.drop(['Long', 'Lat', 'event_date', 'f_nf'], axis=1)  # test features
print(Test_X)
# Class balance in the training data: flood (1) vs. non-flood (0) samples
f_train = train_df[train_df['f_nf'] == 1]   # flood samples in training
nf_train = train_df[train_df['f_nf'] == 0]  # non-flood samples in training

# Each sub-model trains on all flood samples plus n times as many non-flood samples;
# NoS is the number of sub-models needed to cycle through the non-flood samples.
n = 3
NoSS = len(nf_train) / (n * len(f_train))
print(NoSS)
# Force an odd number of sub-models so the majority vote cannot tie
if round(NoSS) % 2 == 0:
    NoS = round(NoSS) - 1
else:
    NoS = round(NoSS)
# NoS = int(NoSS)
print(NoS)
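# Each sub-model below is trained on all flood samples plus a random draw of n times as many
# non-flood samples; used non-flood samples are removed before the next draw, so within one
# repetition every non-flood sample ends up in exactly one sub-model's training set (the last
# sub-model takes whatever remains and uses class_weight='balanced' to offset any imbalance).
# The whole procedure is repeated 11 times (outer itr loop) and all per-model predictions are
# pooled for the majority vote. Note that np.random.choice is not seeded, so the drawn subsets
# (and hence the final vote) can vary between runs; np.random.seed(...) could be added here
# for reproducibility.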
prediction = []
for itr in range(0, 11):
    # Fresh copy of the training data for each repetition of the subsampling procedure
    train_df_copy = train_df.drop(['Long', 'Lat', 'event_date'], axis=1)
    for i in range(0, int(NoS)):
        if i < (NoS - 1):
            print('train dataframe elements: ' + str(i))
            Tr_F = train_df_copy[train_df_copy['f_nf'] == 1]          # flood samples in training
            nf_train_df = train_df_copy[train_df_copy['f_nf'] == 0]   # non-flood samples in training
            nf_train_indices = train_df_copy.index[train_df_copy['f_nf'] == 0].tolist()  # non-flood sample indices
            print('nf_train_indices ' + str(len(nf_train_indices)))
            print(len(train_df_copy))
            # Pick random non-flood index labels so that non-flood samples number n times the flood samples
            nf_rand_ind = np.random.choice(nf_train_indices, size=n * len(Tr_F), replace=False).tolist()
            Tr_nF = df.loc[nf_rand_ind]  # non-flood training rows for the randomly selected index labels
            Train_df = pd.concat([Tr_F, Tr_nF])  # training dataset containing both flood and non-flood samples
            print(len(Train_df['f_nf']))
            Train_X = Train_df.drop(['f_nf'], axis=1)  # training features
            Train_Y = Train_df['f_nf']                 # training labels
            train_df_copy = train_df_copy.drop(labels=nf_rand_ind)  # remove the non-flood samples used in this iteration
            rf_clf = RandomForestClassifier(n_estimators=100, random_state=7)
            rf_fit = rf_clf.fit(Train_X, Train_Y)
            rf_predict = rf_fit.predict(Test_X)
        elif i == (NoS - 1):
            # Last sub-model: all flood samples plus whatever non-flood samples remain
            print('Last train dataframe elements ' + str(i))
            Tr_F = train_df_copy[train_df_copy['f_nf'] == 1]
            Tr_nF = train_df_copy[train_df_copy['f_nf'] == 0]
            Train_df = pd.concat([Tr_F, Tr_nF])
            print(len(Train_df['f_nf']))
            Train_X = Train_df.drop(['f_nf'], axis=1)
            Train_Y = Train_df['f_nf']
            print('Last training set: ' + str(len(Train_df)))
            rf_clf = RandomForestClassifier(n_estimators=100, random_state=7, class_weight='balanced')
            rf_fit = rf_clf.fit(Train_X, Train_Y)
            rf_predict = rf_fit.predict(Test_X)
            print('done')
        prediction.append(rf_predict.tolist())
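# After both loops, `prediction` holds 11 * NoS label vectors (one per trained sub-model) for the
# test samples; both factors are odd, so the per-sample vote below cannot tie.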
# Majority vote across all sub-model predictions for each test sample
majority = []
for lng in range(0, len(prediction[0])):
    lst2 = [item[lng] for item in prediction]  # every sub-model's prediction for test sample lng
    m = statistics.mode(lst2)
    majority.append(m)
print(len(majority))
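# An equivalent vectorized vote (assuming every sub-model returns binary 0/1 labels) would be:
#   votes = np.array(prediction)                    # shape: (number of sub-models, number of test samples)
#   majority = (votes.mean(axis=0) >= 0.5).astype(int).tolist()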
# Precision, recall, false positive and false negative rates
# (sklearn confusion_matrix layout: rows = true class, columns = predicted class, label order [0, 1])
cm = confusion_matrix(Test_Y.tolist(), majority)
print(cm)
TN, FP, FN, TP = cm[0][0], cm[0][1], cm[1][0], cm[1][1]
print(TP, TN, FP, FN)
FP_rate = FP * 100 / (TN + FP)
FN_rate = FN * 100 / (TP + FN)
print("FP rate " + str(FP_rate))
print("FN rate " + str(FN_rate))
print('non-flood precision: ' + str(precision_score(Test_Y.tolist(), majority, pos_label=0, average='binary')))
print('flood precision: ' + str(precision_score(Test_Y.tolist(), majority, pos_label=1, average='binary')))
print('non-flood recall: ' + str(recall_score(Test_Y.tolist(), majority, pos_label=0, average='binary')))
print('flood recall: ' + str(recall_score(Test_Y.tolist(), majority, pos_label=1, average='binary')))
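# Optional: sklearn's classification_report prints the same per-class precision and recall in one table:
#   from sklearn.metrics import classification_report
#   print(classification_report(Test_Y.tolist(), majority, target_names=['non-flood', 'flood']))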
# Save the majority-vote prediction, read it back, and merge it with the observed test samples
np.savetxt(r'F:\RandomForest\Predict_location\%s_pred.csv' % date, np.array(majority),
           delimiter=',', header='Prediction', comments='')
predict_df = pd.read_csv(r'F:\RandomForest\Predict_location\%s_pred.csv' % date)
print(predict_df['Prediction'])
print(len(predict_df[predict_df['Prediction'] == 1]))
obs = df_csv.iloc[test_indices]
print(len(obs))
obs.reset_index(drop=True, inplace=True)
predict_df.reset_index(drop=True, inplace=True)
final_df = pd.concat([obs, predict_df], axis=1)
final_df.to_csv(r'F:\RandomForest\Predict_location\%s_%s.csv' % (date, n), header=True)