-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathutils.py
111 lines (95 loc) · 3.54 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
import numpy as np
import pandas as pd
from math import *
import tqdm
import random
import os
import re
from sklearn.preprocessing import LabelEncoder, StandardScaler
import Ville.setting
from Ville.train import Set_fold
# set all seeds
def SEED_Everything(seed=None):
    """Seed the stdlib `random` module and NumPy for reproducibility.

    Args:
        seed: Integer seed to use. Defaults to ``Ville.setting.SEED`` when
            None, so the original no-argument call ``SEED_Everything()``
            keeps its behavior.

    NOTE(review): assigning PYTHONHASHSEED at runtime does not change str
    hashing of the already-running interpreter — it only affects
    subprocesses launched afterwards; confirm that is the intent.
    """
    if seed is None:
        seed = Ville.setting.SEED
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
# compute the distance between two points (helper: lat/lng -> unit vector)
def LatLng_to_xyz(lat, lng):
    """Convert latitude/longitude in degrees to a unit vector (x, y, z)."""
    phi = radians(lat)
    lam = radians(lng)
    x = cos(phi) * cos(lam)
    y = cos(phi) * sin(lam)
    z = sin(phi)
    return x, y, z
def Dist_On_Sphere(pos0_lat, pos0_lng, pos1_lat, pos1_lng, radius=6378.137):
    """Great-circle distance between two lat/lng points given in degrees.

    Args:
        pos0_lat, pos0_lng: first point, degrees.
        pos1_lat, pos1_lng: second point, degrees.
        radius: sphere radius; default is Earth's equatorial radius in km.

    Returns:
        Distance along the sphere's surface, in the same unit as `radius`.
    """
    xyz0 = LatLng_to_xyz(pos0_lat, pos0_lng)
    xyz1 = LatLng_to_xyz(pos1_lat, pos1_lng)
    dot = sum(x * y for x, y in zip(xyz0, xyz1))
    # Clamp to acos's domain: floating-point error can push the dot product
    # of (near-)identical unit vectors slightly outside [-1, 1], which made
    # the original raise ValueError (math domain error) for equal points.
    dot = max(-1.0, min(1.0, dot))
    return acos(dot) * radius
# take aggregate statistics
def MakeGroupstat(df, categorical_features=None, quant_features=None, report=True):
    """Add per-category aggregate columns for each (categorical, quantitative) pair.

    For every categorical column C and quantitative column Q, adds columns
    named ``C_mean_Q``, ``C_max_Q``, ``C_min_Q``, ``C_std_Q`` holding the
    groupby(C) aggregate of Q broadcast back to each row.

    Args:
        df: input DataFrame, mutated in place and also returned.
        categorical_features: columns to group by; None is treated as empty
            (the original crashed on the None default).
        quant_features: columns to aggregate; None is treated as empty.
        report: unused; kept for interface compatibility.

    Bug fix: the file imports the tqdm *module* (``import tqdm``), so the
    original call ``tqdm(categorical_features)`` raised TypeError — the
    progress-bar callable is ``tqdm.tqdm``.
    """
    categorical_features = categorical_features or []
    quant_features = quant_features or []
    for add_cat_col in tqdm.tqdm(categorical_features):
        for add_qua_col in quant_features:
            for typ in ('mean', 'max', 'min', 'std'):
                new_col = add_cat_col + '_' + typ + '_' + add_qua_col
                df[new_col] = df.groupby([add_cat_col])[add_qua_col].transform(typ)
    return df
# LabelEncoder
def LabelEncoding_separate(train, test):
    """Label-encode every column that is object-dtype in train or test.

    Each encoder is fitted on the union of both frames' values so train and
    test share one integer mapping. Both frames are mutated in place.

    Returns:
        (train, test, cat) where ``cat`` lists the encoded column names.
    """
    cat = []
    for col in train.columns:
        if not (train[col].dtype == 'object' or test[col].dtype == 'object'):
            continue
        encoder = LabelEncoder()
        combined = list(train[col].values) + list(test[col].values)
        encoder.fit(combined)
        train[col] = encoder.transform(list(train[col].values))
        test[col] = encoder.transform(list(test[col].values))
        cat.append(col)
    return train, test, cat
def LabelEncoding_total(total):
    """Label-encode every object-dtype column of a single DataFrame in place.

    Returns:
        (total, cat) where ``cat`` lists the encoded column names.
    """
    cat = []
    for col in total.columns:
        if total[col].dtype != 'object':
            continue
        encoder = LabelEncoder()
        values = list(total[col].values)
        encoder.fit(values)
        total[col] = encoder.transform(values)
        cat.append(col)
    return total, cat
# target encoding
def TargetEncoding(train, test, target, cat_cols, num_folds=5):
    """Out-of-fold target-encode the given categorical columns in place.

    Test rows are mapped with category means computed on the full training
    target; each train row gets the mean computed on the *other* folds only,
    so its own target value does not leak into its encoding.

    Args:
        train, test: DataFrames, mutated in place and returned.
        target: target series aligned with ``train``.
        cat_cols: categorical column names to encode.
        num_folds: number of folds passed to ``Set_fold('kfold', ...)``.
    """
    for col in cat_cols:
        frame = pd.DataFrame({col: train[col], 'target': target})
        # Full-data category means for the test set.
        test[col] = test[col].map(frame.groupby(col)['target'].mean())
        encoded = np.repeat(np.nan, train.shape[0])
        folds = Set_fold('kfold', num_folds)
        for fit_idx, enc_idx in folds.split(train):
            fold_means = frame.iloc[fit_idx].groupby(col)['target'].mean()
            encoded[enc_idx] = train[col].iloc[enc_idx].map(fold_means)
        train[col] = encoded
    return train, test
def StandardScale(df):
    """Standardize df (zero mean, unit variance per column).

    Note: returns a float32 NumPy array, not a DataFrame.
    """
    scaled = StandardScaler().fit_transform(df)
    return scaled.astype(np.float32)
def Calculation_Feature(df, col1, col2):
    """Add the four arithmetic combinations of col1 and col2 as new columns.

    New columns are named ``col1+col2``, ``col1-col2``, ``col1*col2`` and
    ``col1/col2``; df is mutated in place and returned.
    """
    operations = {
        '+': lambda a, b: a + b,
        '-': lambda a, b: a - b,
        '*': lambda a, b: a * b,
        '/': lambda a, b: a / b,
    }
    for symbol, op in operations.items():
        df[col1 + symbol + col2] = op(df[col1], df[col2])
    return df
def Imputation(df, method):
    """Fill NaNs with a per-column statistic.

    Args:
        df: input DataFrame (not mutated; fillna returns a copy).
        method: one of 'mean', 'median', 'mode'. Any other value returns
            df unchanged.
    """
    if method == 'mean':
        fill_values = df.mean()
    elif method == 'median':
        fill_values = df.median()
    elif method == 'mode':
        fill_values = df.mode().iloc[0]
    else:
        return df
    return df.fillna(fill_values)
def Get_Outlier_sigma(df, col, num=2):
    """Flag rows where df[col] >= mean + num * std in a new 0/1 column.

    Adds ``col + '_outlier'`` (1 for flagged rows, else 0); df is mutated
    in place and returned.

    Bug fix: the original used chained-indexing assignment
    ``df[c][mask] = 1``, which pandas flags with SettingWithCopyWarning and
    which can silently write to a temporary copy instead of df; use .loc.
    """
    threshold = df[col].mean() + df[col].std() * num
    df[col + '_outlier'] = 0
    df.loc[df[col] >= threshold, col + '_outlier'] = 1
    return df
def Get_Outlier_value_upper(df, col, v):
    """Flag rows where df[col] >= v in a new 0/1 column ``col + '_outlier'``.

    df is mutated in place and returned.

    Bug fix: replaces the original chained-indexing assignment
    (``df[c][mask] = 1``, SettingWithCopyWarning / possible silent no-op)
    with a .loc write.
    """
    df[col + '_outlier'] = 0
    df.loc[df[col] >= v, col + '_outlier'] = 1
    return df
def Get_Outlier_value_bottom(df, col, v):
    """Flag rows where df[col] <= v in a new 0/1 column ``col + '_outlier'``.

    df is mutated in place and returned.

    Bug fix: replaces the original chained-indexing assignment
    (``df[c][mask] = 1``, SettingWithCopyWarning / possible silent no-op)
    with a .loc write.
    """
    df[col + '_outlier'] = 0
    df.loc[df[col] <= v, col + '_outlier'] = 1
    return df