Sampling Techniques for Imbalanced Data
To adjust for the class imbalance, I tried SMOTE, UnderSampling, and OverSampling (via imbalanced-learn).
The runs below gave the following results.
$ python svm_new.py NONE
{'classifier__C': 0.4, 'classifier__gamma': 0.1}
Accuracy: 0.8134074074074074
[[20287   719]
 [ 4319  1675]]
$ python svm_new.py UNDER
{'classifier__C': 0.4, 'classifier__gamma': 0.1}
Accuracy: 0.7540370370370371
[[17062  3944]
 [ 2697  3297]]
$ python svm_new.py OVER
{'classifier__C': 0.4, 'classifier__gamma': 0.4}
Accuracy: 0.7635555555555555
[[17202  3804]
 [ 2580  3414]]
$ python svm_new.py SMOTE
{'classifier__C': 0.4, 'classifier__gamma': 0.4}
Accuracy: 0.7822222222222223
[[17875  3131]
 [ 2749  3245]]
Accuracy does drop, but as the confusion matrices show, resampling lowers the false negative rate: the share of actual defaulters that go undetected falls from roughly 0.72 with no resampling to roughly 0.43–0.46 with UNDER/OVER/SMOTE. In other words, if the goal is to reliably clear people who will not default, padding out the scarce positive examples with OverSampling or SMOTE and combining that with an existing detection method could well improve accuracy there.
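To make that concrete, here is a small sketch, separate from svm_new.py, that recomputes the false negative rate from the matrices printed above (sklearn's binary confusion_matrix is laid out as [[TN, FP], [FN, TP]]):

# A small standalone check: turn the confusion matrices printed above
# into false-negative rates. Rows are true labels, columns predictions,
# so the binary layout is [[TN, FP], [FN, TP]].
import numpy as np

matrices = {
    "NONE":  np.array([[20287,  719], [4319, 1675]]),
    "UNDER": np.array([[17062, 3944], [2697, 3297]]),
    "OVER":  np.array([[17202, 3804], [2580, 3414]]),
    "SMOTE": np.array([[17875, 3131], [2749, 3245]]),
}

for mode, cm in matrices.items():
    (tn, fp), (fn, tp) = cm
    fnr = fn / (fn + tp)  # fraction of true defaulters that were missed
    print(f"{mode:5s} FNR={fnr:.3f} recall={1 - fnr:.3f}")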
import sys

import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.svm import SVC
from sklearn.feature_selection import SelectFromModel
from sklearn.decomposition import PCA
from sklearn.metrics import confusion_matrix
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler, SMOTE
def Sampling(X_train, y_train, mode):
    """Resample the training set according to mode (NONE/UNDER/OVER/SMOTE)."""
    positive_count_train = int(y_train.sum())
    negative_count_train = len(y_train) - positive_count_train
    if mode == "UNDER":
        # Shrink the majority (negative) class down to the positive count.
        sampler = RandomUnderSampler(
            sampling_strategy={0: positive_count_train, 1: positive_count_train})
    elif mode == "OVER":
        # Duplicate positive samples up to the negative count.
        sampler = RandomOverSampler(
            sampling_strategy={0: negative_count_train, 1: negative_count_train})
    elif mode == "SMOTE":
        # Synthesize new positive samples up to the negative count.
        sampler = SMOTE(
            sampling_strategy={0: negative_count_train, 1: negative_count_train})
    elif mode == "NONE":
        # Target the original class counts: effectively a no-op.
        sampler = RandomUnderSampler(
            sampling_strategy={0: negative_count_train, 1: positive_count_train})
    else:
        raise ValueError("unknown mode: {}".format(mode))
    # Note: imbalanced-learn < 0.4 spelled these ratio=... and fit_sample();
    # current versions use sampling_strategy=... and fit_resample().
    X_train_resampled, y_train_resampled = sampler.fit_resample(X_train, y_train)
    return X_train_resampled, y_train_resampled
def main():
    mode = sys.argv[1]  # NONE / UNDER / OVER / SMOTE

    # Scale, drop weak features with a random-forest selector,
    # compress with PCA, then classify with an RBF SVM.
    pipe = Pipeline([('preprocessing', StandardScaler()),
                     ('feature_selection', SelectFromModel(
                         RFC(n_estimators=20), threshold="median")),
                     ('pca', PCA(n_components=0.8)),
                     ('classifier', SVC())])

    # The grid is trimmed here; the best_params_ in the results above
    # came from a wider sweep over C and gamma.
    param_grid = [
        {
            'classifier__gamma': [0.2],
            'classifier__C': [0.4],
        }
    ]
    gclf = GridSearchCV(pipe, param_grid, cv=5, n_jobs=32,
                        verbose=False, scoring='roc_auc')

    # Column 0 is an ID, columns 1-23 are features, column 24 is the label.
    X_train = np.loadtxt("train_data.csv", delimiter=",", skiprows=1,
                         usecols=list(range(1, 24)))
    X_test = np.loadtxt("test_data.csv", delimiter=",", skiprows=1,
                        usecols=list(range(1, 24)))
    y_train = np.loadtxt("train_data.csv", delimiter=",", skiprows=1,
                         usecols=24, dtype='int64')

    # Tune hyperparameters once on the resampled training set.
    X_train_resampled, y_train_resampled = Sampling(X_train, y_train, mode)
    gclf.fit(X_train_resampled, y_train_resampled)
    clf = gclf.best_estimator_
    print(gclf.best_params_)

    # Refit the tuned pipeline on five resampled folds and keep the fold
    # model with the best held-out accuracy. Each fold gets its own
    # clone(); reusing a single estimator object would leave every list
    # entry pointing at the final fold's fit.
    kf = KFold(n_splits=5, shuffle=True)
    clf_list = []
    accuracy_list = []
    for train_index, test_index in kf.split(X_train):
        fold_clf = clone(clf)
        X_train_resampled, y_train_resampled = Sampling(
            X_train[train_index], y_train[train_index], mode)
        fold_clf.fit(X_train_resampled, y_train_resampled)
        accuracy_list.append(fold_clf.score(X_train[test_index],
                                            y_train[test_index]))
        clf_list.append(fold_clf)
    clf = clf_list[accuracy_list.index(max(accuracy_list))]

    # Note: this accuracy and confusion matrix are computed on the full
    # training set, which overlaps the data the winning fold was fit on.
    score = clf.score(X_train, y_train)
    print('Accuracy: {}'.format(score))
    y_pred = clf.predict(X_train)
    cm = confusion_matrix(y_train, y_pred)
    print(cm)

    pred = clf.predict(X_test)
    submit = pd.DataFrame(
        {"ID": [i for i in range(0, 3000)], "Y": pred.astype(int)})
    submit.to_csv("submit.csv", index=False)


if __name__ == "__main__":
    main()
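One caveat worth noting: svm_new.py resamples before handing the data to GridSearchCV, so with OVER or SMOTE the duplicated/synthetic positives get split across the internal CV folds and can leak into the validation splits. imbalanced-learn ships its own Pipeline that runs the sampler on the training portion of each fold only. A minimal sketch, assuming the same train_data.csv layout as above:

# A minimal sketch, not the script above: imblearn's Pipeline applies
# the sampler to the training split of each CV fold only, so resampled
# positives never leak into the validation split.
import numpy as np
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

pipe = ImbPipeline([('preprocessing', StandardScaler()),
                    ('sampling', SMOTE()),
                    ('classifier', SVC())])

param_grid = {'classifier__C': [0.4], 'classifier__gamma': [0.2]}
gclf = GridSearchCV(pipe, param_grid, cv=5, scoring='roc_auc')

X_train = np.loadtxt("train_data.csv", delimiter=",", skiprows=1,
                     usecols=list(range(1, 24)))
y_train = np.loadtxt("train_data.csv", delimiter=",", skiprows=1,
                     usecols=24, dtype='int64')

# Pass the raw, still-imbalanced data; resampling happens per fold.
gclf.fit(X_train, y_train)
print(gclf.best_params_)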