特徴量選択の方法

TANEO

特徴量選択の方法

後で手を加えやすいように
「標準化」→「特徴量選択」→「次元圧縮」→「学習」
の流れで解析しやすいパイプラインを作ってみました。
その結果、

RFE > SelectFromModel
RandomForestClassifier > GradientBoostingClassifier

が特徴量選択において有用という感じになったようです。
SVC側のパラメータ調整は厳密には行っていませんが、解き方の一例として進捗を共有します。

また、このスクリプトが出力する交差検証の平均Accuracyよりも、テストデータでのAccuracyの方が2%も高かった点が気になります。交差検証の値は意外とアテにならないものですね。

このような解析における汎化性能の評価方法、知ってる方がいれば共有願いたいです。

#!/usr/bin/env python3
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.svm import SVC
from sklearn.feature_selection import RFE
from sklearn.feature_selection import SelectFromModel
from sklearn.decomposition import PCA


def _build_grid_search():
    """Return the unfitted grid search comparing four feature selectors.

    The pipeline is standardization -> feature selection -> PCA -> SVC.
    Only the feature-selection step varies across candidates: RFE and
    SelectFromModel, each wrapped around either a gradient-boosting or a
    random-forest classifier. Candidates are scored with weighted F1 over
    10 cross-validation splits.
    """
    # Every step starts as a placeholder; the grid below supplies the
    # concrete estimator for each named step.
    pipe = Pipeline([('preprocessing', None), ('feature_selection', None),
                     ("pca", None), ('classifier', None)])

    # Shared settings for both gradient-boosting based selectors.
    gbc_params = dict(
        learning_rate=0.01, min_samples_split=500, min_samples_leaf=17,
        max_depth=8, max_features=0.3, subsample=0.8, random_state=10,
        n_estimators=100)

    # Models and hyperparameters to search over.
    param_grid = [
        {
            'classifier': [SVC()],
            'preprocessing': [StandardScaler()],
            'feature_selection': [
                RFE(GradientBoostingClassifier(**gbc_params),
                    n_features_to_select=14),
                SelectFromModel(GradientBoostingClassifier(**gbc_params),
                                threshold="median"),
                RFE(RFC(n_estimators=20), n_features_to_select=14),
                SelectFromModel(RFC(n_estimators=20), threshold="median"),
            ],
            'classifier__gamma': [0.4],
            'classifier__C': [2],
            'pca': [PCA(n_components=0.8)],
        }
    ]

    return GridSearchCV(pipe, param_grid, n_jobs=32, verbose=False, cv=10,
                        scoring='f1_weighted')


def main(train_path="train_data.csv", test_path="test_data.csv",
         submit_path="submit.csv"):
    """Select a pipeline, report 5-fold accuracy, and write a submission CSV.

    Args:
        train_path: CSV with one header row; columns 1-23 are read as
            features and column 24 as the label.
        test_path: CSV with one header row; columns 1-23 are read as features.
        submit_path: Destination of the submission CSV (columns ``ID``, ``Y``).

    Prints the best grid-search parameters and the mean accuracy over the
    5 folds.
    """
    gclf = _build_grid_search()

    # Load the data. The training file is parsed once and then split into
    # features and label instead of being read twice.
    feature_cols = list(range(1, 24))
    label_col = 24
    train = np.loadtxt(train_path, delimiter=",", skiprows=1,
                       usecols=feature_cols + [label_col], ndmin=2)
    X_train, y_train = train[:, :-1], train[:, -1]
    X_test = np.loadtxt(test_path, delimiter=",", skiprows=1,
                        usecols=feature_cols, ndmin=2)

    # Pick the pipeline with GridSearchCV on the first fold's training split,
    # then measure its accuracy with KFold.
    # NOTE(review): the search sees rows that later folds use for validation,
    # so the averaged accuracy may not be an unbiased generalization
    # estimate -- nested cross-validation would avoid this; confirm whether
    # that matters for this analysis.
    clf = None
    fold_scores = []
    for train_index, valid_index in KFold(n_splits=5).split(X_train, y_train):
        if clf is None:
            gclf.fit(X_train[train_index], y_train[train_index])
            clf = gclf.best_estimator_
            print(gclf.best_params_)
        clf.fit(X_train[train_index], y_train[train_index])
        fold_scores.append(
            clf.score(X_train[valid_index], y_train[valid_index]))

    print('Accuracy: {}'.format(sum(fold_scores) / len(fold_scores)))

    # Refit on the complete training set before predicting; otherwise the
    # submission would come from a model that saw only the last fold's
    # training split.
    clf.fit(X_train, y_train)
    pred = clf.predict(X_test)

    # Build the submission CSV; IDs follow the number of predictions rather
    # than a fixed row count.
    submit = pd.DataFrame(
        {"ID": list(range(len(pred))), "Y": pred.astype(int)})
    submit.to_csv(submit_path, index=False)


# Run the analysis only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

# 参考文献
# https://www.haya-programming.com/entry/2018/02/22/234011#%E4%BA%A4%E5%B7%AE%E6%A4%9C%E8%A8%BC
# http://datanerd.hateblo.jp/entry/2017/09/15/160742

試しに回してみたら2位のSelectFromModel RandomForestClassifierがbestになりました。

>>> gclf.best_estimator_
Pipeline(memory=None,
     steps=[('preprocessing', StandardScaler(copy=True, with_mean=True, with_std=True)), ('feature_selection', SelectFromModel(estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impuri...,
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])

>>> gclf.cv_results_["mean_test_score"]
array([0.78971058, 0.78879108, 0.79343282, 0.79362341])

この二つ（後ろ二つ）は結構拮抗していますね。

特徴量選択の方法

kazetof

TANEO

new user