Neural Networkによる解法 [LB: 0.831]

hrappuccino

Neural Networkによる解法 [LB: 0.831]

本文4000文字制限はつらい。コメントもらえると元気になります。

データ読み込み

import numpy as np
import pandas as pd

# Load the raw competition tables.
train = pd.read_csv('train_data.csv')
test = pd.read_csv('test_data.csv')

# Split the target off the training table and discard the identifier
# columns so that only feature columns remain in both frames.
y = train.pop('y')
train = train.drop(columns=['id'])
test = test.drop(columns='ID')

# Stack train on top of test so preprocessing is applied to both at once;
# the first len(train) rows of `df` are the training rows.
df = pd.concat([train, test], axis=0)

データ前処理

X1、X5、X12〜X17、X18〜X23は平均0、分散1になるように標準化する(trainとtestを結合したデータ全体で計算)。

from sklearn.preprocessing import scale
# Standardise the numeric columns (zero mean, unit variance per column),
# computed over the concatenated train+test frame.
credit = scale(df.loc[:, ['X1']])
age = scale(df.loc[:, ['X5']])

# Columns 11..16 and 17..22 (by position) are scaled column-wise and given a
# trailing axis of size 1, i.e. shape (rows, 6, 1), so they can be fed to the
# model as 6-step sequences with a single feature each.
claim = np.expand_dims(scale(df.iloc[:, 11:17]), axis=-1)
advance = np.expand_dims(scale(df.iloc[:, 17:23]), axis=-1)

Embedding層に入力するため、各カテゴリ変数の値が1から始まるようにずらす(X2はそのまま、X3・X4は+1、X6〜X11は+3)。

# Integer-coded categorical columns, shifted by a constant offset
# (X2 unchanged, X3/X4 by +1, columns 5..10 by +3) before being embedded.
gender = df.loc[:, ['X2']].values
education = 1 + df.loc[:, ['X3']].values
marriage = 1 + df.loc[:, ['X4']].values
payment = 3 + df.iloc[:, 5:11].values

# Model inputs; the order must match the order in which build_model()
# registers its Input layers.
d = [credit, gender, education, marriage, age, payment, claim, advance]

モデル構築

from keras.layers import *
from keras.models import Model
from keras.callbacks import *

def build_model():
    """Build and compile the multi-input Keras classifier.

    Eight inputs are registered in the same order as the module-level
    feature list ``d``: credit, gender, education, marriage, age, payment,
    claim, advance.

    * payment codes are embedded and concatenated (last axis) with the claim
      and advance sequences, then passed through two ``CuDNNLSTM`` layers and
      global max pooling;
    * gender / education / marriage codes are embedded and flattened;
    * everything is concatenated with the credit and age scalars and fed to
      a dense head ending in a 2-way softmax.

    The module-level arrays ``gender``, ``education``, ``marriage`` and
    ``payment`` are read to size the embedding tables.

    Returns:
        The compiled model (sparse categorical cross-entropy loss, Adam
        optimizer, accuracy metric). The model summary is printed as a side
        effect, and the Keras session is cleared before building.
    """
    # Import the backend explicitly instead of relying on `K` leaking in
    # through `from keras.layers import *`.
    from keras import backend as K

    K.clear_session()

    def embedding_for(codes, **kwargs):
        # Embedding's input_dim must be (largest index + 1). The codes are
        # shifted to start at 1, so sizing the table by the number of
        # distinct values (as before) left the largest code out of range.
        # The embedding width is kept at the number of distinct values.
        vocab_size = int(np.max(codes)) + 1
        width = len(np.unique(codes))
        return Embedding(vocab_size, width, **kwargs)

    inputs = []

    inp_credit = Input(shape=(1,))
    inputs.append(inp_credit)

    inp_gender = Input(shape=(None,))
    inputs.append(inp_gender)
    emb_gender = Flatten()(embedding_for(gender, input_length=1)(inp_gender))

    inp_education = Input(shape=(None,))
    inputs.append(inp_education)
    emb_education = Flatten()(embedding_for(education, input_length=1)(inp_education))

    inp_marriage = Input(shape=(None,))
    inputs.append(inp_marriage)
    emb_marriage = Flatten()(embedding_for(marriage, input_length=1)(inp_marriage))

    inp_age = Input(shape=(1,))
    inputs.append(inp_age)

    # Payment codes stay a sequence (no Flatten): one embedding per step.
    inp_payment = Input(shape=(None,))
    inputs.append(inp_payment)
    emb_payment = embedding_for(payment)(inp_payment)

    inp_claim = Input(shape=(None, 1,))
    inputs.append(inp_claim)

    inp_advance = Input(shape=(None, 1,))
    inputs.append(inp_advance)

    # Sequence branch: per-step features -> stacked LSTMs -> max over steps.
    x = Concatenate()([emb_payment, inp_claim, inp_advance])
    x = CuDNNLSTM(32, return_sequences=True)(x)
    x = CuDNNLSTM(32, return_sequences=True)(x)
    x = GlobalMaxPooling1D()(x)

    # Merge the pooled sequence vector with the static features.
    x = Concatenate()([inp_credit, emb_gender, emb_education, emb_marriage, inp_age, x])
    x = Dense(32, activation='elu')(x)
    x = Dense(16, activation='elu')(x)
    x = Dense(2, activation='softmax')(x)

    model = Model(inputs=inputs, outputs=x)
    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['acc'])
    model.summary()

    return model

学習

from sklearn.model_selection import StratifiedKFold

# Out-of-fold predictions for the training rows and the fold-averaged
# predictions for the test rows (probability of class 1 in both cases).
pred_oof = np.zeros(len(train))
pred_sub = np.zeros(len(test))

n_splits = 5
skf = StratifiedKFold(n_splits, shuffle=True)

# The splitter yields positional indices, so index a plain array rather than
# the Series (label-based indexing is only equivalent for a default index).
y_values = y.values

# Rows after the first len(y) in every feature array are the test rows;
# this does not depend on the fold, so build it once.
test_X = [x[len(y):] for x in d]

for train_idx, val_idx in skf.split(y, y):
    train_X = [x[train_idx] for x in d]
    val_X = [x[val_idx] for x in d]
    train_y, val_y = y_values[train_idx], y_values[val_idx]

    model = build_model()

    # Keep only the weights with the lowest validation loss, then restore
    # them before predicting.
    checkpoint = ModelCheckpoint('weights.h5', monitor='val_loss', save_best_only=True, mode='min')
    history = model.fit(train_X, train_y, batch_size=512, epochs=100, validation_data=(val_X, val_y), callbacks=[checkpoint])
    model.load_weights('weights.h5')

    pred_oof[val_idx] = model.predict(val_X)[:, 1]
    pred_sub += model.predict(test_X)[:, 1] / n_splits

CVスコア

from sklearn.metrics import accuracy_score
# Turn the out-of-fold class-1 probabilities into hard labels at 0.5 and
# compare them with the true labels.
oof_label = pred_oof > 0.5
accuracy_score(y, oof_label)  # 0.8202592592592592

提出用ファイル作成

# Write the fold-averaged test predictions to the submission file.
# NOTE(review): 'ID' is taken from test.index because the original 'ID'
# column was dropped when the data was loaded — confirm the row index
# matches the IDs the competition expects.
submission = pd.DataFrame({'ID': test.index, 'Y': pred_sub})
submission.to_csv('submission.csv', index=False)

Neural Networkによる解法 [LB: 0.831]

データ読み込み

データ前処理

モデル構築

学習

CVスコア

提出用ファイル作成

x_bam

hrappuccino

x_bam

ProbSpace_official

hrappuccino

new user