Neural Networkによる解法 [LB: 0.831]
本文4000文字制限はつらい。コメントもらえると元気になります。
データ読み込み
import numpy as np
import pandas as pd
# Read the labelled training set and the unlabelled test set.
train = pd.read_csv('train_data.csv')
test = pd.read_csv('test_data.csv')

# Keep the target aside, then remove the target and the `id` / `ID` columns
# so that only the remaining columns are left in each frame.
y = train['y']
train = train.drop(['y', 'id'], axis=1)
test = test.drop('ID', axis=1)

# Stack train on top of test so that preprocessing is applied to both at once;
# the first len(train) rows of `df` are the training rows.
df = pd.concat([train, test], axis=0)
データ前処理
`X1`、`X5`、`X12`〜`X17`、`X18`〜`X23`は平均0、分散1に正規化する。
from sklearn.preprocessing import scale
# Standardise the numeric columns (zero mean, unit variance) over train + test.
credit = scale(df[['X1']])
age = scale(df[['X5']])

# The two six-column groups (positions 11..16 and 17..22) get a trailing
# axis of size 1 so each row becomes a length-6 sequence of 1-d steps.
claim = np.expand_dims(scale(df.iloc[:, 11:17]), axis=-1)
advance = np.expand_dims(scale(df.iloc[:, 17:23]), axis=-1)
Embeddingするため値が1から始まるように(`X2`はそのまま、`X3`、`X4`は+1、`X6`〜`X11`は+3)する。
# Integer-coded categorical columns that are fed to Embedding layers.
# X2 is used as-is; X3 and X4 are shifted by +1.
gender = df[['X2']].values
education = df[['X3']].values + 1
marriage = df[['X4']].values + 1

# The six columns at positions 5..10 are shifted by +3.
payment = df.iloc[:, 5:11].values + 3

# Model inputs, listed in the same order in which build_model() declares its
# Input layers.
d = [credit, gender, education, marriage, age, payment, claim, advance]
モデル構築
from keras.layers import *
from keras.models import Model
from keras.callbacks import *
def build_model():
    """Build and compile the network used for every cross-validation fold.

    Architecture, as wired below:

    * ``payment`` codes are embedded and concatenated along the last axis with
      the ``claim`` and ``advance`` sequences, passed through two
      ``CuDNNLSTM(32, return_sequences=True)`` layers and reduced with
      ``GlobalMaxPooling1D``.
    * ``gender``, ``education`` and ``marriage`` codes are each embedded and
      flattened.
    * The pooled sequence features, the flattened embeddings and the two
      scalar inputs (``credit``, ``age``) are concatenated and fed to
      ``Dense(32, elu)`` -> ``Dense(16, elu)`` -> ``Dense(2, softmax)``.

    The module-level arrays ``gender``, ``education``, ``marriage`` and
    ``payment`` are read to size the embedding tables. The model summary is
    printed as a side effect.

    Returns:
        A Keras ``Model`` compiled with ``sparse_categorical_crossentropy``,
        the ``adam`` optimizer and the ``acc`` metric. Its inputs are ordered
        credit, gender, education, marriage, age, payment, claim, advance.
    """
    # Start from a clean backend session so repeated calls do not pile up
    # layers from earlier models.
    K.clear_session()

    def vocab_size(values):
        # Embedding's input_dim must be (largest index + 1). The categorical
        # codes are shifted to start at 1 rather than 0, so using the number
        # of distinct values would leave the largest code outside the table.
        return int(np.max(values)) + 1

    def categorical_branch(values):
        # Input -> Embedding -> Flatten for one single-column categorical
        # feature. The embedding width stays at the number of distinct codes.
        inp = Input(shape=(None,))
        width = len(np.unique(values))
        emb = Embedding(vocab_size(values), width, input_length=1)(inp)
        return inp, Flatten()(emb)

    # Layers are created in the same order as the entries of the input list.
    inp_credit = Input(shape=(1,))
    inp_gender, emb_gender = categorical_branch(gender)
    inp_education, emb_education = categorical_branch(education)
    inp_marriage, emb_marriage = categorical_branch(marriage)
    inp_age = Input(shape=(1,))

    # Sequence branch: the payment embedding keeps its time axis (no Flatten)
    # so it can be concatenated step-by-step with claim and advance.
    inp_payment = Input(shape=(None,))
    emb_payment = Embedding(vocab_size(payment), len(np.unique(payment)))(inp_payment)
    inp_claim = Input(shape=(None, 1,))
    inp_advance = Input(shape=(None, 1,))

    inputs = [
        inp_credit,
        inp_gender,
        inp_education,
        inp_marriage,
        inp_age,
        inp_payment,
        inp_claim,
        inp_advance,
    ]

    x = Concatenate()([emb_payment, inp_claim, inp_advance])
    x = CuDNNLSTM(32, return_sequences=True)(x)
    x = CuDNNLSTM(32, return_sequences=True)(x)
    x = GlobalMaxPooling1D()(x)

    x = Concatenate()([inp_credit, emb_gender, emb_education, emb_marriage, inp_age, x])
    x = Dense(32, activation='elu')(x)
    x = Dense(16, activation='elu')(x)
    # Two-way softmax; column 1 is read by the caller as the positive class.
    x = Dense(2, activation='softmax')(x)

    model = Model(inputs=inputs, outputs=x)
    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['acc'])
    model.summary()
    return model
学習
from sklearn.model_selection import StratifiedKFold
# Out-of-fold positive-class probabilities for the training rows, and the
# fold-averaged positive-class probabilities for the test rows.
pred_oof = np.zeros(len(train))
pred_sub = np.zeros(len(test))
n_splits = 5
# NOTE: shuffle=True with no random_state, so the folds differ between runs.
skf = StratifiedKFold(n_splits, shuffle=True)

# Every array in `d` holds the training rows first, so everything from
# position len(y) onwards is the test set. This does not depend on the fold,
# so it is built once instead of on every iteration.
test_X = [x[len(y):] for x in d]

for train_idx, val_idx in skf.split(y, y):
    train_X = [x[train_idx] for x in d]
    val_X = [x[val_idx] for x in d]
    # The split yields positions, so select labels positionally as well.
    train_y, val_y = y.iloc[train_idx], y.iloc[val_idx]

    model = build_model()
    # Keep only the weights of the epoch with the lowest validation loss.
    checkpoint = ModelCheckpoint('weights.h5', monitor='val_loss', save_best_only=True, mode='min')
    history = model.fit(train_X, train_y, batch_size=512, epochs=100, validation_data=(val_X, val_y), callbacks=[checkpoint])

    # Predict with the best checkpoint rather than the last epoch.
    model.load_weights('weights.h5')
    pred_oof[val_idx] = model.predict(val_X)[:, 1]
    pred_sub += model.predict(test_X)[:, 1] / n_splits
CVスコア
from sklearn.metrics import accuracy_score
# Threshold the out-of-fold probabilities at 0.5 and score the resulting
# labels against the training targets.
oof_labels = pred_oof > 0.5
accuracy_score(y, oof_labels)
提出用ファイル作成
# Assemble the submission table from the test frame's row index and the
# averaged predictions, then write it without the DataFrame index column.
# NOTE(review): 'ID' is taken from test.index, not from a column of the CSV -
# confirm this matches the IDs the submission format expects.
submission = pd.DataFrame({'ID': test.index, 'Y': pred_sub})
submission.to_csv('submission.csv', index=False)