Entity Embeddings + MLP による実装および質問(LocalCV: 0.5454 PublicLB:0.5517)
はじめに
Entity Embeddingsとは、カテゴリ変数に対してembedding層を用いる手法です。NNの実装はあまりしたことがないので不備があるかと思います。そのためコメント等で教えていただけると幸いです。
実装紹介
前提となるディレクトリ構造は以下の通りです。
├── input
│ ├── train_data.csv
│ └── test_data.csv
├── notebooks
│ └── ****.ipynb(このノートブック)
├── models
└── output
import os
import numpy as np
import pandas as pd
import random
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
import tensorflow as tf
import tensorflow.keras.backend as K
import tensorflow.keras.layers as L
import tensorflow.keras.models as M
from tensorflow.keras.callbacks import ReduceLROnPlateau, ModelCheckpoint
from tensorflow.keras.utils import plot_model
# Paths to the raw competition CSVs.
# NOTE(review): these are relative paths; the directory tree above puts this
# notebook in notebooks/, so they assume the working directory is the repo
# root — confirm before running.
INPUT_TRAIN = './input/train_data.csv'
INPUT_TEST = './input/test_data.csv'
N_SPLITS = 5  # number of StratifiedKFold folds
# Column-name prefixes for the 8 players (teams A and B, 4 players each);
# per-player columns are e.g. 'A1-weapon', 'A1-rank', 'A1-level'.
LIST_PLAYER = ['A1-', 'A2-', 'A3-', 'A4-', 'B1-', 'B2-', 'B3-', 'B4-']
SEED = 0  # global RNG seed for TF / NumPy / random / hash
def set_seed(seed):
    """Seed every RNG this notebook relies on, for reproducibility.

    Covers TensorFlow, NumPy, the stdlib ``random`` module, and the
    ``PYTHONHASHSEED`` environment variable.

    NOTE(review): setting PYTHONHASHSEED after the interpreter has started
    does not change str hashing in the current process — it only affects
    subprocesses. Export it before launching Python for full effect.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)


set_seed(SEED)
# Load the raw competition data and split the training frame into
# features (train_X) and the binary target (train_y, kept as a one-column
# DataFrame so it feeds straight into model.fit / StratifiedKFold).
train = pd.read_csv(INPUT_TRAIN)
test = pd.read_csv(INPUT_TEST)
train_X = train.drop(columns=['y'], axis=1)
train_y = train[['y']]
def processing(train, test):
    """Encode the categorical features of ``train`` and ``test`` in place.

    - Per-player ``level``: NaN -> 0, then binned into 100-wide buckets
      (0-699 -> integer codes 0-6; values >= 700 fall outside the bins,
      become NaN, and are mapped to code 7).
    - All remaining NaNs become the literal string 'NaN' so the label
      encoders treat "missing" as its own category.
    - ``mode``/``stage`` and the per-player ``weapon``/``rank`` columns
      are label-encoded with integer codes.

    Returns the transformed ``(train, test)`` pair (both mutated in place).
    """
    level_bins = np.arange(0, 800, 100)
    for player in LIST_PLAYER:
        for df in (train, test):
            col = player + 'level'
            # Column reassignment instead of slice-level inplace fillna,
            # which can hit pandas chained-assignment issues.
            df[col] = df[col].fillna(0)
            df[col] = pd.cut(df[col], bins=level_bins,
                             right=False, labels=False).fillna(7)
    train.fillna('NaN', inplace=True)
    test.fillna('NaN', inplace=True)
    for col in ['mode', 'stage']:
        le = LabelEncoder()
        # Fit on train AND test together: fitting on train alone raises in
        # transform() for any category that appears only in test.
        le.fit(pd.concat([train[col], test[col]]))
        train[col] = le.transform(train[col])
        test[col] = le.transform(test[col])
    for col in ['weapon', 'rank']:
        le = LabelEncoder()
        # The original fit only on the train 'A4-' column; a weapon/rank
        # present only in another player column or in test would crash.
        # Fit on every player column of both frames instead.
        all_values = pd.concat(
            [df[player + col] for df in (train, test) for player in LIST_PLAYER]
        )
        le.fit(all_values)
        for player in LIST_PLAYER:
            train[player + col] = le.transform(train[player + col])
            test[player + col] = le.transform(test[player + col])
    return train, test
# Bin/encode all categorical features (mutates train_X and test in place).
train_X, test = processing(train_X, test)
def build_model():
    """Build the entity-embedding MLP for binary classification.

    Each categorical group gets its own Embedding layer:
      - weapon: 8 player columns, vocab 140, 8-dim embeddings
      - rank:   8 player columns, vocab 13,  2-dim embeddings
      - level:  8 player columns, vocab 8,   2-dim embeddings
      - mode:   1 column, vocab 5,  2-dim embedding
      - stage:  1 column, vocab 23, 3-dim embedding
    The flattened embeddings are concatenated and passed through a
    3-layer MLP (128 units each, dropout 0.5) to a sigmoid output.
    """
    # shape=(8,) matches the (n_samples, 8) arrays make_dataset feeds in.
    # The original shape=(1, 8) is what produced the Keras warning about
    # being "called on an input with incompatible shape (None, 8)".
    # Flatten replaces the manual tf.reshape on symbolic dimensions.
    inputs_weapon = L.Input(shape=(8,))
    embed_weapon = L.Embedding(input_dim=140, output_dim=8)(inputs_weapon)
    flat_weapon = L.Flatten()(embed_weapon)  # (None, 64)

    inputs_rank = L.Input(shape=(8,))
    embed_rank = L.Embedding(input_dim=13, output_dim=2)(inputs_rank)
    flat_rank = L.Flatten()(embed_rank)  # (None, 16)

    inputs_level = L.Input(shape=(8,))
    embed_level = L.Embedding(input_dim=8, output_dim=2)(inputs_level)
    flat_level = L.Flatten()(embed_level)  # (None, 16)

    inputs_mode = L.Input(shape=(1,))
    embed_mode = L.Embedding(input_dim=5, output_dim=2)(inputs_mode)
    flat_mode = L.Flatten()(embed_mode)  # (None, 2)

    inputs_stage = L.Input(shape=(1,))
    embed_stage = L.Embedding(input_dim=23, output_dim=3)(inputs_stage)
    flat_stage = L.Flatten()(embed_stage)  # (None, 3)

    x = L.Concatenate()(
        [flat_weapon, flat_rank, flat_level, flat_mode, flat_stage]
    )
    # NOTE(review): ReLU directly on embeddings zeroes their negative
    # components; kept from the original, but consider removing it.
    x = L.Activation("relu")(x)
    x = L.Dropout(0.5)(x)
    x = L.Dense(128, activation="relu")(x)
    x = L.Dropout(0.5)(x)
    x = L.Dense(128, activation="relu")(x)
    x = L.Dropout(0.5)(x)
    x = L.Dense(128, activation="relu")(x)
    x = L.Dropout(0.5)(x)
    out = L.Dense(1, activation='sigmoid')(x)  # (None, 1), matches train_y

    inputs = [inputs_weapon, inputs_rank, inputs_level, inputs_mode, inputs_stage]
    model = M.Model(inputs=inputs, outputs=out)
    model.compile(
        loss="binary_crossentropy",
        optimizer="adam",
        metrics=["accuracy"],
    )
    return model
# NOTE(review): this instance is never trained or used — each CV fold below
# rebuilds its own model. Kept (harmless) but could be removed, or replaced
# with plot_model(build_model()) if a架constructure summary was the intent.
model = build_model()
def make_dataset(df):
    """Split ``df`` into the five per-group inputs expected by the model.

    Returns a list of DataFrames in the model's input order:
    [weapon (8 cols), rank (8 cols), level (8 cols), mode (1 col),
    stage (1 col)].
    """
    def player_cols(suffix):
        # Collect one column per player for the given feature suffix.
        return df[[prefix + suffix for prefix in LIST_PLAYER]]

    return [
        player_cols('weapon'),
        player_cols('rank'),
        player_cols('level'),
        df[['mode']],
        df[['stage']],
    ]
# --- Stratified K-fold training / evaluation / test prediction -------------
cv_scores = []
test_pred = []
kf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)
for i, (tr_idx, va_idx) in enumerate(kf.split(train_X, train_y)):
    model = build_model()
    tra_X, val_X = train_X.iloc[tr_idx], train_X.iloc[va_idx]
    tra_y, val_y = train_y.iloc[tr_idx], train_y.iloc[va_idx]
    model.fit(
        make_dataset(tra_X), tra_y,
        validation_data=(make_dataset(val_X), val_y),
        epochs=50,
        batch_size=512,
        verbose=2,
        # save_best_only keeps the epoch with the lowest val_loss. The
        # original overwrote the checkpoint every epoch, so reloading it
        # below silently scored the *last* epoch, not the best one.
        callbacks=[
            ReduceLROnPlateau(patience=5),
            ModelCheckpoint(f'./models/model{i}.h5',
                            monitor='val_loss', save_best_only=True),
        ],
    )
    # Rebuild and restore the best checkpointed weights before scoring.
    model = build_model()
    model.load_weights(f'./models/model{i}.h5')
    val_pred = model.predict(make_dataset(val_X)).ravel()
    # accuracy_score signature is (y_true, y_pred); the original had them
    # swapped (harmless for plain accuracy, corrected for clarity).
    fold_acc = accuracy_score(val_y, np.where(val_pred > 0.5, 1, 0))
    cv_scores.append(fold_acc)
    print(fold_acc)
    test_pred.append(model.predict(make_dataset(test)).ravel())
cv = np.mean(cv_scores)
print(cv)
# Submission: average the fold probabilities, threshold at 0.5; the
# competition ids are 1-based row numbers.
sub = pd.DataFrame(index=test.index + 1, columns=['y'])
sub['y'] = np.where(np.mean(test_pred, axis=0) > 0.5, 1, 0)
sub.index.name = 'id'
sub.to_csv(f'./output/cv{cv}.csv')
質問
実装は以上になるのですが、正しく実装できているかがわかりません。現状、思いついている質問は以下の通りです。
- シード値固定は適切か
- このコードを走らせると「WARNING:tensorflow:Model was constructed with shape (None, 1, 8) for input Tensor("input_6:0", shape=(None, 1, 8), dtype=float32), but it was called on an input with incompatible shape (None, 8).」と警告が出るが原因がわからない
- embedding層がそもそも間違えていないか
- epochsやbatch size 等は適切か
- 全結合層+ReLUを3層追加したりDropout層を追加したが適切か(ノード数についても)
などなど。質問に答えてくださる方、気になる点がある方、NNのお気持ちを語ってくださる方がいらっしゃればぜひコメントお願いしたいです。