Deleted user
Thanks to the ProbSpace organizers and to everyone who took part and made this competition so lively. I'm looking forward to the next one.
Oregin was posting remarkable scores from early on; I tried all sorts of things to catch up but never came close.
Oregin's published solution was very instructive.
In this competition, the characteristics of the train data and the test data differed substantially.
I therefore processed the train data to bring its shape as close as possible to that of the test data.
Features describing the relationship between neighboring plays (the previous and next out), like the ones built below, worked well.
Features that exist only in the train data, such as speed and ballPositionLabel, were not used.
Since the evaluation metric was Macro-F1, the key to a better score was getting doubles, triples, and home runs right. I tried to infer them from the surrounding runner and batter relationships, but that was no match for Oregin's strategy of predicting them with dedicated binary classifiers. It really drove home how important it is to build the model around the nature of the problem.
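As a small illustration of how much the rare classes dominate Macro-F1 (toy labels, not competition data): missing a class that makes up only a small share of the rows costs far more than the same number of errors on a frequent class.

from sklearn.metrics import f1_score

# Toy example: class 7 (think "home run") is only 10% of the rows.
y_true = [0] * 90 + [7] * 10
always_zero = [0] * 100            # rare class missed entirely
perfect     = [0] * 90 + [7] * 10  # rare class fully recovered
print(f1_score(y_true, always_zero, average='macro'))  # ~0.47
print(f1_score(y_true, perfect,     average='macro'))  # 1.0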
import pandas as pd
import numpy as np
import random
import os
import gc
from tqdm.notebook import tqdm
import lightgbm as lgb
from sklearn.metrics import f1_score
from sklearn.model_selection import GroupKFold
# Reduce memory usage
def reduce_mem_usage(df, verbose=False):
start_mem = df.memory_usage().sum() / 1024**2
cols = df.columns.to_list()
df_1 = df.select_dtypes(exclude=['integer', 'float'])
df_2 = df.select_dtypes(include=['integer']).apply(pd.to_numeric, downcast='integer')
df_3 = df.select_dtypes(include=['float']).apply(pd.to_numeric, downcast='float')
df = df_1.join([df_2, df_3]).loc[:, cols].reset_index(drop=True)
end_mem = df.memory_usage().sum() / 1024**2
if verbose:
print('{:.2f}Mb->{:.2f}Mb({:.1f}% reduction)'.format(
start_mem, end_mem, 100 * (start_mem - end_mem) / start_mem))
return df
# Initialize random seeds
def seed_everything(seed=42):
random.seed(seed)
os.environ['PYTHONHASHSEED'] = str(seed)
np.random.seed(seed)
# Settings
INPUT_PATH = os.path.join('..', 'input')
N_CLASS = 8
SEED=42
N_SAMPLE=100
# Read game_info.csv
game_df = reduce_mem_usage(pd.read_csv(os.path.join(INPUT_PATH, 'game_info.csv'), index_col=0))
teamMap = {
'巨人':0,'ヤクルト':1,'DeNA':2,'中日':3,'阪神':4,'広島':5,
'西武':6,'日本ハム':7,'ロッテ':8,'楽天':9,'オリックス':10,'ソフトバンク':11,
}
game_df['topTeam'] = game_df['topTeam'].map(teamMap)
game_df['bottomTeam'] = game_df['bottomTeam'].map(teamMap)
placeMap = {
'東京ドーム':0,'神宮':1,'横浜':2,'ナゴヤドーム':3,'甲子園':4,'マツダスタジアム':5,
'メットライフ':6,'札幌ドーム':7,'ZOZOマリン':8,'楽天生命パーク':9,'京セラD大阪':10,'PayPayドーム':11,
'ほっと神戸':12,
}
game_df['place'] = game_df['place'].map(placeMap)
game_df['startDayTime'] = pd.to_datetime(game_df['startDayTime'])
display(game_df)
game_df.info()
# Read train_data.csv
tr_df = reduce_mem_usage(pd.read_csv(os.path.join(INPUT_PATH, 'train_data.csv')))
# Remove duplicate rows
print('duplicated lines:', tr_df.drop('id', axis=1).duplicated().sum())
tr_df = tr_df[~tr_df.drop('id', axis=1).duplicated()]
# Merge game_info
tr_df = pd.merge(tr_df, game_df.drop(['bgTop', 'bgBottom'], axis=1), on='gameID', how='left')
# Disambiguate players with identical names (append team id)
f = tr_df['inning'].str.contains('表')
tr_df.loc[ f, 'batter'] = tr_df.loc[ f, 'batter'] + '@' + tr_df.loc[ f, 'topTeam'].astype(str)
tr_df.loc[~f, 'batter'] = tr_df.loc[~f, 'batter'] + '@' + tr_df.loc[~f, 'bottomTeam'].astype(str)
tr_df.loc[ f, 'pitcher'] = tr_df.loc[ f, 'pitcher'] + '@' + tr_df.loc[ f, 'bottomTeam'].astype(str)
tr_df.loc[~f, 'pitcher'] = tr_df.loc[~f, 'pitcher'] + '@' + tr_df.loc[~f, 'topTeam'].astype(str)
display(tr_df)
tr_df.info()
# Read test_data_improvement.csv
ts_df = reduce_mem_usage(pd.read_csv(os.path.join(INPUT_PATH, 'test_data_improvement.csv')))
# Merge game_info
ts_df = pd.merge(ts_df, game_df.drop(['bgTop', 'bgBottom'], axis=1), on='gameID', how='left')
# Disambiguate players with identical names (append team id)
f = ts_df['inning'].str.contains('表')
ts_df.loc[ f, 'batter'] = ts_df.loc[ f, 'batter'] + '@' + ts_df.loc[ f, 'topTeam'].astype(str)
ts_df.loc[~f, 'batter'] = ts_df.loc[~f, 'batter'] + '@' + ts_df.loc[~f, 'bottomTeam'].astype(str)
ts_df.loc[ f, 'pitcher'] = ts_df.loc[ f, 'pitcher'] + '@' + ts_df.loc[ f, 'bottomTeam'].astype(str)
ts_df.loc[~f, 'pitcher'] = ts_df.loc[~f, 'pitcher'] + '@' + ts_df.loc[~f, 'topTeam'].astype(str)
display(ts_df)
ts_df.info()
# Pitchers appearing in both train and test
tr_pitcher = set(tr_df['pitcher'].unique())
ts_pitcher = set(ts_df['pitcher'].unique())
print(tr_df['pitcher'].isin(tr_pitcher & ts_pitcher).sum())
print(ts_df['pitcher'].isin(tr_pitcher & ts_pitcher).sum())
# Batters appearing in both train and test
tr_batter = set(tr_df['batter'].unique())
ts_batter = set(ts_df['batter'].unique())
print(tr_df['batter'].isin(tr_batter & ts_batter).sum())
print(ts_df['batter'].isin(tr_batter & ts_batter).sum())
# Concatenate train_data and test_data
input_df = pd.concat([tr_df, ts_df], axis=0).reset_index(drop=True)
del tr_df, ts_df, game_df
gc.collect()
# pitcherHandとbatterHand
input_df['pitcherHand'] = input_df['pitcherHand'].fillna('R')
input_df['batterHand'] = input_df['batterHand'].fillna('R')
# Pitch type
input_df['pitchType'] = input_df['pitchType'].fillna('-')
# Pitch speed
input_df['speed'] = input_df['speed'].str.replace('km/h', '').replace('-', '135').astype(float)
input_df['speed'] = input_df['speed'].fillna(0)
# Pitch location label
input_df['ballPositionLabel'] = input_df['ballPositionLabel'].fillna('中心')
# Pitch X coordinate (1-21)
input_df['ballX'] = input_df['ballX'].fillna(0).astype(int)
# Convert pitch Y coordinate (A-K) to numeric
input_df['ballY'] = input_df['ballY'].map({chr(ord('A')+i):i+1 for i in range(11)})
input_df['ballY'] = input_df['ballY'].fillna(0).astype(int)
# Convert batted-ball direction (A-Z) to numeric
input_df['dir'] = input_df['dir'].map({chr(ord('A')+i):i+1 for i in range(26)})
input_df['dir'] = input_df['dir'].fillna(0).astype(int)
# Batted-ball distance
input_df['dist'] = input_df['dist'].fillna(0)
# Batted-ball type
input_df['battingType'] = input_df['battingType'].fillna('G')
# Whether the pitch resulted in an out
input_df['isOuts'] = input_df['isOuts'].fillna('-1').astype(int)
display(input_df)
input_df.info()
from sklearn.preprocessing import LabelEncoder
def get_base_features(input_df):
seed_everything(seed=SEED)
output_df = input_df.copy()
output_df['inning'] = 2 * (output_df['inning'].str[0].astype(int) - 1) + output_df['inning'].str.contains('裏')
output_df['pitcherCommon'] = output_df['pitcher']
output_df['batterCommon'] = output_df['batter']
output_df.loc[~(output_df['pitcherCommon'].isin(tr_pitcher & ts_pitcher)), 'pitcherCommon'] = np.nan
output_df.loc[~(output_df['batterCommon'].isin(tr_batter & ts_batter)), 'batterCommon'] = np.nan
output_df['startTime'] = output_df['startDayTime'].dt.hour
# label encoding
cat_cols = output_df.select_dtypes(include=['object']).columns
for col in cat_cols:
f = output_df[col].notnull()
output_df.loc[f, col] = LabelEncoder().fit_transform(output_df.loc[f, col].values)
output_df.loc[~f, col] = -1
output_df[col] = output_df[col].astype(int)
# count encoding
count_cols = ['pitcher', 'batter']
for col in count_cols:
f = output_df[col].notnull()
new_col = 'cnt_' + col
count_map = output_df[['id', col]].groupby([col])['id'].count().to_dict()
output_df[new_col] = 0
output_df.loc[f, new_col] = output_df.loc[f, col].map(count_map)
output_df[new_col] = output_df[new_col].astype(int)
output_df['inningHalf'] = output_df['inning'] % 2
output_df['inningNumber'] = output_df['inning'] // 2
output_df['outCount'] = output_df['inning'] * 3 + output_df['O']
output_df['B_S_O'] = output_df['B'] + 4 * (output_df['S'] + 3 * output_df['O'])
output_df['b1_b2_b3'] = output_df['b1'] * 1 + output_df['b2'] * 2 + output_df['b3'] * 4
output_df['ballIdx'] = output_df['S'] - output_df['B']
output_df['baseIdx'] = (
output_df['b3'] * 3
+ (~output_df['b3'] & output_df['b2']) * 2
+ (~output_df['b3'] & ~output_df['b2'] & output_df['b1']) * 1
- output_df['O']
)
# for target encoding
for i in range(8):
output_df.loc[output_df['y'].notnull(), f'y{i}'] = (output_df['y'] == i).astype(float)
return reduce_mem_usage(output_df)
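# Notes on the hand-made state features above:
#   b1_b2_b3 packs the runner state into 0..7 (1st=1, 2nd=2, 3rd=4),
#   B_S_O packs balls/strikes/outs into a single code,
#   ballIdx = S - B summarizes the count from the pitcher's side,
#   baseIdx ranks the lead runner (3rd > 2nd > 1st) minus the outs,
#   and y0..y7 are one-hot copies of the target used for target encoding.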
def random_sampling(input_df, n_sample=10, random_state=1):
dfs = []
tr_df = input_df[input_df['y'].notnull()].copy()
ts_df = input_df[input_df['y'].isnull()].copy()
for i in tqdm(range(n_sample)):
df = tr_df.groupby(['gameID', 'outCount']).apply(
lambda x: x.sample(n=1, random_state=random_state+i)).reset_index(drop=True)
df['subGameID'] = df['gameID'] * n_sample + i
dfs.append(df)
ts_df['subGameID'] = ts_df['gameID'] * n_sample
return pd.concat(dfs + [ts_df], axis=0)
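# The test data has at most one pitch per (gameID, outCount) slot, so
# random_sampling above repeatedly draws one train row per slot to give the
# train data the same shape. A minimal sketch of the idea on a hand-made
# frame (toy values, not the competition files):
toy = pd.DataFrame({
    'gameID':   [1, 1, 1, 1],
    'outCount': [0, 0, 0, 1],   # three pitches at out 0, one at out 1
    'y':        [0, 1, 2, 3],
})
print(toy.groupby(['gameID', 'outCount'])
         .apply(lambda x: x.sample(n=1, random_state=0))
         .reset_index(drop=True))  # one row per slot, like the test data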
# Aggregation function
def aggregation(input_df, group_keys, group_values, agg_methods):
new_df = []
for agg_method in agg_methods:
for col in group_values:
if callable(agg_method):
agg_method_name = agg_method.__name__
else:
agg_method_name = agg_method
new_col = f'agg_{agg_method_name}_{col}_grpby_' + '_'.join(group_keys)
agg_df = input_df[[col]+group_keys].groupby(group_keys)[[col]].agg(agg_method)
agg_df.columns = [new_col]
new_df.append(agg_df)
new_df = pd.concat(new_df, axis=1).reset_index()
return new_df
# Aggregation function with smoothing toward the global statistic
def smooth_aggregation(input_df, group_keys, group_values, agg_methods):
new_df = []
for agg_method in agg_methods:
all_agg_df = input_df[group_values].agg(agg_method)
for col in group_values:
if callable(agg_method):
agg_method_name = agg_method.__name__
else:
agg_method_name = agg_method
new_col = f'agg_{agg_method_name}_{col}_grpby_' + '_'.join(group_keys)
groupby = input_df[[col]+group_keys].groupby(group_keys)
count = groupby[[col]].count()
r = count / (10 + count)
agg_df = r * groupby[[col]].agg(agg_method) + (1 - r) * all_agg_df[[col]]
agg_df.columns = [new_col]
new_df.append(agg_df)
new_df = pd.concat(new_df, axis=1).reset_index()
return new_df
# Target encoding function
from sklearn.model_selection import GroupKFold
def target_encoding(input_df, input_col, group_col, target_col, n_splits=5):
output_df = input_df.copy()
kf = GroupKFold(n_splits=n_splits)
new_col = 'tgt_' + target_col + '_grpby_' + input_col
output_df[new_col] = np.nan
for tr_idx, vl_idx in kf.split(X=output_df, groups=output_df[group_col]):
tr_idx = output_df.index[tr_idx]
vl_idx = output_df.index[vl_idx]
groupby = output_df.loc[tr_idx, [input_col, target_col]].dropna().groupby(input_col)
count = groupby[target_col].count()
r = count / (10 + count)
target_map = (r * groupby[target_col].mean() + (1 - r) * output_df.loc[tr_idx, target_col].mean()).to_dict()
output_df.loc[vl_idx, new_col] = output_df.loc[vl_idx, input_col].map(target_map).fillna(
output_df.loc[tr_idx, target_col].mean())
return output_df
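# Both smooth_aggregation and target_encoding shrink a group statistic
# toward the global statistic with weight r = n / (n + 10), so rarely seen
# pitchers/batters fall back to the overall mean; target_encoding also
# computes its map out-of-fold with GroupKFold so a game never sees its own
# labels. A quick check of the shrinkage on toy numbers:
toy = pd.DataFrame({'pitcher': ['a'] * 2 + ['b'] * 50,
                    'y0':      [1.0] * 2 + [0.0] * 50})
grp = toy.groupby('pitcher')['y0']
r = grp.count() / (10 + grp.count())
print(r * grp.mean() + (1 - r) * toy['y0'].mean())  # 'a' (2 obs) shrinks hard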
def get_diff_feature(input_df, value_col, periods, in_inning=True, aggfunc=np.median):
pivot_df = pd.pivot_table(input_df, index='subGameID', columns='outCount', values=value_col, aggfunc=aggfunc)
if in_inning:
dfs = []
for inning in range(9):
df0 = pivot_df.loc[:, [out+inning*6 for out in range(0,3)]].diff(periods, axis=1)
df1 = pivot_df.loc[:, [out+inning*6 for out in range(3,6)]].diff(periods, axis=1)
dfs += [df0, df1]
pivot_df = pd.concat(dfs, axis=1).stack()
else:
df0 = pivot_df.loc[:, [out+inning*6 for inning in range(9) for out in range(0,3)]].diff(periods, axis=1)
df1 = pivot_df.loc[:, [out+inning*6 for inning in range(9) for out in range(3,6)]].diff(periods, axis=1)
pivot_df = pd.concat([df0, df1], axis=1).stack()
return pivot_df
def get_shift_feature(input_df, value_col, periods, in_inning=True, aggfunc=np.median):
pivot_df = pd.pivot_table(input_df, index='subGameID', columns='outCount', values=value_col, aggfunc=aggfunc)
if in_inning:
dfs = []
for inning in range(9):
df0 = pivot_df.loc[:, [out+inning*6 for out in range(0,3)]].shift(periods, axis=1)
df1 = pivot_df.loc[:, [out+inning*6 for out in range(3,6)]].shift(periods, axis=1)
dfs += [df0, df1]
pivot_df = pd.concat(dfs, axis=1).stack()
else:
df0 = pivot_df.loc[:, [out+inning*6 for inning in range(9) for out in range(0,3)]].shift(periods, axis=1)
df1 = pivot_df.loc[:, [out+inning*6 for inning in range(9) for out in range(3,6)]].shift(periods, axis=1)
pivot_df = pd.concat([df0, df1], axis=1).stack()
return pivot_df
def get_next_data(input_df, value_col, in_inning=True, nan_value=None):
pivot_df = get_shift_feature(input_df, value_col, periods=-1, in_inning=in_inning)
pivot_df.name = 'next_' + value_col
output_df = pd.merge(
input_df, pivot_df, left_on=['subGameID', 'outCount'], right_index=True, how='left')
if nan_value is not None:
output_df[pivot_df.name].fillna(nan_value, inplace=True)
return output_df
def get_prev_data(input_df, value_col, in_inning=True, nan_value=None):
pivot_df = get_shift_feature(input_df, value_col, periods=1, in_inning=in_inning)
pivot_df.name = 'prev_' + value_col
output_df = pd.merge(
input_df, pivot_df, left_on=['subGameID', 'outCount'], right_index=True, how='left')
if nan_value is not None:
output_df[pivot_df.name].fillna(nan_value, inplace=True)
return output_df
def get_next_diff(input_df, value_col, in_inning=True, nan_value=None):
pivot_df = get_diff_feature(input_df, value_col, periods=-1, in_inning=in_inning)
pivot_df.name = 'next_diff_' + value_col
output_df = pd.merge(
input_df, pivot_df, left_on=['subGameID', 'outCount'], right_index=True, how='left')
if nan_value is not None:
output_df[pivot_df.name].fillna(nan_value, inplace=True)
return output_df
def get_prev_diff(input_df, value_col, in_inning=True, nan_value=None):
pivot_df = get_diff_feature(input_df, value_col, periods=1, in_inning=in_inning)
pivot_df.name = 'prev_diff_' + value_col
output_df = pd.merge(
input_df, pivot_df, left_on=['subGameID', 'outCount'], right_index=True, how='left')
if nan_value is not None:
output_df[pivot_df.name].fillna(nan_value, inplace=True)
return output_df
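# The helpers above pivot each (sub)game into a 54-column grid
# (outCount = 0..53: 9 innings x top/bottom x 3 outs) and shift or diff
# along it; with in_inning=True the shift never crosses a half-inning
# boundary. The core mechanism in isolation (toy frame, one half-inning):
toy = pd.DataFrame({'subGameID': [0, 0, 0],
                    'outCount':  [0, 1, 2],
                    'b1_b2_b3':  [0, 1, 3]})
pivot = toy.pivot_table(index='subGameID', columns='outCount',
                        values='b1_b2_b3')
print(pivot.shift(-1, axis=1))  # each slot now holds the NEXT out's runners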
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
# Features built from a pivot table
def get_pivot_PCA18_features(input_df, n, value_col):
pivot_df = pd.pivot_table(input_df, index='subGameID', columns='outCount', values=value_col, aggfunc=np.median)
sc0 = StandardScaler().fit_transform(np.median(pivot_df.fillna(0).values.reshape(-1,54//3,3)[:,0::2,:], axis=-1))
sc1 = StandardScaler().fit_transform(np.median(pivot_df.fillna(0).values.reshape(-1,54//3,3)[:,1::2,:], axis=-1))
pca = PCA(n_components=n, random_state=2021)
pca_df0 = pd.DataFrame(pca.fit_transform(sc0), index=pivot_df.index).rename(
columns=lambda x: f'pivot_{value_col}_PCA9T={x:02}')
pca_df1 = pd.DataFrame(pca.fit_transform(sc1), index=pivot_df.index).rename(
columns=lambda x: f'pivot_{value_col}_PCA9B={x:02}')
pca_df = pd.concat([pca_df0, pca_df1], axis=1)
pca_df = pd.merge(
input_df, pca_df, left_on='subGameID', right_index=True, how='left')
return reduce_mem_usage(pca_df)
# Features built from a pivot table
def get_pivot_PCA27_features(input_df, n, value_col):
pivot_df = pd.pivot_table(input_df, index='subGameID', columns='outCount', values=value_col, aggfunc=np.median)
sc0 = StandardScaler().fit_transform(pivot_df.fillna(0).values.reshape(-1,54//3,3)[:,0::2].reshape(-1,27))
sc1 = StandardScaler().fit_transform(pivot_df.fillna(0).values.reshape(-1,54//3,3)[:,1::2].reshape(-1,27))
pca = PCA(n_components=n, random_state=2021)
pca_df0 = pd.DataFrame(pca.fit_transform(sc0), index=pivot_df.index).rename(
columns=lambda x: f'pivot_{value_col}_PCA27T={x:02}')
pca_df1 = pd.DataFrame(pca.fit_transform(sc1), index=pivot_df.index).rename(
columns=lambda x: f'pivot_{value_col}_PCA27B={x:02}')
pca_df = pd.concat([pca_df0, pca_df1], axis=1)
pca_df = pd.merge(
input_df, pca_df, left_on='subGameID', right_index=True, how='left')
return reduce_mem_usage(pca_df)
# Features built from a pivot table
def get_pivot_PCA54_features(input_df, n, value_col):
pivot_df = pd.pivot_table(input_df, index='subGameID', columns='outCount', values=value_col, aggfunc=np.median)
sc = StandardScaler().fit_transform(pivot_df.fillna(0).values)
pca = PCA(n_components=n, random_state=2021)
pca_df = pd.DataFrame(pca.fit_transform(sc), index=pivot_df.index).rename(
columns=lambda x: f'pivot_{value_col}_PCA54={x:02}')
pca_df = pd.merge(
input_df, pca_df, left_on='subGameID', right_index=True, how='left')
return reduce_mem_usage(pca_df)
from sklearn.decomposition import NMF
from sklearn.preprocessing import MinMaxScaler
# Features built from a pivot table
def get_pivot_NMF18_features(input_df, n, value_col):
pivot_df = pd.pivot_table(input_df, index='subGameID', columns='outCount', values=value_col, aggfunc=np.median)
sc0 = MinMaxScaler().fit_transform(np.median(pivot_df.fillna(0).values.reshape(-1,54//3,3)[:,0::2,:], axis=-1))
sc1 = MinMaxScaler().fit_transform(np.median(pivot_df.fillna(0).values.reshape(-1,54//3,3)[:,1::2,:], axis=-1))
nmf = NMF(n_components=n, random_state=2021)
nmf_df0 = pd.DataFrame(nmf.fit_transform(sc0), index=pivot_df.index).rename(
columns=lambda x: f'pivot_{value_col}_NMF9T={x:02}')
nmf_df1 = pd.DataFrame(nmf.fit_transform(sc1), index=pivot_df.index).rename(
columns=lambda x: f'pivot_{value_col}_NMF9B={x:02}')
nmf_df = pd.concat([nmf_df0, nmf_df1], axis=1)
nmf_df = pd.merge(
input_df, nmf_df, left_on='subGameID', right_index=True, how='left')
return reduce_mem_usage(nmf_df)
# Features built from a pivot table
def get_pivot_NMF27_features(input_df, n, value_col):
pivot_df = pd.pivot_table(input_df, index='subGameID', columns='outCount', values=value_col, aggfunc=np.median)
sc0 = MinMaxScaler().fit_transform(pivot_df.fillna(0).values.reshape(-1,54//3,3)[:,0::2].reshape(-1,27))
sc1 = MinMaxScaler().fit_transform(pivot_df.fillna(0).values.reshape(-1,54//3,3)[:,1::2].reshape(-1,27))
nmf = NMF(n_components=n, random_state=2021)
nmf_df0 = pd.DataFrame(nmf.fit_transform(sc0), index=pivot_df.index).rename(
columns=lambda x: f'pivot_{value_col}_NMF27T={x:02}')
nmf_df1 = pd.DataFrame(nmf.fit_transform(sc1), index=pivot_df.index).rename(
columns=lambda x: f'pivot_{value_col}_NMF27B={x:02}')
nmf_df = pd.concat([nmf_df0, nmf_df1], axis=1)
nmf_df = pd.merge(
input_df, nmf_df, left_on='subGameID', right_index=True, how='left')
return reduce_mem_usage(nmf_df)
# Features built from a pivot table
def get_pivot_NMF54_features(input_df, n, value_col):
pivot_df = pd.pivot_table(input_df, index='subGameID', columns='outCount', values=value_col, aggfunc=np.median)
sc = MinMaxScaler().fit_transform(pivot_df.fillna(0).values)
nmf = NMF(n_components=n, random_state=2021)
nmf_df = pd.DataFrame(nmf.fit_transform(sc), index=pivot_df.index).rename(
columns=lambda x: f'pivot_{value_col}_NMF54={x:02}')
nmf_df = pd.merge(
input_df, nmf_df, left_on='subGameID', right_index=True, how='left')
return reduce_mem_usage(nmf_df)
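# The six functions above compress each game's out-by-out timeline of a
# value (here ballIdx / baseIdx) into a few components at three
# granularities: the '18' variants reduce the 3 outs to a half-inning median
# (9 top + 9 bottom columns), the '27' variants keep the 27 top- or
# bottom-half slots, and the '54' variants use the full game. The core idea,
# with a random matrix standing in for the real pivot table:
X = np.random.RandomState(0).rand(100, 54)   # 100 games x 54 out slots
Z = PCA(n_components=8, random_state=2021).fit_transform(
    StandardScaler().fit_transform(X))
print(Z.shape)  # (100, 8) game-level "shape" features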
def get_tfidf(input_df, term_col, document_col):
output_df = input_df.copy()
output_df['dummy'] = 0
tf1 = output_df[[document_col, term_col, 'dummy']].groupby([document_col, term_col])['dummy'].count()
tf1.name = 'tf1'
tf2 = output_df[[document_col, term_col, 'dummy']].groupby([document_col])['dummy'].count()
tf2.name = 'tf2'
idf1 = output_df[document_col].nunique()
idf2 = output_df[[document_col, term_col, 'dummy']].groupby([term_col])[document_col].nunique()
idf2.name = 'idf2'
output_df = pd.merge(output_df, tf1, left_on=[document_col, term_col], right_index=True, how='left')
output_df = pd.merge(output_df, tf2, left_on=[document_col], right_index=True, how='left')
output_df['idf1'] = idf1
output_df = pd.merge(output_df, idf2, left_on=[term_col], right_index=True, how='left')
col_name = 'tfidf_' + term_col + '_in_' + document_col
tf = np.log(1 + (1 + output_df['tf1']) / (1 + output_df['tf2']))
idf = 1 + np.log((1 + output_df['idf1']) / (1 + output_df['idf2']))
output_df[col_name] = tf * idf
return output_df.drop(['tf1', 'tf2', 'idf1', 'idf2', 'dummy'], axis=1)
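# Sanity check of the TF-IDF variant above on a toy frame: tf is a
# log-scaled within-game frequency of the term, idf a smoothed inverse
# document (= game) frequency.
toy = pd.DataFrame({'subGameID': [0, 0, 0, 1],
                    'batter':    ['x', 'x', 'y', 'x']})
print(get_tfidf(toy, term_col='batter', document_col='subGameID')
      [['subGameID', 'batter', 'tfidf_batter_in_subGameID']])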
def get_skip(input_df):
output_df = input_df.copy()
next_skip_map = {}
prev_skip_map = {}
for key, group in output_df.groupby(['subGameID', 'inningHalf']):
n = len(group)
dist_map = {}
batter = group.sort_values('outCount')['batter']
for i in range(n - 1):
b1 = batter.iloc[i]
for d in range(1, 5):
if i + d >= n:
break
b2 = batter.iloc[i + d]
if (b1, b2) in dist_map.keys():
if dist_map[(b1, b2)] < d:
dist_map[(b1, b2)] = d
else:
dist_map[(b1, b2)] = d
for i in range(len(batter) - 1):
next_skip_map[batter.index[i]] = dist_map[(batter.iloc[i], batter.iloc[i+1])]
for i in range(1, len(batter)):
prev_skip_map[batter.index[i]] = dist_map[(batter.iloc[i-1], batter.iloc[i])]
output_df['next_skip'] = output_df.index.map(next_skip_map).fillna(0).astype(np.int8)
output_df['prev_skip'] = output_df.index.map(prev_skip_map).fillna(0).astype(np.int8)
return output_df
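# next_skip / prev_skip estimate how far apart the batters of consecutive
# outs are in the batting sequence: within each (subGameID, inningHalf)
# group, every batter pair is looked up over windows of up to 4 plate
# appearances. A skip of 2 roughly means one batter between the two outs
# reached base rather than making an out; the move1..move4 flags built in
# preprocess() below combine this with the next runner state to guess which
# base that batter reached.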
# Driver that runs the feature-engineering functions above
def preprocess(input_df, base_df):
seed_everything(seed=SEED)
output_df = input_df.copy()
    # aggregation
df = base_df.groupby(['gameID', 'outCount']).median().reset_index()
agg_df = aggregation(df, ['gameID'], ['S', 'B', 'b1', 'b2', 'b3'], ['mean', 'std'])
output_df = pd.merge(output_df, agg_df, on=['gameID'], how='left')
agg_df = aggregation(df, ['gameID', 'inningHalf'], ['S', 'B', 'b1', 'b2', 'b3'], ['mean', 'std'])
output_df = pd.merge(output_df, agg_df, on=['gameID', 'inningHalf'], how='left')
agg_df = aggregation(df, ['b1_b2_b3'], ['S', 'B'], ['mean', 'std'])
output_df = pd.merge(output_df, agg_df, on=['b1_b2_b3'], how='left')
agg_df = aggregation(df, ['B_S_O'], ['b1', 'b2', 'b3'], ['mean', 'std'])
output_df = pd.merge(output_df, agg_df, on= ['B_S_O'], how='left')
agg_df = aggregation(df, ['ballIdx'], ['O', 'b1', 'b2', 'b3'], ['mean', 'std'])
output_df = pd.merge(output_df, agg_df, on=['ballIdx'], how='left')
agg_df = aggregation(df, ['baseIdx'], ['B', 'S'], ['mean', 'std'])
output_df = pd.merge(output_df, agg_df, on=['baseIdx'], how='left')
agg_df = aggregation(df, ['inningNumber'], ['S', 'B', 'b1', 'b2', 'b3'], ['mean', 'std'])
output_df = pd.merge(output_df, agg_df, on=['inningNumber'], how='left')
agg_df = aggregation(df, ['outCount'], ['S', 'B', 'b1', 'b2', 'b3'], ['mean', 'std'])
output_df = pd.merge(output_df, agg_df, on=['outCount'], how='left')
agg_df = smooth_aggregation(df, ['pitcher'], ['baseIdx', 'ballIdx'], ['mean', 'std'])
output_df = pd.merge(output_df, agg_df, on=['pitcher'], how='left')
output_df[agg_df.columns] = output_df[agg_df.columns].fillna(0)
agg_df = smooth_aggregation(df, ['batter'], ['baseIdx', 'ballIdx'], ['mean', 'std'])
output_df = pd.merge(output_df, agg_df, on=['batter'], how='left')
output_df[agg_df.columns] = output_df[agg_df.columns].fillna(0)
# pivot
output_df = get_pivot_PCA18_features(output_df, n=4, value_col='ballIdx')
output_df = get_pivot_PCA18_features(output_df, n=4, value_col='baseIdx')
output_df = get_pivot_PCA27_features(output_df, n=6, value_col='ballIdx')
output_df = get_pivot_PCA27_features(output_df, n=6, value_col='baseIdx')
output_df = get_pivot_PCA54_features(output_df, n=8, value_col='ballIdx')
output_df = get_pivot_PCA54_features(output_df, n=8, value_col='baseIdx')
output_df = get_pivot_NMF18_features(output_df, n=2, value_col='ballIdx')
output_df = get_pivot_NMF18_features(output_df, n=2, value_col='baseIdx')
output_df = get_pivot_NMF27_features(output_df, n=2, value_col='ballIdx')
output_df = get_pivot_NMF27_features(output_df, n=2, value_col='baseIdx')
output_df = get_pivot_NMF54_features(output_df, n=2, value_col='ballIdx')
output_df = get_pivot_NMF54_features(output_df, n=2, value_col='baseIdx')
# next/previous
output_df = get_next_data(output_df, value_col='b1_b2_b3', nan_value=8)
output_df = get_next_diff(output_df, value_col='b1_b2_b3', nan_value=8)
output_df = get_prev_data(output_df, value_col='b1_b2_b3', nan_value=8)
output_df = get_prev_diff(output_df, value_col='b1_b2_b3', nan_value=8)
output_df['runnerCombi'] = output_df['b1_b2_b3'] + 8 * output_df['next_b1_b2_b3']
# TF-IDF
output_df = get_tfidf(output_df, term_col='batter', document_col='subGameID')
output_df = get_tfidf(output_df, term_col='b1_b2_b3', document_col='subGameID')
output_df = get_tfidf(output_df, term_col='B_S_O', document_col='subGameID')
# skip
output_df = get_skip(output_df)
output_df['move4'] = (output_df['next_b1_b2_b3'] == 0) & (output_df['next_skip'] == 2)
output_df['move3'] = (output_df['next_b1_b2_b3'] == 4) & (output_df['next_skip'] == 2)
output_df['move2'] = (output_df['next_b1_b2_b3']%4 == 2) & (output_df['next_skip'] == 2)
output_df['move1'] = (output_df['next_b1_b2_b3']%2 == 1) & (output_df['next_skip'] == 2)
# target encoding
enc_cols = [
'bottomTeam', 'topTeam', 'pitcherCommon', 'batterCommon',
'B', 'S', 'O', 'b1', 'b2', 'b3',
'b1_b2_b3', 'B_S_O',
'ballIdx', 'baseIdx',
'next_b1_b2_b3', 'next_diff_b1_b2_b3', 'prev_b1_b2_b3', 'prev_diff_b1_b2_b3', 'runnerCombi',
]
for col in enc_cols:
for i in range(N_CLASS):
output_df = target_encoding(output_df, col, 'gameID', f'y{i}')
return reduce_mem_usage(output_df)
base_df = get_base_features(input_df)
display(base_df)
base_df.info()
sampling_df = random_sampling(base_df, n_sample=N_SAMPLE)
display(sampling_df)
sampling_df.info()
prep_df = preprocess(sampling_df, base_df)
prep_df.info()
del input_df, base_df, sampling_df
gc.collect()
# Show missing values
def display_missing(df):
total = df.isnull().sum().sort_values(ascending=False)
percent = total / df.shape[0]
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
display(missing_data[missing_data['Total'] > 0])
pd.set_option('display.max_rows', 200)
display_missing(prep_df)
# Settings
N_FOLDS = 5
N_LOOPS = 3
SEED = 42
drop_cols = [
'id',
'gameID',
'subGameID',
'pitchType',
'speed',
'ballPositionLabel',
'ballX',
'ballY',
'dir',
'dist',
'battingType',
'isOuts',
'startDayTime',
'pitcher',
'batter',
] + [f'y{i}' for i in range(8)]
target_col = 'y'
group_col = 'gameID'
tr_ratio = prep_df.loc[prep_df['y'].notnull(), 'B_S_O'].value_counts(normalize=True).sort_index()
ts_ratio = prep_df.loc[prep_df['y'].isnull(), 'B_S_O'].value_counts(normalize=True).sort_index()
ratio_dict = (ts_ratio / tr_ratio).to_dict()
sample_weight = prep_df.loc[prep_df['y'].notnull(), 'B_S_O'].map(ratio_dict)
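# The distribution of count states differs between train and test, so each
# train row is weighted by the test/train frequency ratio of its B_S_O
# state (a simple covariate-shift correction). With made-up frequencies:
# a state seen 10% of the time in train but 20% in test gets weight 2.0;
# one seen 30% in train but 20% in test gets weight ~0.67.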
def f1_macro(y_true, y_pred):
return 'f1_macro', f1_score(y_true, np.argmax(y_pred.reshape(N_CLASS,-1), axis=0), average='macro'), True
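# Note: LightGBM's sklearn API passes a custom multiclass eval metric the
# predictions as one flat array grouped by class (all samples' class-0
# scores first, then class 1, ...), hence reshape(N_CLASS, -1) and the
# argmax over axis 0.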
def feature_importance(prep_df, drop_cols, target_col, group_col):
lgb_params = {
'objective': 'multiclass',
'num_class': N_CLASS,
'importance_type': 'gain',
'n_jobs': -1,
'class_weight': 'balanced',
'random_state': SEED,
'n_estimators': 500, 'learning_rate': 0.1,
'boosting_type': 'gbdt',
'num_leaves': 20, 'colsample_bytree': 0.6,
'subsample': 0.9, 'subsample_freq': 4,
'min_child_samples': 70, 'min_child_weight': 4, 'max_bin': 160,
'reg_alpha': 0.002, 'reg_lambda': 0.0004
}
seed_everything(seed=SEED)
train = prep_df[prep_df[target_col].notnull()]
kf = GroupKFold(n_splits=N_FOLDS)
y_oof = np.zeros(len(train), dtype=np.float32)
importances = []
for fold,(tr_idx,vl_idx) in enumerate(kf.split(X=train, groups=train[group_col])):
tr_fold = train.iloc[tr_idx]
vl_fold = train.iloc[vl_idx]
X_train, y_train = tr_fold.drop([target_col] + drop_cols, axis=1), tr_fold[target_col]
X_valid, y_valid = vl_fold.drop([target_col] + drop_cols, axis=1), vl_fold[target_col]
model = lgb.LGBMClassifier(**lgb_params)
model.fit(
X_train, y_train,
eval_set=(X_valid, y_valid),
eval_metric=f1_macro,
sample_weight=sample_weight.iloc[tr_idx],
verbose=False,
early_stopping_rounds=50
)
importances.append(
pd.DataFrame(model.feature_importances_, index=X_train.columns, columns=['importance'])
)
y_oof[vl_idx] = model.predict(X_valid)
score = f1_score(y_valid, y_oof[vl_idx], average='macro')
print(f'fold:{fold}, score:{score}')
score = f1_score(train[target_col], y_oof, average='macro')
print(f'score:{score}')
importance = pd.concat(importances, axis=1).mean(axis=1)
return importance.sort_values(axis=0, ascending=False)
importance = feature_importance(prep_df, drop_cols, target_col, group_col)
# GroupKFold with random shuffling, exposed through a sklearn-like interface
from sklearn.model_selection import KFold
class RandomGroupKFold:
def __init__(self, n_splits=4, shuffle=True, random_state=42):
self.n_splits = n_splits
self.shuffle = shuffle
self.random_state = random_state
def get_n_splits(self, X=None, y=None, groups=None):
return self.n_splits
def split(self, X=None, y=None, groups=None):
kf = KFold(n_splits=self.n_splits, shuffle=self.shuffle, random_state=self.random_state)
unique_ids = groups.unique()
for tr_group_idx, va_group_idx in kf.split(unique_ids):
# split group
tr_group, va_group = unique_ids[tr_group_idx], unique_ids[va_group_idx]
train_idx = np.where(groups.isin(tr_group))[0]
val_idx = np.where(groups.isin(va_group))[0]
yield train_idx, val_idx
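# Usage sketch: folds are disjoint in the group column but reshuffled per
# seed, which is what lets the N_LOOPS below average over different game
# partitions (toy groups for illustration):
toy_groups = pd.Series([1, 1, 2, 2, 3, 3, 4, 4])
for tr_idx, va_idx in RandomGroupKFold(n_splits=2, random_state=0).split(groups=toy_groups):
    print(sorted(set(toy_groups.iloc[va_idx])))  # group-disjoint folds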
# Settings
N_FOLDS = 5
N_LOOPS = 5
drop_cols += importance.index[200:].to_list()
def f1_macro(y_true, y_pred):
return 'f1_macro', f1_score(y_true, np.argmax(y_pred.reshape(N_CLASS,-1), axis=0), average='macro'), True
def train_predict(prep_df, drop_cols, target_col, group_col):
lgb_params = {
'objective': 'multiclass',
'num_class': N_CLASS,
'importance_type': 'gain',
'n_jobs': -1,
'class_weight': 'balanced',
'n_estimators': 500, 'learning_rate': 0.1,
'boosting_type': 'gbdt',
'num_leaves': 28, 'colsample_bytree': 0.5578623473768867,
'subsample': 0.3346569905107538, 'subsample_freq': 53,
'min_child_samples': 92, 'min_child_weight': 48.29721873685813, 'max_bin': 172,
'reg_alpha': 0.01599491014809227, 'reg_lambda': 0.00011822980622796684
}
seed_everything(seed=SEED)
train = prep_df[prep_df[target_col].notnull()]
test = prep_df[prep_df[target_col].isnull()]
y_preds = []
scores = []
for n in range(N_LOOPS):
kf = RandomGroupKFold(n_splits=N_FOLDS, random_state=SEED + n)
lgb_params['random_state'] = SEED + n
y_oof = np.zeros(len(train), dtype=np.float32)
for fold, (tr_idx, vl_idx) in enumerate(kf.split(X=train, groups=train[group_col])):
tr_fold = train.iloc[tr_idx]
vl_fold = train.iloc[vl_idx]
X_train, y_train = tr_fold.drop([target_col] + drop_cols, axis=1), tr_fold[target_col]
X_valid, y_valid = vl_fold.drop([target_col] + drop_cols, axis=1), vl_fold[target_col]
X_test = test.drop([target_col] + drop_cols, axis=1)
model = lgb.LGBMClassifier(**lgb_params)
model.fit(
X_train, y_train,
eval_set=(X_valid, y_valid),
eval_metric=f1_macro,#'logloss',
sample_weight=sample_weight.iloc[tr_idx],
verbose=False,
early_stopping_rounds=100 if lgb_params['boosting_type'] != 'dart' else None
)
y_oof[vl_idx] = model.predict(X_valid)
score = f1_score(y_valid, y_oof[vl_idx], average='macro')
print(f'loop:{n}, fold:{fold}, score:{score}')
y_preds.append(model.predict_proba(X_test))
score = f1_score(train[target_col], y_oof, average='macro')
print(f'loop:{n}, score:{score}')
scores.append(score)
y_pred = np.mean(y_preds, axis=0)
score = np.mean(scores)
print(f'score ave.:{np.mean(score)}')
return y_pred, score
y_pred, score = train_predict(prep_df, drop_cols, target_col, group_col)
# Write out the submission file
submit_df = pd.DataFrame({'y': y_pred.argmax(axis=1).astype(int)})
submit_df.index.name = 'id'
submit_df.to_csv(f'sub_{score:.4f}.csv')