LightGBM Baseline (CV=0.2228, LB=0.16746)

import pandas as pd
import numpy as np
import random
import os

from tqdm.notebook import tqdm
import lightgbm as lgb
from sklearn.metrics import f1_score
from sklearn.model_selection import GroupKFold
# Reduce memory usage
def reduce_mem_usage(df, verbose=False):
    """Return ``df`` rebuilt with its numeric columns downcast to smaller dtypes.

    Integer columns go through ``pd.to_numeric(downcast='integer')`` and float
    columns through ``pd.to_numeric(downcast='float')``; every other column is
    carried over untouched.  The original column order is restored afterwards.

    Args:
        df: DataFrame to shrink.
        verbose: When true, print the memory footprint (in MB) before and
            after, together with the percentage saved.

    Returns:
        The re-assembled DataFrame.
    """
    bytes_per_mb = 1024**2
    mem_before = df.memory_usage().sum() / bytes_per_mb
    column_order = df.columns.to_list()

    untouched = df.select_dtypes(exclude=['integer', 'float'])
    small_ints = df.select_dtypes(include=['integer']).apply(pd.to_numeric, downcast='integer')
    small_floats = df.select_dtypes(include=['float']).apply(pd.to_numeric, downcast='float')

    # Joining the three parts scrambles the columns, so put them back in order.
    df = untouched.join([small_ints, small_floats]).loc[:, column_order]
    mem_after = df.memory_usage().sum() / bytes_per_mb

    if verbose:
        saved_pct = 100 * (mem_before - mem_after) / mem_before
        print(f'{mem_before:.2f}Mb->{mem_after:.2f}Mb({saved_pct:.1f}% reduction)')
    return df
# Initialise the random seeds
def seed_everything(seed=42):
    """Seed Python's ``random`` and NumPy's global RNG and set ``PYTHONHASHSEED``.

    Args:
        seed: Integer seed applied to all three.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)
# Settings
INPUT_PATH = os.path.join('..', 'input')  # directory the input CSV files are read from
N_CLASS = 8  # number of target classes (passed to LightGBM as num_class)
SEED = 42  # base random seed
N_SAMPLE = 5  # n_sample passed to random_sampling
N_FOLDS = 5  # number of splits of the group K-fold
N_LOOPS = 3  # number of repeated CV loops, each with a different seed

game_info.csv 処理

# Read game_info.csv (its first column becomes the index) and downcast numeric columns
game_df = reduce_mem_usage(pd.read_csv(os.path.join(INPUT_PATH, 'game_info.csv'), index_col=0))
# NOTE(review): `display` is not imported in this file; presumably the Jupyter/IPython builtin.
display(game_df)
game_df.info()

train_data.csv 処理

# Read train_data.csv and downcast numeric columns
tr_df = reduce_mem_usage(pd.read_csv(os.path.join(INPUT_PATH, 'train_data.csv')))
# Remove rows that are duplicates once the 'id' column is ignored
print('duplicated lines:', tr_df.drop('id', axis=1).duplicated().sum())
tr_df = tr_df[~tr_df.drop('id', axis=1).duplicated()]

# Merge the game_info columns (all except bgTop / bgBottom) onto each row by gameID
tr_df = pd.merge(tr_df, game_df.drop(['bgTop', 'bgBottom'], axis=1), on='gameID', how='left')

# Disambiguate players sharing the same name by appending '@<team>'.
# Rows whose inning contains '表' tag the batter with topTeam and the pitcher
# with bottomTeam; all remaining rows use the opposite teams.
f = tr_df['inning'].str.contains('表')
tr_df.loc[ f, 'batter'] = tr_df.loc[ f, 'batter'] + '@' + tr_df.loc[ f, 'topTeam'].astype(str)
tr_df.loc[~f, 'batter'] = tr_df.loc[~f, 'batter'] + '@' + tr_df.loc[~f, 'bottomTeam'].astype(str)
tr_df.loc[ f, 'pitcher'] = tr_df.loc[ f, 'pitcher'] + '@' + tr_df.loc[ f, 'bottomTeam'].astype(str)
tr_df.loc[~f, 'pitcher'] = tr_df.loc[~f, 'pitcher'] + '@' + tr_df.loc[~f, 'topTeam'].astype(str)
display(tr_df)
tr_df.info()

test_data.csv 処理

# Read test_data.csv and downcast numeric columns
ts_df = reduce_mem_usage(pd.read_csv(os.path.join(INPUT_PATH, 'test_data.csv')))
# Merge the game_info columns (all except bgTop / bgBottom) onto each row by gameID
ts_df = pd.merge(ts_df, game_df.drop(['bgTop', 'bgBottom'], axis=1), on='gameID', how='left')

# Disambiguate players sharing the same name by appending '@<team>'.
# Rows whose inning contains '表' tag the batter with topTeam and the pitcher
# with bottomTeam; all remaining rows use the opposite teams.
f = ts_df['inning'].str.contains('表')
ts_df.loc[ f, 'batter'] = ts_df.loc[ f, 'batter'] + '@' + ts_df.loc[ f, 'topTeam'].astype(str)
ts_df.loc[~f, 'batter'] = ts_df.loc[~f, 'batter'] + '@' + ts_df.loc[~f, 'bottomTeam'].astype(str)
ts_df.loc[ f, 'pitcher'] = ts_df.loc[ f, 'pitcher'] + '@' + ts_df.loc[ f, 'bottomTeam'].astype(str)
ts_df.loc[~f, 'pitcher'] = ts_df.loc[~f, 'pitcher'] + '@' + ts_df.loc[~f, 'topTeam'].astype(str)
display(ts_df)
ts_df.info()

train、test 間の情報取得

# Collect the pitchers that appear in both train and test.
# The two prints show how many train rows / test rows belong to such a pitcher.
tr_pitcher = set(tr_df['pitcher'].unique())
ts_pitcher = set(ts_df['pitcher'].unique())
print(tr_df['pitcher'].isin(tr_pitcher & ts_pitcher).sum())
print(ts_df['pitcher'].isin(tr_pitcher & ts_pitcher).sum())

# Collect the batters that appear in both train and test.
# The two prints show how many train rows / test rows belong to such a batter.
tr_batter = set(tr_df['batter'].unique())
ts_batter = set(ts_df['batter'].unique())
print(tr_df['batter'].isin(tr_batter & ts_batter).sum())
print(ts_df['batter'].isin(tr_batter & ts_batter).sum())

train、test結合

# Stack train_data and test_data into one frame with a fresh 0..n-1 index
input_df = pd.concat([tr_df, ts_df], axis=0).reset_index(drop=True)

# pitcherHand and batterHand: missing values become 'R'
input_df['pitcherHand'] = input_df['pitcherHand'].fillna('R')
input_df['batterHand'] = input_df['batterHand'].fillna('R')

# Pitch type: missing values become '-'
input_df['pitchType'] = input_df['pitchType'].fillna('-')

# Pitch speed: strip the 'km/h' unit, turn a bare '-' into '135', cast to float,
# then fill the values that are still missing with 0
input_df['speed'] = input_df['speed'].str.replace('km/h', '').replace('-', '135').astype(float)
input_df['speed'] = input_df['speed'].fillna(0)

# Pitch location label: missing values become '中心'
input_df['ballPositionLabel'] = input_df['ballPositionLabel'].fillna('中心')

# Pitch X coordinate (1-21): missing values become 0
input_df['ballX'] = input_df['ballX'].fillna(0).astype(int)

# Pitch Y coordinate: letters A-K -> 1-11, anything else (including missing) -> 0
input_df['ballY'] = input_df['ballY'].map({chr(ord('A')+i):i+1 for i in range(11)})
input_df['ballY'] = input_df['ballY'].fillna(0).astype(int)

# Batted-ball direction: letters A-Z -> 1-26, anything else (including missing) -> 0.
# The letters must be read from 'dir' itself: 'ballY' has already been converted
# to integers above, so mapping it through a letter table would yield only 0s.
input_df['dir'] = input_df['dir'].map({chr(ord('A')+i):i+1 for i in range(26)})
input_df['dir'] = input_df['dir'].fillna(0).astype(int)

# Batted-ball distance: missing values become 0
input_df['dist'] = input_df['dist'].fillna(0)

# Batted-ball type: missing values become 'G'
input_df['battingType'] = input_df['battingType'].fillna('G')

# Whether the pitch resulted in an out: missing values become -1
input_df['isOuts'] = input_df['isOuts'].fillna('-1').astype(int)

display(input_df)
input_df.info()

# The separate frames are no longer needed once they are combined
del tr_df, ts_df, game_df

基礎特徴量

from sklearn.preprocessing import LabelEncoder
def get_base_features(input_df):
    """Build the base feature table from the combined train/test frame.

    Steps performed on a copy of ``input_df``:

    * ``inning`` is replaced by ``2 * (first character as int - 1)`` plus 1 when
      the text contains '裏'.
      NOTE(review): only the first character is parsed, so a two-digit inning
      number would be misread - confirm the data contains none.
    * ``pitcherCommon`` / ``batterCommon`` copy ``pitcher`` / ``batter`` but are
      NaN for players not present in both train and test (module-level sets
      ``tr_pitcher``, ``ts_pitcher``, ``tr_batter``, ``ts_batter``).
    * every object-dtype column is label-encoded; nulls become -1.
    * ``inningHalf``, ``inningNumber``, ``outCount``, ``B_S_O`` and ``b1_b2_b3``
      are derived from the encoded columns.

    Returns:
        The feature frame after ``reduce_mem_usage``.
    """
    seed_everything(seed=SEED)
    feats = input_df.copy()

    inning_digit = feats['inning'].str[0].astype(int)
    is_second_half = feats['inning'].str.contains('裏')
    feats['inning'] = 2 * (inning_digit - 1) + is_second_half

    # Keep a player's identity only when it is seen in both train and test.
    shared_pitchers = tr_pitcher & ts_pitcher
    shared_batters = tr_batter & ts_batter
    feats['pitcherCommon'] = feats['pitcher'].where(feats['pitcher'].isin(shared_pitchers))
    feats['batterCommon'] = feats['batter'].where(feats['batter'].isin(shared_batters))

    # Label encoding: non-null values get LabelEncoder codes, nulls get -1.
    for col in feats.select_dtypes(include=['object']).columns:
        present = feats[col].notnull()
        codes = np.full(len(feats), -1, dtype=int)
        codes[present.values] = LabelEncoder().fit_transform(feats.loc[present, col].values)
        feats[col] = codes

    feats['inningHalf'] = feats['inning'] % 2
    feats['inningNumber'] = feats['inning'] // 2
    feats['outCount'] = feats['inning'] * 3 + feats['O']
    feats['B_S_O'] = feats['B'] + 4 * (feats['S'] + 3 * feats['O'])
    # Base occupancy packed as a 3-bit number: b1 -> 1, b2 -> 2, b3 -> 4.
    feats['b1_b2_b3'] = feats['b1'] * 1 + feats['b2'] * 2 + feats['b3'] * 4

    return reduce_mem_usage(feats)

ランダムサンプリング

def random_sampling(input_df, n_sample=10):
    """Draw ``n_sample`` one-row-per-out resamplings of the labelled rows.

    For each ``i`` in ``range(n_sample)`` one row is sampled (with
    ``random_state=i``) from every ``(gameID, outCount)`` group of the rows
    whose ``y`` is not null, and tagged with
    ``subGameID = gameID * n_sample + i``.  Rows whose ``y`` is null are not
    sampled; they are appended once with ``subGameID = gameID * n_sample``.

    Returns:
        The concatenation of all sampled frames followed by the null-``y`` rows.
    """
    labelled = input_df[input_df['y'].notnull()].copy()
    unlabelled = input_df[input_df['y'].isnull()].copy()

    def _draw(seed):
        # One random row per (gameID, outCount), reproducible through `seed`.
        picked = (
            labelled.groupby(['gameID', 'outCount'])
            .apply(lambda grp: grp.sample(n=1, random_state=seed))
            .reset_index(drop=True)
        )
        picked['subGameID'] = picked['gameID'] * n_sample + seed
        return picked

    samples = [_draw(i) for i in tqdm(range(n_sample))]
    unlabelled['subGameID'] = unlabelled['gameID'] * n_sample
    return pd.concat(samples + [unlabelled], axis=0)

集約特徴量

# Aggregation helper
def aggregation(input_df, group_keys, group_values, agg_methods):
    """Attach group-wise aggregates of ``group_values`` to ``input_df``.

    For every aggregation method and every value column, the column is
    aggregated per ``group_keys`` group and stored under the name
    ``agg_<method>_<column>_grpby_<keys joined by '_'>``.  A callable method
    contributes its ``__name__`` to the column name.

    Args:
        input_df: Source DataFrame.
        group_keys: List of column names to group by.
        group_values: List of column names to aggregate.
        agg_methods: List of aggregation names or callables accepted by
            ``GroupBy.agg``.

    Returns:
        Tuple ``(merged, columns)``: ``input_df`` left-merged with the
        aggregates, and the column list of the aggregate table (the group keys
        followed by the new feature names).
    """
    key_suffix = '_'.join(group_keys)
    pieces = []
    for method in agg_methods:
        for value_col in group_values:
            method_label = method.__name__ if callable(method) else method
            feature_name = f'agg_{method_label}_{value_col}_grpby_' + key_suffix
            grouped = input_df[[value_col] + group_keys].groupby(group_keys)[[value_col]]
            piece = grouped.agg(method)
            piece.columns = [feature_name]
            pieces.append(piece)
    agg_table = pd.concat(pieces, axis=1).reset_index()

    merged = pd.merge(input_df, agg_table, on=group_keys, how='left')
    return merged, list(agg_table.columns)
def get_agg_gameID_inningHalf_features(input_df):
    """Add mean and std of S, B, b1, b2, b3 per ``(subGameID, inningHalf)`` group.

    Returns:
        ``input_df`` with the aggregate columns merged in, after
        ``reduce_mem_usage``.
    """
    merged, _ = aggregation(
        input_df,
        group_keys=['subGameID', 'inningHalf'],
        group_values=['S', 'B', 'b1', 'b2', 'b3'],
        agg_methods=['mean', 'std'],
    )
    return reduce_mem_usage(merged)

pivot table 特徴量

from sklearn.decomposition import NMF
from sklearn.preprocessing import MinMaxScaler

# Features derived from a pivot table
def get_pivot_NMF9_features(input_df, n, value_col):
    """Add NMF components of a per-game pivot reduced to 9 values per side.

    ``value_col`` is pivoted to a ``subGameID`` x ``outCount`` table (median per
    cell, NaN -> 0) and reshaped to ``(-1, 18, 3)``.  The even-indexed and the
    odd-indexed groups of 3 are each collapsed by their median, min-max scaled
    and decomposed with NMF into ``n`` components.  The components are merged
    back on ``subGameID`` as ``pivot_<value_col>_NMF9T=<k>`` (even groups) and
    ``pivot_<value_col>_NMF9B=<k>`` (odd groups).

    The reshape requires the pivot to have exactly 54 ``outCount`` columns.
    """
    pivot_df = pd.pivot_table(input_df, index='subGameID', columns='outCount', values=value_col, aggfunc=np.median)
    grouped = pivot_df.fillna(0).values.reshape(-1, 54 // 3, 3)
    scaled_even = MinMaxScaler().fit_transform(np.median(grouped[:, 0::2, :], axis=-1))
    scaled_odd = MinMaxScaler().fit_transform(np.median(grouped[:, 1::2, :], axis=-1))

    nmf = NMF(n_components=n, random_state=2021)
    parts = []
    for tag, scaled in (('T', scaled_even), ('B', scaled_odd)):
        part = pd.DataFrame(nmf.fit_transform(scaled), index=pivot_df.index)
        part.columns = [f'pivot_{value_col}_NMF9{tag}={k:02}' for k in part.columns]
        parts.append(part)

    components = pd.concat(parts, axis=1)
    merged = pd.merge(input_df, components, left_on='subGameID', right_index=True, how='left')
    return reduce_mem_usage(merged)

# Features derived from a pivot table
def get_pivot_NMF27_features(input_df, n, value_col):
    """Add NMF components of a per-game pivot split into two 27-value halves.

    ``value_col`` is pivoted to a ``subGameID`` x ``outCount`` table (median per
    cell, NaN -> 0) and reshaped to ``(-1, 18, 3)``.  The even-indexed groups of
    3 are flattened to 27 values per game, as are the odd-indexed groups; each
    half is min-max scaled and decomposed with NMF into ``n`` components.  The
    components are merged back on ``subGameID`` as
    ``pivot_<value_col>_NMF27T=<k>`` (even groups) and
    ``pivot_<value_col>_NMF27B=<k>`` (odd groups).

    The reshape requires the pivot to have exactly 54 ``outCount`` columns.
    """
    pivot_df = pd.pivot_table(input_df, index='subGameID', columns='outCount', values=value_col, aggfunc=np.median)
    grouped = pivot_df.fillna(0).values.reshape(-1, 54 // 3, 3)
    scaled_even = MinMaxScaler().fit_transform(grouped[:, 0::2].reshape(-1, 27))
    scaled_odd = MinMaxScaler().fit_transform(grouped[:, 1::2].reshape(-1, 27))

    nmf = NMF(n_components=n, random_state=2021)
    parts = []
    for tag, scaled in (('T', scaled_even), ('B', scaled_odd)):
        part = pd.DataFrame(nmf.fit_transform(scaled), index=pivot_df.index)
        part.columns = [f'pivot_{value_col}_NMF27{tag}={k:02}' for k in part.columns]
        parts.append(part)

    components = pd.concat(parts, axis=1)
    merged = pd.merge(input_df, components, left_on='subGameID', right_index=True, how='left')
    return reduce_mem_usage(merged)

# Features derived from a pivot table
def get_pivot_NMF54_features(input_df, n, value_col):
    """Add NMF components of the full per-game ``outCount`` pivot.

    ``value_col`` is pivoted to a ``subGameID`` x ``outCount`` table (median per
    cell, NaN -> 0), min-max scaled and decomposed with NMF into ``n``
    components, which are merged back on ``subGameID`` as
    ``pivot_<value_col>_NMF54=<k>``.
    """
    pivot_df = pd.pivot_table(input_df, index='subGameID', columns='outCount', values=value_col, aggfunc=np.median)
    scaled = MinMaxScaler().fit_transform(pivot_df.fillna(0).values)
    weights = NMF(n_components=n, random_state=2021).fit_transform(scaled)

    components = pd.DataFrame(weights, index=pivot_df.index)
    components.columns = [f'pivot_{value_col}_NMF54={k:02}' for k in components.columns]

    merged = pd.merge(input_df, components, left_on='subGameID', right_index=True, how='left')
    return reduce_mem_usage(merged)

前後特徴量

def _pivot_transform(input_df, value_col, transform, in_inning, aggfunc):
    """Pivot ``value_col`` by game and out, transform column groups, and re-stack.

    The pivot has one row per ``subGameID`` and one column per ``outCount``.
    Its columns are split into groups and ``transform`` is applied to each
    group separately, so values never leak from one group into the next:

    * ``in_inning=True``: 18 groups of three columns, ``6*k + 0..2`` and
      ``6*k + 3..5`` for ``k`` in ``range(9)``.
    * ``in_inning=False``: 2 groups, all ``6*k + 0..2`` columns together and
      all ``6*k + 3..5`` columns together.

    All 54 ``outCount`` columns (0-53) must exist in the pivot.

    Returns:
        Series indexed by ``(subGameID, outCount)``.
    """
    pivot_df = pd.pivot_table(input_df, index='subGameID', columns='outCount', values=value_col, aggfunc=aggfunc)
    offsets_pair = (range(0, 3), range(3, 6))
    if in_inning:
        column_groups = [
            [out + inning * 6 for out in offsets]
            for inning in range(9)
            for offsets in offsets_pair
        ]
    else:
        column_groups = [
            [out + inning * 6 for inning in range(9) for out in offsets]
            for offsets in offsets_pair
        ]
    pieces = [transform(pivot_df.loc[:, cols]) for cols in column_groups]
    return pd.concat(pieces, axis=1).stack()


def _attach_feature(input_df, feature, name, nan_value):
    """Left-merge ``feature`` onto ``input_df`` by ``(subGameID, outCount)`` as ``name``.

    When ``nan_value`` is not None, missing values of the new column are
    replaced by it.  The fill is written back by assignment: calling
    ``fillna(..., inplace=True)`` on ``output_df[name]`` acts on a temporary
    object and leaves the frame unchanged under pandas Copy-on-Write.
    """
    feature.name = name
    output_df = pd.merge(
        input_df, feature, left_on=['subGameID', 'outCount'], right_index=True, how='left')
    if nan_value is not None:
        output_df[name] = output_df[name].fillna(nan_value)
    return output_df


def get_diff_feature(input_df, value_col, periods, in_inning=True, aggfunc=np.median):
    """Return ``DataFrame.diff(periods)`` of ``value_col`` along ``outCount``.

    Args:
        input_df: Frame with ``subGameID``, ``outCount`` and ``value_col``.
        value_col: Column to pivot and difference.
        periods: Passed to ``diff``; negative values difference against later outs.
        in_inning: Whether differences are confined to three-out groups
            (see ``_pivot_transform``).
        aggfunc: Aggregation used for the pivot cells.

    Returns:
        Series indexed by ``(subGameID, outCount)``.
    """
    return _pivot_transform(
        input_df, value_col, lambda block: block.diff(periods, axis=1), in_inning, aggfunc)


def get_shift_feature(input_df, value_col, periods, in_inning=True, aggfunc=np.median):
    """Return ``DataFrame.shift(periods)`` of ``value_col`` along ``outCount``.

    Arguments and return value mirror ``get_diff_feature``; ``periods=1`` gives
    the previous out's value and ``periods=-1`` the next out's value.
    """
    return _pivot_transform(
        input_df, value_col, lambda block: block.shift(periods, axis=1), in_inning, aggfunc)


def get_next_data(input_df, value_col, in_inning=True, nan_value=None):
    """Add ``next_<value_col>``: the value at the following out (NaN -> ``nan_value``)."""
    feature = get_shift_feature(input_df, value_col, periods=-1, in_inning=in_inning)
    return _attach_feature(input_df, feature, 'next_' + value_col, nan_value)


def get_prev_data(input_df, value_col, in_inning=True, nan_value=None):
    """Add ``prev_<value_col>``: the value at the preceding out (NaN -> ``nan_value``)."""
    feature = get_shift_feature(input_df, value_col, periods=1, in_inning=in_inning)
    return _attach_feature(input_df, feature, 'prev_' + value_col, nan_value)


def get_next_diff(input_df, value_col, in_inning=True, nan_value=None):
    """Add ``next_diff_<value_col>``: current minus following out (NaN -> ``nan_value``)."""
    feature = get_diff_feature(input_df, value_col, periods=-1, in_inning=in_inning)
    return _attach_feature(input_df, feature, 'next_diff_' + value_col, nan_value)


def get_prev_diff(input_df, value_col, in_inning=True, nan_value=None):
    """Add ``prev_diff_<value_col>``: current minus preceding out (NaN -> ``nan_value``)."""
    feature = get_diff_feature(input_df, value_col, periods=1, in_inning=in_inning)
    return _attach_feature(input_df, feature, 'prev_diff_' + value_col, nan_value)

TF-IDF

def get_tfidf(input_df, term_col, document_col):
    """Add a TF-IDF style weight of ``term_col`` within ``document_col``.

    With, for each row,
      tf1  = number of rows sharing the row's (document, term),
      tf2  = number of rows in the row's document,
      idf1 = number of distinct documents,
      idf2 = number of distinct documents containing the row's term,
    the new column ``tfidf_<term_col>_in_<document_col>`` is
    ``log(1 + (1 + tf1) / (1 + tf2)) * (1 + log((1 + idf1) / (1 + idf2)))``.

    Returns:
        A copy of ``input_df`` with that single column appended.
    """
    work = input_df.copy()
    work['dummy'] = 0
    counted = work[[document_col, term_col, 'dummy']]

    term_in_doc = counted.groupby([document_col, term_col])['dummy'].count().rename('tf1')
    doc_size = counted.groupby([document_col])['dummy'].count().rename('tf2')
    n_docs = work[document_col].nunique()
    docs_with_term = counted.groupby([term_col])[document_col].nunique().rename('idf2')

    work = pd.merge(work, term_in_doc, left_on=[document_col, term_col], right_index=True, how='left')
    work = pd.merge(work, doc_size, left_on=[document_col], right_index=True, how='left')
    work['idf1'] = n_docs
    work = pd.merge(work, docs_with_term, left_on=[term_col], right_index=True, how='left')

    term_freq = np.log(1 + (1 + work['tf1']) / (1 + work['tf2']))
    inv_doc_freq = 1 + np.log((1 + work['idf1']) / (1 + work['idf2']))
    work['tfidf_' + term_col + '_in_' + document_col] = term_freq * inv_doc_freq

    # Only the final weight is kept; the intermediate counts are dropped.
    return work.drop(['tf1', 'tf2', 'idf1', 'idf2', 'dummy'], axis=1)

打席スキップ数

def get_skip(input_df):
    """Add ``next_skip`` / ``prev_skip`` columns describing batter-to-batter gaps.

    Within each ``(subGameID, inningHalf)`` group the batters are ordered by
    ``outCount``.  For every ordered pair of batters lying 1 to 4 rows apart the
    largest such row distance is recorded.  ``next_skip`` of a row is the value
    recorded for (its batter, the following row's batter); ``prev_skip`` is the
    value recorded for (the preceding row's batter, its batter).  Rows without
    a following / preceding row in their group get 0.

    Values are keyed by the frame's index labels, so the index is expected to
    be unique.

    Returns:
        A copy of ``input_df`` with the two int8 columns appended.
    """
    output_df = input_df.copy()

    next_skip_map, prev_skip_map = {}, {}
    for _, half in output_df.groupby(['subGameID', 'inningHalf']):
        ordered = half.sort_values('outCount')['batter']
        names = ordered.tolist()
        row_ids = ordered.index
        total = len(names)

        # Largest gap (capped at 4 rows) seen for each ordered batter pair.
        farthest = {}
        for pos, first in enumerate(names[:-1]):
            reach = min(4, total - 1 - pos)
            for gap in range(1, reach + 1):
                pair = (first, names[pos + gap])
                farthest[pair] = max(farthest.get(pair, 0), gap)

        # Adjacent rows share one pair: it is "next" for the earlier row and
        # "prev" for the later row.
        for pos in range(total - 1):
            gap = farthest[(names[pos], names[pos + 1])]
            next_skip_map[row_ids[pos]] = gap
            prev_skip_map[row_ids[pos + 1]] = gap

    output_df['next_skip'] = output_df.index.map(next_skip_map).fillna(0).astype(np.int8)
    output_df['prev_skip'] = output_df.index.map(prev_skip_map).fillna(0).astype(np.int8)
    return output_df

特徴量演算

# Runs every feature-building function in sequence
def preprocess(input_df):
    """Apply the full feature pipeline to a sampled frame and return the result.

    Order of the steps: group aggregates, the three pivot/NMF feature sets,
    next/previous values and differences of ``b1_b2_b3`` (missing -> 8),
    TF-IDF of ``batter`` within ``subGameID``, and finally the batter skip
    counts.
    """
    seed_everything(seed=SEED)

    # aggregation
    feats = get_agg_gameID_inningHalf_features(input_df.copy())

    # pivot
    for nmf_builder in (get_pivot_NMF9_features, get_pivot_NMF27_features, get_pivot_NMF54_features):
        feats = nmf_builder(feats, n=2, value_col='b1_b2_b3')

    # next/previous
    for neighbour_builder in (get_next_data, get_next_diff, get_prev_data, get_prev_diff):
        feats = neighbour_builder(feats, value_col='b1_b2_b3', nan_value=8)

    # TF-IDF
    feats = get_tfidf(feats, term_col='batter', document_col='subGameID')

    # skip
    return get_skip(feats)
# Run the feature pipeline: base features -> random sampling -> derived features
base_df = get_base_features(input_df)
display(base_df)
base_df.info()
# Resample the labelled rows N_SAMPLE times (one row per gameID/outCount each time)
sampling_df = random_sampling(base_df, n_sample=N_SAMPLE)
display(sampling_df)
sampling_df.info()
# Add the aggregate / pivot / neighbour / TF-IDF / skip features
prep_df = preprocess(sampling_df)
prep_df.info()

学習/予測

# Columns removed from the model inputs in train_predict (together with target_col)
drop_cols = [
    'id',
    'gameID',
    'subGameID',

    'pitchType',
    'speed',
    'ballPositionLabel',
    'ballX',
    'ballY',
    'dir',
    'dist',
    'battingType',
    'isOuts',

    'startDayTime',
    'startTime',
    'pitcher',
    'batter',
]
target_col = 'y'  # label column; rows where it is null are treated as the test set
group_col = 'gameID'  # grouping column for the group K-fold split
def f1_macro(y_true, y_pred):
    """Custom LightGBM evaluation metric: macro-averaged F1 for multiclass.

    Args:
        y_true: True class labels.
        y_pred: Predicted class scores, either a 2-D array of shape
            ``(n_samples, N_CLASS)`` or a flat 1-D array laid out class by
            class (all samples of class 0, then class 1, ...).

    Returns:
        Tuple ``('f1_macro', score, True)``; the final ``True`` marks the
        metric as higher-is-better.
    """
    y_pred = np.asarray(y_pred)
    if y_pred.ndim == 2:
        # One row per sample already; reshaping this layout to (N_CLASS, -1)
        # would mix scores of different samples.
        labels = np.argmax(y_pred, axis=1)
    else:
        # Flat class-major vector: fold it to (N_CLASS, n_samples).
        labels = np.argmax(y_pred.reshape(N_CLASS, -1), axis=0)
    return 'f1_macro', f1_score(y_true, labels, average='macro'), True

# GroupKFold with random shuffle with a sklearn-like structure
from sklearn.model_selection import KFold
class RandomGroupKFold:
    """Group-aware K-fold splitter with optional shuffling of the groups.

    The unique group ids are split with ``KFold``; every sample then follows
    its group, so no group is shared between the train and validation side of
    a fold.  The interface mirrors scikit-learn splitters
    (``get_n_splits`` / ``split``).

    Args:
        n_splits: Number of folds.
        shuffle: Whether the unique group ids are shuffled before splitting.
        random_state: Seed used for the shuffle; ignored when ``shuffle`` is
            false.
    """

    def __init__(self, n_splits=4, shuffle=True, random_state=42):
        self.n_splits = n_splits
        self.shuffle = shuffle
        self.random_state = random_state

    def get_n_splits(self, X=None, y=None, groups=None):
        """Return the number of folds."""
        return self.n_splits

    def split(self, X=None, y=None, groups=None):
        """Yield ``(train_idx, val_idx)`` positional index arrays for each fold.

        Args:
            X: Unused; accepted for scikit-learn compatibility.
            y: Unused; accepted for scikit-learn compatibility.
            groups: Group label of every sample (Series or any 1-D array-like).

        Raises:
            ValueError: If ``groups`` is None.
        """
        if groups is None:
            raise ValueError("The 'groups' parameter should not be None.")
        # Accept plain lists / arrays as well as Series.
        groups = pd.Series(groups)
        # KFold raises when a random_state is given without shuffling, so the
        # seed is only forwarded when it is actually used.
        kf = KFold(
            n_splits=self.n_splits,
            shuffle=self.shuffle,
            random_state=self.random_state if self.shuffle else None,
        )
        unique_ids = groups.unique()
        for tr_group_idx, va_group_idx in kf.split(unique_ids):
            # Translate the fold's group ids back to sample positions.
            tr_group, va_group = unique_ids[tr_group_idx], unique_ids[va_group_idx]
            train_idx = np.where(groups.isin(tr_group))[0]
            val_idx = np.where(groups.isin(va_group))[0]
            yield train_idx, val_idx
def train_predict(df, drop_cols, target_col, group_col):
    """Train LightGBM classifiers with repeated group K-fold CV and predict the test rows.

    Rows of ``df`` with a non-null ``target_col`` are the training set; rows
    with a null ``target_col`` are the test set.  For each of ``N_LOOPS``
    repetitions a ``RandomGroupKFold`` with ``N_FOLDS`` splits (seeded with
    ``SEED + n``) is run; every fold fits one model, stores its out-of-fold
    class predictions and its test-set class probabilities.

    Args:
        df: Feature frame containing both training and test rows.
        drop_cols: Columns excluded from the model inputs (in addition to
            ``target_col``).
        target_col: Name of the label column.
        group_col: Name of the column whose values define the CV groups.

    Returns:
        Tuple ``(y_pred, score)``: the test-set class probabilities averaged
        over all folds and loops, and the mean over loops of the out-of-fold
        macro F1.
    """
    lgb_params = {
        'objective': 'multiclass',
        'num_class': N_CLASS,
        'importance_type': 'gain',
        'n_jobs': -1,
        'class_weight': 'balanced',

        'n_estimators': 500, 'learning_rate': 0.1,
        'boosting_type': 'gbdt',
        'num_leaves': 32, 'colsample_bytree': 0.5614,
        'subsample': 0.8975, 'subsample_freq': 73,
        'min_child_samples': 14, 'min_child_weight': 34.08, 'max_bin': 138,
        'reg_alpha': 0.05796, 'reg_lambda': 0.0002102
    }

    seed_everything(seed=SEED)
    train = df[df[target_col].notnull()]
    test = df[df[target_col].isnull()]
    y_preds = []  # one (n_test, n_class) probability array per fold per loop
    scores = []  # one out-of-fold macro F1 per loop
    for n in range(N_LOOPS):
        # A different seed per loop changes both the fold assignment and the model RNG.
        kf = RandomGroupKFold(n_splits=N_FOLDS, random_state=SEED + n)
        lgb_params['random_state'] = SEED + n

        # Out-of-fold predicted class labels (stored as float32)
        y_oof = np.zeros(len(train), dtype=np.float32)
        for fold, (tr_idx, vl_idx) in enumerate(kf.split(X=train, groups=train[group_col])):
            tr_fold = train.iloc[tr_idx]
            vl_fold = train.iloc[vl_idx]
            X_train, y_train = tr_fold.drop([target_col] + drop_cols, axis=1), tr_fold[target_col]
            X_valid, y_valid = vl_fold.drop([target_col] + drop_cols, axis=1), vl_fold[target_col]
            # NOTE(review): X_test does not depend on the fold; it is rebuilt on every iteration.
            X_test = test.drop([target_col] + drop_cols, axis=1)

            model = lgb.LGBMClassifier(**lgb_params)
            # NOTE(review): `verbose` and `early_stopping_rounds` as fit() keywords are
            # presumably tied to an older LightGBM release - confirm against the
            # installed version.
            model.fit(
                X_train, y_train,
                eval_set=(X_valid, y_valid),
                eval_metric=f1_macro,
                verbose=False,
                early_stopping_rounds=100 if lgb_params['boosting_type'] != 'dart' else None
            )

            y_oof[vl_idx] = model.predict(X_valid)
            score = f1_score(y_valid, y_oof[vl_idx], average='macro')
            print(f'loop:{n}, fold:{fold}, score:{score}')
            y_preds.append(model.predict_proba(X_test))

        # Macro F1 over the complete out-of-fold prediction of this loop
        score = f1_score(train[target_col], y_oof, average='macro')
        print(f'loop:{n}, score:{score}')
        scores.append(score)
    
    # Average the test probabilities over every fold of every loop
    y_pred = np.mean(y_preds, axis=0)
    score = np.mean(scores)
    # `score` is already a scalar here, so the extra np.mean leaves it unchanged.
    print(f'score ave.:{np.mean(score)}')
    return y_pred, score
# Train the CV models and obtain the averaged test-set class probabilities
y_pred, score = train_predict(prep_df, drop_cols, target_col, group_col)
# Write out the test predictions: the most probable class per row, indexed 0..n-1 as 'id'.
# NOTE(review): this assumes the test rows are still in their original id order - confirm.
submit_df = pd.DataFrame({'y': y_pred.argmax(axis=1).astype(int)})
submit_df.index.name = 'id'
submit_df.to_csv(f'sample_{score:.4f}.csv')

添付データ

  • sample1_lgbm.ipynb?X-Amz-Expires=600&X-Amz-Date=20211207T220617Z&X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAIP7GCBGMWPMZ42PQ
  • Favicon
    new user
    コメントするには 新規登録 もしくは ログイン が必要です。