SUMMARY

まずはじめに運営/参加者の皆様ありがとうございました。

野球は好きなスポーツなのでデータを見て楽しみながら参戦することができました。 Oreginさんが圧倒的なスコアで序盤から最後まで1位で走り切ったのが印象的なコンペになりました。個人的な目線では、この予測ターゲットはかなり運ゲーなのではないかと最後まで思っていました。だからこそOreginさんのスコアは衝撃的でした。

1位とは大きく離されたスコアでアプローチも非常にシンプルなもので、おそらくみなさんにとって別に驚きのあるものではないと思いますが、供養のために解法をシェアします。

こちらのGithubにもnotebookを公開しました。 -> https://github.com/rauta0127/probspace_basball_pub

アプローチ

1.　前処理

今回のコンペにおいて、前処理は非常に重要なものだったと思います。

特にDT-SNさんのシェアで使われていた出塁状態、ボールストライクカウントの数値化はかなり有効でした。　

当初はカテゴリ変数として扱っておりスコアが出なかったのですが、この数値化により大きくスコアを改善しました。これは勉強になりました。　

また打者投手の利き手などに欠損が見られた部分の補完は、両打ち実績がある打者の場合は投手の利き手とは逆の手を補完する判定を組み込むなど、なるべくデータを綺麗にすることを努めました。　

2. 特徴量エンジニアリング

基本的には集計特徴量をベースにしています。集計特徴量については、過学習を避けるために、学習データとテストデータで分布が異なるものを2標本コルモゴロフ–スミルノフ検定（KS検定）で検出して除く処理を行いました。

また試合ごとの打者の出現順番の特徴量（厳密には打順ではないですが、ここでは打順と呼びます。）も効きました。これらを利用した試合における打者/投手/打順のTfidfも効きました。

興味深かったのは、打者（batterCommon）ごとの打順（batting_order_in_subgameID）の統計特徴量がテストスコアに対して有効だったことです。

また、過学習を出来るだけ避けられないかと考え、集計特徴量などはPCAで圧縮を行っています。

3. モデル

LightGBMの5seeds平均アンサンブルです。個人的なポイントは、今回のタスクではローカルCVスコアを上げすぎるとパブリックスコアが大きく下がってしまう傾向があったことです。そのため、過学習しないようmax_binパラメータを小さくするなどの工夫を行いました。

4. Fold分割

gameIDごとのRandomGroupKFoldです。当初StratifiedGroupKFoldも試していましたが、リーダーボードとの相関が高かったのは結果的にRandomGroupKFoldでした。

以下解法のnotebookです。また対戦宜しくお願い致します。

# ----- Import common library -----
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set()
from tqdm import tqdm_notebook as tqdm
from glob import glob
import gc
import pickle
from time import time, sleep
import json
import pytz
import random
pd.set_option('display.max_columns', 500)
import warnings
warnings.filterwarnings('ignore')
from IPython.core.display import display

INPUT_DIR = './input'

# ====================================================
# CONFIG
# ====================================================

class CONFIG:
    """Run-wide settings for the notebook, exposed as instance attributes."""

    def __init__(self):
        # Insertion order is kept so that printing ``__dict__`` lists the
        # settings in this order.
        settings = {
            'debug': False,
            'target': 'y',                     # name of the label column
            'num_class': 8,                    # number of label classes
            'sampling_num': 10,                # passed to create_features()
            'seeds': [2021, 2022, 2023, 2024, 2025],
            'how_split': 'RandomGroupKFold',   # splitter selected in create_folds()
            'n_splits': 5,
            'group_col': 'gameID',             # grouping column for the splitter
        }
        for key, value in settings.items():
            setattr(self, key, value)
        
# Single shared configuration instance used by the cells below.
CFG = CONFIG()
    
# Echo the effective settings so they are recorded in the notebook output.
print (f"{CFG.__dict__}")

{'debug': False, 'target': 'y', 'num_class': 8, 'sampling_num': 10, 'seeds': [2021, 2022, 2023, 2024, 2025], 'how_split': 'RandomGroupKFold', 'n_splits': 5, 'group_col': 'gameID'}

def read_data(input_dir):
    """Load the competition CSVs and return one combined, time-sorted frame.

    Reads ``train_data.csv``, ``test_data_improvement.csv`` and
    ``game_info.csv`` from ``input_dir``. Train and test rows are stacked
    (flagged by a ``test`` column of 0/1), joined to the game info on
    ``gameID``, de-duplicated and sorted chronologically.

    Returns:
        (df, sample_submission): the combined frame, and a frame holding the
        test ``id`` values with a ``y`` column initialised to 0.
    """
    # Pitch outcome codes: 0 ball, 1 strike, 2 foul, 3 out, 4 single,
    # 5 double, 6 triple, 7 home run.
    train, test, game_info = (
        pd.read_csv(f'{input_dir}/{file_name}')
        for file_name in ('train_data.csv', 'test_data_improvement.csv', 'game_info.csv')
    )
    print(f'train shape = {train.shape}')
    print(f'test shape = {test.shape}')

    sample_submission = test[['id']].assign(y=0)
    print(f'sample_submission shape = {sample_submission.shape}')

    # Flag the origin of each row before stacking.
    train['test'] = 0
    test['test'] = 1
    combined = pd.concat([train, test], ignore_index=True)
    combined = combined.merge(game_info, on=['gameID'], how='left')
    combined = combined.drop(columns=['Unnamed: 0'])

    # Rows that agree on all of these keys are treated as duplicates.
    dedup_keys = ['totalPitchingCount', 'B', 'S', 'O', 'pitcher', 'batter', 'gameID', 'inning', 'startDayTime']
    combined = combined.drop_duplicates(dedup_keys)

    combined['startDayTime'] = pd.to_datetime(combined['startDayTime'])
    combined['date'] = combined['startDayTime'].dt.date

    sort_keys = ['startDayTime', 'gameID', 'inning', 'O', 'totalPitchingCount']
    combined = combined.sort_values(sort_keys).reset_index(drop=True)
    return combined, sample_submission

def create_diffence_team_feature(topTeam_values, bottomTeam_values, inning_top_values):
    """Pick the fielding team per row.

    Rows with ``inning_top_values == 0`` take the value from
    ``topTeam_values``; rows with ``inning_top_values == 1`` take it from
    ``bottomTeam_values``. All arguments are equal-length NumPy arrays.
    """
    top_half = inning_top_values == 1
    bottom_half = inning_top_values == 0

    fielding = topTeam_values.copy()
    fielding[bottom_half] = topTeam_values[bottom_half].astype(object)
    fielding[top_half] = bottomTeam_values[top_half].astype(object)
    return fielding

def create_offence_team_feature(topTeam_values, bottomTeam_values, inning_top_values):
    """Pick the batting team per row.

    Rows with ``inning_top_values == 1`` take the value from
    ``topTeam_values``; rows with ``inning_top_values == 0`` take it from
    ``bottomTeam_values``. All arguments are equal-length NumPy arrays.
    """
    top_half = inning_top_values == 1
    bottom_half = inning_top_values == 0

    batting = topTeam_values.copy()
    batting[top_half] = topTeam_values[top_half].astype(object)
    batting[bottom_half] = bottomTeam_values[bottom_half].astype(object)
    return batting

def create_pitcher_team_feature(pitcher_values, topTeam_values, bottomTeam_values, inning_top_values):
    """Build ``"<pitcher>@<team>"`` labels, one per row.

    The team part comes from ``topTeam_values`` where
    ``inning_top_values == 0`` and from ``bottomTeam_values`` where it is 1.
    The pitcher value is converted with ``astype(str)`` before concatenation.
    """
    labels = pitcher_values.copy()
    separator = np.full(labels.shape[0], "@")

    for half_flag, team_values in ((0, topTeam_values), (1, bottomTeam_values)):
        rows = inning_top_values == half_flag
        labels[rows] = (
            pitcher_values[rows].astype(str).astype(object)
            + separator[rows]
            + team_values[rows].astype(object)
        )
    return labels

def create_batter_team_feature(batter_values, topTeam_values, bottomTeam_values, inning_top_values):
    """Build ``"<batter>@<team>"`` labels, one per row.

    The team part comes from ``topTeam_values`` where
    ``inning_top_values == 1`` and from ``bottomTeam_values`` where it is 0.
    The batter value is converted with ``astype(str)`` before concatenation.
    """
    labels = batter_values.copy()
    separator = np.full(labels.shape[0], "@")

    for half_flag, team_values in ((1, topTeam_values), (0, bottomTeam_values)):
        rows = inning_top_values == half_flag
        labels[rows] = (
            batter_values[rows].astype(str).astype(object)
            + separator[rows]
            + team_values[rows].astype(object)
        )
    return labels

def fillna_pitcherHand(df):
    """Replace ``pitcherHand`` with one value per pitcher.

    The per-pitcher value is the maximum of that pitcher's non-null
    ``pitcherHand`` entries; pitchers with no known hand stay missing.
    The original column is removed from ``df`` in place, and the returned
    (merged) frame carries the rebuilt column as its last column.
    """
    known_rows = df[pd.notnull(df['pitcherHand'])]
    hand_by_pitcher = known_rows.groupby('pitcher')['pitcherHand'].max().reset_index()

    # Drop in place, as callers of the original see the column disappear too.
    del df['pitcherHand']
    return df.merge(hand_by_pitcher, on='pitcher', how='left')

def batter_isPitcher(df):
    """Add ``batter_isPitcher``: 1 when the batter also appears as a pitcher.

    Only pitchers from rows with a non-null ``pitcherHand`` are considered.
    Batters without a match get 0.
    """
    rows_with_hand = df[pd.notnull(df['pitcherHand'])]
    pitcher_counts = rows_with_hand.groupby('pitcher').size()

    # One row per known pitcher, keyed so that it joins against ``batter``.
    lookup = pitcher_counts.index.to_frame(index=False, name='batter')
    lookup['batter_isPitcher'] = 1

    merged = df.merge(lookup, on='batter', how='left')
    merged['batter_isPitcher'] = merged['batter_isPitcher'].fillna(0)
    return merged

def convert_batterHand(x, batterHand_dict):
    """Look up ``x`` in ``batterHand_dict``, returning NaN when it is absent.

    Args:
        x: key to look up (a batter identifier).
        batterHand_dict: mapping from batter identifier to batting hand.

    Returns:
        The mapped value, or ``np.nan`` if ``x`` is not a key (or is not
        hashable).
    """
    try:
        return batterHand_dict[x]
    except (KeyError, TypeError):
        # ``pd.np`` was removed in pandas 2.0, so use numpy directly. Only
        # lookup failures are caught, rather than every exception.
        return np.nan

def fillna_batterHand(df):
    """Fill missing ``batterHand`` values in three passes and return ``df``.

    1. Batters observed with two distinct hands (switch hitters) get the hand
       opposite to ``pitcherHand``.
    2. Remaining gaps are filled from the batter's own known hand (the
       maximum of that batter's non-null ``batterHand`` values).
    3. Anything still missing gets the hand opposite to ``pitcherHand``.

    ``df`` is modified in place. Rows whose ``pitcherHand`` is missing or not
    ``'R'``/``'L'`` are left unfilled by passes 1 and 3 instead of raising.
    """
    opposite_hand = {'R': 'L', 'L': 'R'}

    # Pass 1: switch hitters bat from the side opposite to the pitcher.
    batterHand_nunique = df[pd.notnull(df['batterHand'])].groupby('batter')['batterHand'].nunique()
    doubleHand_batter = list(batterHand_nunique[batterHand_nunique == 2].index)
    cond = pd.isnull(df['batterHand']) & df['batter'].isin(doubleHand_batter)
    df.loc[cond, 'batterHand'] = df.loc[cond, 'pitcherHand'].map(opposite_hand)

    # Pass 2: reuse the batter's own known hand. The lookup must be the Series
    # ``{batter: hand}`` mapping; ``DataFrame.to_dict()`` would be keyed by
    # column name and never match a batter.
    batterHand_dict = df[pd.notnull(df['batterHand'])].groupby('batter')['batterHand'].max().to_dict()
    cond = pd.isnull(df['batterHand'])
    df.loc[cond, 'batterHand'] = df.loc[cond, 'batter'].map(batterHand_dict)

    # Pass 3: last resort, assume the hand opposite to the pitcher.
    cond = pd.isnull(df['batterHand'])
    df.loc[cond, 'batterHand'] = df.loc[cond, 'pitcherHand'].map(opposite_hand)
    return df

def create_base_features(df):
    """Add row-level base features to the combined train/test frame.

    Adds count/out encodings, inning numbering, an integer ``place`` code,
    team columns, team-qualified ``pitcher``/``batter`` labels, the
    ``*Common`` columns restricted to players seen in both train and test,
    and the base-runner encoding ``base_all``. ``df`` is modified in place
    and returned.
    """
    
    # Pack the count into one number: B in the ones digit, S in the tens,
    # and (for BSO) O in the hundreds.
    df['BS'] = df['B']*(10**0) + df['S']*(10**1)
    df['BSO'] = df['B']*(10**0) + df['S']*(10**1) + df['O']*(10**2)

    # ``inning`` is split on '回': the part before it is the inning number
    # and a trailing '表' marks the top half. inning_num becomes 2n-1 for the
    # top of inning n and 2n otherwise.
    df['inning_num'] = df['inning'].map(lambda x: float(x.split('回')[0]))
    df['inning_num'] = df['inning_num'] * 2
    df['inning_top'] = df['inning'].map(lambda x: 1 if x.split('回')[-1]=='表' else 0)
    df['inning_num'] = df[['inning_num', 'inning_top']].apply(lambda x: x['inning_num']-1 if x['inning_top']==1 else x['inning_num'], axis=1)
    # NOTE(review): floor division gives n-1 for the top of inning n but n for
    # the other half, so out_cumsum below differs by 3 between the two halves
    # of the same inning - confirm this is intended.
    df['inning_num_half'] = df['inning_num'] // 2
    df['out_cumsum'] = (df['inning_num_half']-1)*3 + df['O']
    
    # Integer code per ballpark. The lambda indexes the dict directly, so a
    # place that is not listed raises KeyError.
    place_dict = {
        'PayPayドーム': 0, 
        '京セラD大阪': 1, 
        'メットライフ': 2,
        '横浜': 3, 
        '神宮': 4, 
        '東京ドーム': 5, 
        'ZOZOマリン': 6,
        '楽天生命パーク': 7, 
        'ナゴヤドーム': 8, 
        '札幌ドーム': 9, 
        'マツダスタジアム': 10, 
        '甲子園': 11, 
        'ほっと神戸': 12
    }
    df['place'] = df['place'].map(lambda x: place_dict[x])

    # Team columns first, then overwrite pitcher/batter with "<value>@<team>"
    # labels built by the helper functions.
    df['pitcherTeam'] = create_diffence_team_feature(df['topTeam'].values, df['bottomTeam'].values, df['inning_top'].values)
    df['batterTeam'] = create_offence_team_feature(df['topTeam'].values, df['bottomTeam'].values, df['inning_top'].values)
    df['pitcher'] = create_pitcher_team_feature(df['pitcher'].values, df['topTeam'].values, df['bottomTeam'].values, df['inning_top'].values)
    df['batter'] = create_batter_team_feature(df['batter'].values, df['topTeam'].values, df['bottomTeam'].values, df['inning_top'].values)

    # Collect the pitchers that appear in train and in test
    train_pitcher = set(df[df['test']==0]['pitcher'].unique())
    test_pitcher = set(df[df['test']==1]['pitcher'].unique())

    # Collect the batters that appear in train and in test
    train_batter = set(df[df['test']==0]['batter'].unique())
    test_batter = set(df[df['test']==1]['batter'].unique())

    # Keep the label only for players present in both sets; others become NaN.
    df['pitcherCommon'] = df['pitcher']
    df['batterCommon'] = df['batter']
    df.loc[~(df['pitcherCommon'].isin(train_pitcher & test_pitcher)), 'pitcherCommon'] = np.nan
    df.loc[~(df['batterCommon'].isin(train_batter & test_batter)), 'batterCommon'] = np.nan
    # NOTE(review): the inputs here already carry the "@<team>" suffix from
    # above, so the helper appends a second one; the NaN entries are also
    # stringified by the helper (presumably to "nan@<team>") - confirm both
    # effects are intended.
    df['pitcherCommon'] = create_pitcher_team_feature(df['pitcherCommon'].values, df['topTeam'].values, df['bottomTeam'].values, df['inning_top'].values)
    df['batterCommon'] = create_batter_team_feature(df['batterCommon'].values, df['topTeam'].values, df['bottomTeam'].values, df['inning_top'].values)
    
    # Pack the base state: b1 in the ones digit, b2 in the tens, b3 in the hundreds.
    df['base_all'] = df['b1']*(10**0) + df['b2']*(10**1) + df['b3']*(10**2)

    return df

def fast_groupby_sampling_idx(df, groupby_cols, sample_size, seed=42):
    """Draw ``sample_size`` row positions (with replacement) from every group.

    Seeds NumPy's global random generator with ``seed``, then samples from
    each group's positional indices and returns all draws concatenated into
    one array.
    """
    np.random.seed(seed)
    positions_by_group = df.groupby(groupby_cols, as_index=False).indices
    draws = [
        np.random.choice(positions, sample_size)
        for positions in positions_by_group.values()
    ]
    return np.concatenate(draws)


def sampling(train_df, sampling_num):
    """Build ``sampling_num`` resampled copies of the training data.

    For each round ``i`` one row is drawn per (gameID, inning, O) group using
    seed ``i``. Each round gets its own ``subgameID``, formed from
    ``gameID * 100`` followed by the zero-padded round number.

    Args:
        train_df: training rows; sampled by position.
        sampling_num: number of resampling rounds.

    Returns:
        All rounds stacked in order (original row labels kept), or an empty
        DataFrame when ``sampling_num`` is 0.
    """
    sampled_frames = []
    for i in tqdm(range(sampling_num)):
        picked = fast_groupby_sampling_idx(train_df, groupby_cols=['gameID', 'inning', 'O'], sample_size=1, seed=i)
        # The sampler returns positions, so select with iloc; copy so that the
        # new column is not written into a view of train_df.
        new_train_df_sub = train_df.iloc[picked].copy()
        new_train_df_sub['subgameID'] = ((new_train_df_sub['gameID']*100).astype(str) + str(i).zfill(2)).astype(float)
        sampled_frames.append(new_train_df_sub)

    if not sampled_frames:
        return pd.DataFrame()
    # DataFrame.append was removed in pandas 2.0; a single concat also avoids
    # re-copying the accumulated frame on every round.
    return pd.concat(sampled_frames)

def create_pre_forward_group_features(df, groupby_cols, target_col):
    """Add lag/lead features of ``target_col`` within each group.

    New columns are named ``{target_col}_{'_'.join(groupby_cols)}_<suffix>``:

    * ``pre1``, ``pre2``, ``forward1``, ``forward2``: the value 1 or 2 rows
      earlier / later in the same group.
    * ``diff_pre1``, ``diff_pre2``: current value compared with pre1 / pre2;
      ``diff_pre3``: pre1 compared with pre2; likewise for ``diff_forward*``.

    For the listed integer/float dtypes "compared with" is a subtraction;
    otherwise both sides are cast to ``str`` and the result is 1 when the
    left side sorts before the right side, else 0.
    """
    prefix = target_col + '_' + '_'.join(groupby_cols)

    def column(suffix):
        return f'{prefix}_{suffix}'

    for suffix, periods in (('pre1', 1), ('pre2', 2), ('forward1', -1), ('forward2', -2)):
        df[column(suffix)] = df.groupby(groupby_cols)[target_col].shift(periods)

    is_numeric = df[target_col].dtype in ['int8', 'int16', 'int32', 'int64', 'float32', 'float64']

    def compare(left, right):
        if is_numeric:
            return df[left] - df[right]
        as_text = df[[left, right]].astype(str)
        return as_text.apply(lambda row: 1 if row[left] < row[right] else 0, axis=1)

    for direction in ('pre', 'forward'):
        first, second = column(f'{direction}1'), column(f'{direction}2')
        df[column(f'diff_{direction}1')] = compare(target_col, first)
        df[column(f'diff_{direction}2')] = compare(target_col, second)
        df[column(f'diff_{direction}3')] = compare(first, second)

    return df


def has_9thbottom(df):
    """Add ``has_9thbottom``: 1 if the sub-game contains a '9回裏' row, else 0."""
    innings_per_game = df.groupby(['subgameID'])['inning'].unique()
    flags = (
        innings_per_game
        .map(lambda innings: 1 if '9回裏' in innings else 0)
        .reset_index()
        .rename(columns={'inning': 'has_9thbottom'})
    )
    return df.merge(flags, on=['subgameID'], how='left')

def has_out(df):
    """Add ``has_out2`` and ``has_out1`` per (subgameID, inning).

    Each flag is 1 when the group's ``O`` values include 2 (respectively 1),
    otherwise 0.
    """
    keys = ['subgameID', 'inning']
    for out_count, flag_name in ((2, 'has_out2'), (1, 'has_out1')):
        flags = (
            df.groupby(keys)['O']
            .unique()
            .map(lambda outs, wanted=out_count: 1 if wanted in outs else 0)
            .reset_index()
            .rename(columns={'O': flag_name})
        )
        df = df.merge(flags, on=keys, how='left')
    return df

def create_pitching_order(df):
    """Add ``pitching_order_in_subgameID``.

    Within each (subgameID, pitcherTeam) the distinct pitchers are numbered
    from 1 in order of first appearance, and that number is joined back onto
    every row of the pitcher.
    """
    keys = ['subgameID', 'pitcherTeam']
    appearances = df.groupby(keys)['pitcher'].unique().explode().reset_index()
    appearances['pitching_order_in_subgameID'] = appearances.groupby(keys).cumcount() + 1
    return df.merge(appearances, on=keys + ['pitcher'], how='left')

def create_batting_order(df):
    """Add ``batting_order_in_subgameID``.

    Within each (subgameID, batterTeam) the distinct batters are numbered
    from 1 in order of first appearance, and that number is joined back onto
    every row of the batter.
    """
    keys = ['subgameID', 'batterTeam']
    appearances = df.groupby(keys)['batter'].unique().explode().reset_index()
    appearances['batting_order_in_subgameID'] = appearances.groupby(keys).cumcount() + 1
    return df.merge(appearances, on=keys + ['batter'], how='left')

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA, NMF, TruncatedSVD

def _build_compressor(name, n_components):
    """Return a fresh decomposition model for ``name`` ('pca', 'nmf' or 'svd')."""
    if name == 'pca':
        return PCA(n_components=n_components, random_state=42)
    if name == 'nmf':
        return NMF(n_components=n_components, random_state=42)
    if name == 'svd':
        return TruncatedSVD(n_components=n_components, random_state=42)
    # Fail loudly: an unknown name must not fall through and reuse the
    # compressor left over from a previous loop iteration.
    raise ValueError(f"unknown compressor {name!r}; expected 'pca', 'nmf' or 'svd'")


def _add_tfidf_features(df, group_cols, token_col, prefix, n_components, compressors, token_format=None):
    """Shared implementation of the ``create_*_tfidf`` functions.

    Each ``group_cols`` group becomes one document whose words are the
    group's ``token_col`` values (joined as-is, or rendered through
    ``token_format`` when given). The documents are TF-IDF vectorised,
    min-max scaled, reduced by each requested compressor, and the components
    are merged back onto ``df`` as ``{prefix}_{compressor}_{n}``.
    """
    docs = df.groupby(group_cols)[token_col].agg(list).reset_index()
    if token_format is None:
        docs[token_col] = docs[token_col].map(lambda x: ' '.join(x))
    else:
        docs[token_col] = docs[token_col].map(lambda x: ' '.join(map(token_format, x)))

    vectorizer = TfidfVectorizer()
    input_x = vectorizer.fit_transform(docs[token_col].values)
    input_x = pd.DataFrame(input_x.toarray())
    input_x = MinMaxScaler().fit_transform(input_x)

    for c in compressors:
        compressor = _build_compressor(c, n_components)
        compressed = compressor.fit_transform(input_x)
        compressed_df = pd.DataFrame(compressed, columns=[f'{prefix}_{c}_{n}' for n in range(n_components)])
        # Keep only the group keys next to the components before merging.
        df_g_compressed = pd.concat([docs, compressed_df], axis=1)
        df_g_compressed.drop(columns=[token_col], inplace=True)
        df = df.merge(df_g_compressed, on=group_cols, how='left')

    return df


def create_batter_tfidf(df, n_components=5, compressors=('pca',)):
    """Add ``batter_tfidf_*`` features from the batters of each (subgameID, batterTeam)."""
    return _add_tfidf_features(
        df, ['subgameID', 'batterTeam'], 'batter', 'batter_tfidf',
        n_components, compressors,
    )


def create_pitcher_tfidf(df, n_components=5, compressors=('pca',)):
    """Add ``pitcher_tfidf_*`` features from the pitchers of each (subgameID, pitcherTeam)."""
    return _add_tfidf_features(
        df, ['subgameID', 'pitcherTeam'], 'pitcher', 'pitcher_tfidf',
        n_components, compressors,
    )


def create_batting_order_tfidf(df, n_components=5, compressors=('pca',)):
    """Add ``batting_order_tfidf_*`` features from the batting orders of each (subgameID, batterTeam)."""
    return _add_tfidf_features(
        df, ['subgameID', 'batterTeam'], 'batting_order_in_subgameID', 'batting_order_tfidf',
        n_components, compressors, token_format='order{}'.format,
    )


def create_pitching_order_tfidf(df, n_components=5, compressors=('pca',)):
    """Add ``pitching_order_tfidf_*`` features from the pitching orders of each (subgameID, pitcherTeam)."""
    return _add_tfidf_features(
        df, ['subgameID', 'pitcherTeam'], 'pitching_order_in_subgameID', 'pitching_order_tfidf',
        n_components, compressors, token_format='order{}'.format,
    )


def create_batting_order_inning_tfidf(df, n_components=5, compressors=('pca',)):
    """Add ``batting_order_inning_tfidf_*`` features from the batting orders of each (subgameID, inning_num)."""
    return _add_tfidf_features(
        df, ['subgameID', 'inning_num'], 'batting_order_in_subgameID', 'batting_order_inning_tfidf',
        n_components, compressors, token_format='order{}'.format,
    )


def create_pitching_order_inning_tfidf(df, n_components=5, compressors=('pca',)):
    """Add ``pitching_order_inning_tfidf_*`` features from the pitching orders of each (subgameID, inning_num)."""
    return _add_tfidf_features(
        df, ['subgameID', 'inning_num'], 'pitching_order_in_subgameID', 'pitching_order_inning_tfidf',
        n_components, compressors, token_format='order{}'.format,
    )

# Load the combined train/test frame and the submission template.
df, sample_submission = read_data(INPUT_DIR)

def create_features(df, sampling_num=5):
    """Run the feature pipeline on the combined train/test frame.

    Steps, in order: base features and hand imputation; resampling of the
    training rows into ``sampling_num`` sub-games (test rows keep one
    sub-game per game); lag/lead, order and TF-IDF features computed per
    sub-game; and combined out/count encodings.
    """
    # Row-level preprocessing on the full frame.
    for step in (create_base_features, fillna_pitcherHand, batter_isPitcher, fillna_batterHand):
        df = step(df)

    ### Sampling
    train_part = df[df['test']==0].reset_index(drop=True)
    test_part = df[df['test']==1].reset_index(drop=True)
    train_part = sampling(train_part, sampling_num)
    test_part['subgameID'] = (test_part['gameID'] * 100).astype(float)
    df = pd.concat([train_part, test_part], ignore_index=True)

    ### After Sampling
    df = create_pre_forward_group_features(df, groupby_cols=['subgameID', 'inning_num'], target_col='base_all')
    for step in (has_9thbottom, has_out, create_pitching_order, create_batting_order):
        df = step(df)

    lag_specs = (
        (['subgameID', 'batterCommon'], 'pitcher'),
        (['subgameID', 'batterTeam'], 'pitching_order_in_subgameID'),
        (['subgameID', 'batterTeam'], 'batting_order_in_subgameID'),
    )
    for keys, target in lag_specs:
        df = create_pre_forward_group_features(df, groupby_cols=keys, target_col=target)

    tfidf_specs = (
        (create_pitcher_tfidf, 30, 'nmf'),
        (create_pitching_order_tfidf, 10, 'nmf'),
        (create_batting_order_tfidf, 10, 'nmf'),
        (create_pitching_order_inning_tfidf, 3, 'pca'),
        (create_batting_order_inning_tfidf, 3, 'pca'),
    )
    for tfidf_step, size, method in tfidf_specs:
        df = tfidf_step(df, n_components=size, compressors=[method])

    # Combine the cumulative out count with the count / base encodings.
    outs = df['out_cumsum']
    df['out_cumsum_BS'] = df['BS'] + outs * 100
    df['out_cumsum_BSO'] = df['BS'] + outs * 1000
    df['out_cumsum_base_all'] = df['base_all'] + outs * 1000
    df = create_pre_forward_group_features(df, groupby_cols=['subgameID', 'inning_num'], target_col='out_cumsum_BS')
    return df


# Build all features, using the number of resampling rounds from the config.
df = create_features(df, sampling_num=CFG.sampling_num)
# Print the column/dtype summary of the resulting frame.
df.info()

train shape = (20400, 24)
test shape = (33808, 14)
sample_submission shape = (33808, 2)

  0%|          | 0/10 [00:00<?, ?it/s]

<class 'pandas.core.frame.DataFrame'>
Int64Index: 63728 entries, 0 to 63727
Columns: 160 entries, id to out_cumsum_BS_subgameID_inning_num_diff_forward3
dtypes: bool(3), datetime64[ns](1), float64(107), int64(25), object(24)
memory usage: 77.0+ MB

# Columns with object or bool dtype; these are the ones passed to label_encoding().
cat_feats = [c for c in df.columns if df[c].dtype in ['object', 'bool']]

# Column names collected for exclusion; the KS-test filter further below
# appends more names to this list.
drop_feats = [
'id',
'gameID',
'inning', 
'subgameID',
'pitchType',
'speed',
'ballPositionLabel',
'ballX',
'ballY',
'dir',
'dist',
'battingType',
'isOuts',
'y', 
'test',
'startDayTime',
'startTime',
'pitcher',
'batter',
'bgTop',
'bgBottom',
'place',
'batterHand',
'totalPitchingCount',
]

from sklearn.preprocessing import LabelEncoder

def label_encoding(df, cat_feats):
    """Label-encode the given columns in place.

    Missing values are first replaced by the string ``'nan'``; every column
    is then cast to ``str`` and encoded with its own ``LabelEncoder``.

    Returns:
        (df, encoders): the modified frame and a dict mapping each column
        name to its fitted encoder.
    """
    df[cat_feats] = df[cat_feats].fillna('nan')

    encoders = {}
    for col in cat_feats:
        encoder = LabelEncoder()
        df[col] = encoder.fit_transform(df[col].astype(str))
        encoders[col] = encoder
    return df, encoders

# Encode every object/bool column and keep the fitted encoders by column name.
df, labelenc_instances = label_encoding(df, cat_feats)
print (labelenc_instances.keys())

dict_keys(['b1', 'b2', 'b3', 'pitcher', 'batter', 'batterHand', 'inning', 'pitchType', 'speed', 'ballPositionLabel', 'ballY', 'dir', 'battingType', 'isOuts', 'startTime', 'bottomTeam', 'topTeam', 'date', 'pitcherTeam', 'batterTeam', 'pitcherCommon', 'batterCommon', 'pitcherHand', 'pitcher_subgameID_batterCommon_pre1', 'pitcher_subgameID_batterCommon_pre2', 'pitcher_subgameID_batterCommon_forward1', 'pitcher_subgameID_batterCommon_forward2'])

def agg(df, agg_cols):
    """Add group-wise aggregate features to ``df`` in place.

    Args:
        df: frame to extend.
        agg_cols: iterable of dicts with keys ``'groupby'`` (list of column
            names), ``'target'`` (column name) and ``'agg'``. ``'agg'`` is
            either a name accepted by ``GroupBy.transform`` or one of
            ``'mean_diff'``, ``'mean_ratio'``, ``'median_diff'``,
            ``'median_ratio'`` (group statistic minus the row value, or the
            group statistic divided by ``1 + row value``).

    Returns:
        (df, new_cols): the frame, and the names of the columns that did not
        exist before the call, in the order they appear in ``df``.
    """
    existing_cols = set(df.columns)
    for spec in tqdm(agg_cols):
        keys, how, target = spec['groupby'], spec['agg'], spec['target']
        new_feature = '{}_{}_{}'.format('_'.join(keys), how, target)
        grouped = df.groupby(keys)[target]

        if how == 'mean_diff':
            df[new_feature] = grouped.transform('mean') - df[target]
        elif how == 'mean_ratio':
            df[new_feature] = grouped.transform('mean') / (1 + df[target])
        elif how == 'median_diff':
            df[new_feature] = grouped.transform('median') - df[target]
        elif how == 'median_ratio':
            df[new_feature] = grouped.transform('median') / (1 + df[target])
        else:
            df[new_feature] = grouped.transform(how)

    # Preserve column order: a plain set difference has arbitrary order, which
    # makes anything fitted on these columns (scaling, PCA) vary between runs.
    new_cols = [c for c in df.columns if c not in existing_cols]
    return df, new_cols

def create_agg_feature(df, groupby_cols, target_cols, aggs):
    """Compute every (groupby, target, aggregation) combination via ``agg``.

    Combinations are generated with ``groupby_cols`` as the outermost loop
    and ``aggs`` as the innermost.

    Returns:
        (df, new_cols) exactly as returned by ``agg``.
    """
    specs = [
        {'groupby': group_keys, 'target': target, 'agg': how}
        for group_keys in groupby_cols
        for target in target_cols
        for how in aggs
    ]
    df, new_cols = agg(df, specs)
    return df, new_cols

from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA

# First aggregation batch: statistics of the base-runner flags and the pitch
# count, grouped within each sub-game by pitcher, by batter, and by the
# pitcher-hand / batter-hand pairing.
groupby_cols = [
    ['subgameID', 'pitcherCommon'],
    ['subgameID', 'batterCommon'],
    ['subgameID', 'pitcherHand', 'batterHand'],
]
target_cols = [
    'b1', 
    'b2', 
    'b3', 
    'totalPitchingCount',
]
aggs = [
    'mean',
    'std',
    'skew',
    'median',
    'mean_diff',
    'mean_ratio',
]
df, new_cols = create_agg_feature(df, groupby_cols, target_cols, aggs)

# Compress the new aggregate columns into 20 PCA components (pca1_*).
# Missing values are replaced by 0 and the columns are min-max scaled first.
input_x = df[new_cols].fillna(0)
mms = MinMaxScaler()
input_x = mms.fit_transform(input_x)
n_components = 20
pca = PCA(n_components=n_components, random_state=42)
transformed = pca.fit_transform(input_x)
pca_df = pd.DataFrame(transformed, columns=[f'pca1_{n}' for n in range(n_components)])
# concat(axis=1) aligns on the index.
# assumes df has a default 0..n-1 index at this point - TODO confirm
df = pd.concat([df, pca_df], axis=1)

from scipy.stats import ks_2samp
# Flag aggregate features whose train and test distributions differ: a
# feature is marked when the two-sample KS statistic between the train rows
# (test == 0) and the test rows (test == 1) exceeds 0.03.
# NOTE(review): the values are passed to ks_2samp without the fillna(0) that
# the PCA input gets - confirm how missing values should be treated here.
diff_feats = []
for c in new_cols:
    d1 = df[df['test']==0][c].values
    d2 = df[df['test']==1][c].values
    s = ks_2samp(d1, d2).statistic
    if s > 0.03:
        diff_feats.append(c)

# Add the flagged features to the exclusion list, skipping names already there.
for c in diff_feats:
    if not c in drop_feats:
        drop_feats.append(c)

        
        
# Second aggregation batch: per-batter statistics of the batting order.
groupby_cols = [
    ['batterCommon',],
]
target_cols = [
    'batting_order_in_subgameID'
]
aggs = [
    'mean',
    'std',
    'skew',
    'median',
    'mean_diff',
    'mean_ratio',
    'median_diff',
]
df, new_cols = create_agg_feature(df, groupby_cols, target_cols, aggs)


# Compress this batch into 3 PCA components (pca2_*), with the same
# fill-with-0 and min-max scaling as the first batch.
input_x = df[new_cols].fillna(0)
mms = MinMaxScaler()
input_x = mms.fit_transform(input_x)
n_components = 3
pca = PCA(n_components=n_components, random_state=42)
transformed = pca.fit_transform(input_x)
pca_df = pd.DataFrame(transformed, columns=[f'pca2_{n}' for n in range(n_components)])
df = pd.concat([df, pca_df], axis=1)

df.info()

  0%|          | 0/72 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

<class 'pandas.core.frame.DataFrame'>
Int64Index: 63728 entries, 0 to 63727
Columns: 262 entries, id to pca2_2
dtypes: datetime64[ns](1), float64(209), int64(52)
memory usage: 127.9 MB

# Split the feature frame back into train and test rows using the 'test' flag.
train_df = df[df['test']==0].reset_index(drop=True)
test_df = df[df['test']==1].reset_index(drop=True)
print ('train_df.shape={}, test_df.shape={}'.format(train_df.shape, test_df.shape))

train_df.shape=(29920, 262), test_df.shape=(33808, 262)

from sklearn.model_selection import KFold, StratifiedKFold, GroupKFold

class RandomGroupKFold:
    """K-fold splitter that keeps all rows of a group in the same fold.

    The distinct group ids are split with ``KFold`` and each fold is expanded
    back to the positions of the rows belonging to those groups.
    """

    def __init__(self, n_splits=4, shuffle=True, random_state=42):
        self.n_splits = n_splits
        self.shuffle = shuffle
        self.random_state = random_state

    def get_n_splits(self, X=None, y=None, groups=None):
        """Return the configured number of folds."""
        return self.n_splits

    def split(self, X=None, y=None, groups=None):
        """Yield ``(train_idx, val_idx)`` position arrays for each fold.

        ``groups`` must provide ``unique()`` and ``isin()`` (as a pandas
        Series does); ``X`` and ``y`` are not used.
        """
        group_splitter = KFold(
            n_splits=self.n_splits,
            shuffle=self.shuffle,
            random_state=self.random_state,
        )
        group_ids = groups.unique()
        for fit_pos, holdout_pos in group_splitter.split(group_ids):
            # Translate the selected group ids back into row positions.
            in_fit = groups.isin(group_ids[fit_pos])
            in_holdout = groups.isin(group_ids[holdout_pos])
            yield np.where(in_fit)[0], np.where(in_holdout)[0]


def create_folds(df, how_split, seeds, n_splits, target_col, group_col):
    """Add one ``fold_{seed}`` column per seed holding validation fold numbers.

    Each column starts at 9999 and is overwritten with the fold index for the
    rows in that fold's validation part. ``how_split`` selects ``'KFold'`` or
    ``'RandomGroupKFold'`` (grouped by ``group_col``); for any other value the
    column keeps 9999. ``df`` is modified in place and returned.
    """
    for seed in seeds:
        fold_col = f'fold_{seed}'
        df[fold_col] = 9999

        if how_split == 'KFold':
            splits = KFold(n_splits=n_splits, random_state=seed, shuffle=True).split(df)
        elif how_split == 'RandomGroupKFold':
            splitter = RandomGroupKFold(n_splits=n_splits, random_state=seed)
            splits = splitter.split(df, df[target_col], df[group_col])
        else:
            continue

        for fold, (_, valid_idx) in enumerate(splits):
            df.loc[df.index[valid_idx], fold_col] = fold

    return df



# Assign validation folds to the training rows, one fold column per seed,
# using the split settings from the config.
train_df = create_folds(
    df=train_df,
    how_split=CFG.how_split,
    seeds=CFG.seeds, 
    n_splits=CFG.n_splits, 
    target_col=CFG.target, 
    group_col=CFG.group_col
)
# Bare expression: displays the frame as the notebook cell output.
train_df

	id	totalPitchingCount	B	S	O	b1	b2	b3	pitcher	batter	batterHand	gameID	inning	pitchType	speed	ballPositionLabel	ballX	ballY	dir	dist	battingType	isOuts	y	test	startTime	bottomTeam	bgBottom	topTeam	place	startDayTime	bgTop	date	BS	BSO	inning_num	inning_top	inning_num_half	out_cumsum	pitcherTeam	batterTeam	pitcherCommon	batterCommon	base_all	pitcherHand	batter_isPitcher	subgameID	base_all_subgameID_inning_num_pre1	base_all_subgameID_inning_num_pre2	base_all_subgameID_inning_num_forward1	base_all_subgameID_inning_num_forward2	base_all_subgameID_inning_num_diff_pre1	base_all_subgameID_inning_num_diff_pre2	base_all_subgameID_inning_num_diff_pre3	base_all_subgameID_inning_num_diff_forward1	base_all_subgameID_inning_num_diff_forward2	base_all_subgameID_inning_num_diff_forward3	has_9thbottom	has_out2	has_out1	pitching_order_in_subgameID	batting_order_in_subgameID	pitcher_subgameID_batterCommon_pre1	pitcher_subgameID_batterCommon_pre2	pitcher_subgameID_batterCommon_forward1	pitcher_subgameID_batterCommon_forward2	pitcher_subgameID_batterCommon_diff_pre1	pitcher_subgameID_batterCommon_diff_pre2	pitcher_subgameID_batterCommon_diff_pre3	pitcher_subgameID_batterCommon_diff_forward1	pitcher_subgameID_batterCommon_diff_forward2	pitcher_subgameID_batterCommon_diff_forward3	pitching_order_in_subgameID_subgameID_batterTeam_pre1	pitching_order_in_subgameID_subgameID_batterTeam_pre2	pitching_order_in_subgameID_subgameID_batterTeam_forward1	pitching_order_in_subgameID_subgameID_batterTeam_forward2	pitching_order_in_subgameID_subgameID_batterTeam_diff_pre1	pitching_order_in_subgameID_subgameID_batterTeam_diff_pre2	pitching_order_in_subgameID_subgameID_batterTeam_diff_pre3	pitching_order_in_subgameID_subgameID_batterTeam_diff_forward1	pitching_order_in_subgameID_subgameID_batterTeam_diff_forward2	pitching_order_in_subgameID_subgameID_batterTeam_diff_forward3	batting_order_in_subgameID_subgameID_batterTeam_pre1	batting_order_in_subgameID_subgameID_batterTeam_pre2	
batting_order_in_subgameID_subgameID_batterTeam_forward1	batting_order_in_subgameID_subgameID_batterTeam_forward2	batting_order_in_subgameID_subgameID_batterTeam_diff_pre1	batting_order_in_subgameID_subgameID_batterTeam_diff_pre2	batting_order_in_subgameID_subgameID_batterTeam_diff_pre3	batting_order_in_subgameID_subgameID_batterTeam_diff_forward1	batting_order_in_subgameID_subgameID_batterTeam_diff_forward2	batting_order_in_subgameID_subgameID_batterTeam_diff_forward3	pitcher_tfidf_nmf_0	pitcher_tfidf_nmf_1	pitcher_tfidf_nmf_2	pitcher_tfidf_nmf_3	pitcher_tfidf_nmf_4	pitcher_tfidf_nmf_5	pitcher_tfidf_nmf_6	pitcher_tfidf_nmf_7	pitcher_tfidf_nmf_8	pitcher_tfidf_nmf_9	pitcher_tfidf_nmf_10	pitcher_tfidf_nmf_11	pitcher_tfidf_nmf_12	pitcher_tfidf_nmf_13	pitcher_tfidf_nmf_14	pitcher_tfidf_nmf_15	pitcher_tfidf_nmf_16	pitcher_tfidf_nmf_17	pitcher_tfidf_nmf_18	pitcher_tfidf_nmf_19	pitcher_tfidf_nmf_20	pitcher_tfidf_nmf_21	pitcher_tfidf_nmf_22	pitcher_tfidf_nmf_23	pitcher_tfidf_nmf_24	pitcher_tfidf_nmf_25	pitcher_tfidf_nmf_26	pitcher_tfidf_nmf_27	pitcher_tfidf_nmf_28	pitcher_tfidf_nmf_29	pitching_order_tfidf_nmf_0	pitching_order_tfidf_nmf_1	pitching_order_tfidf_nmf_2	pitching_order_tfidf_nmf_3	pitching_order_tfidf_nmf_4	pitching_order_tfidf_nmf_5	pitching_order_tfidf_nmf_6	pitching_order_tfidf_nmf_7	pitching_order_tfidf_nmf_8	pitching_order_tfidf_nmf_9	batting_order_tfidf_nmf_0	batting_order_tfidf_nmf_1	batting_order_tfidf_nmf_2	batting_order_tfidf_nmf_3	batting_order_tfidf_nmf_4	batting_order_tfidf_nmf_5	batting_order_tfidf_nmf_6	batting_order_tfidf_nmf_7	batting_order_tfidf_nmf_8	batting_order_tfidf_nmf_9	pitching_order_inning_tfidf_pca_0	pitching_order_inning_tfidf_pca_1	pitching_order_inning_tfidf_pca_2	batting_order_inning_tfidf_pca_0	batting_order_inning_tfidf_pca_1	batting_order_inning_tfidf_pca_2	out_cumsum_BS	out_cumsum_BSO	out_cumsum_base_all	out_cumsum_BS_subgameID_inning_num_pre1	out_cumsum_BS_subgameID_inning_num_pre2	out_cumsum_BS_subgameID_inning_num_forward1	
out_cumsum_BS_subgameID_inning_num_forward2	out_cumsum_BS_subgameID_inning_num_diff_pre1	out_cumsum_BS_subgameID_inning_num_diff_pre2	out_cumsum_BS_subgameID_inning_num_diff_pre3	out_cumsum_BS_subgameID_inning_num_diff_forward1	out_cumsum_BS_subgameID_inning_num_diff_forward2	out_cumsum_BS_subgameID_inning_num_diff_forward3	subgameID_pitcherCommon_mean_b1	subgameID_pitcherCommon_std_b1	subgameID_pitcherCommon_skew_b1	subgameID_pitcherCommon_median_b1	subgameID_pitcherCommon_mean_diff_b1	subgameID_pitcherCommon_mean_ratio_b1	subgameID_pitcherCommon_mean_b2	subgameID_pitcherCommon_std_b2	subgameID_pitcherCommon_skew_b2	subgameID_pitcherCommon_median_b2	subgameID_pitcherCommon_mean_diff_b2	subgameID_pitcherCommon_mean_ratio_b2	subgameID_pitcherCommon_mean_b3	subgameID_pitcherCommon_std_b3	subgameID_pitcherCommon_skew_b3	subgameID_pitcherCommon_median_b3	subgameID_pitcherCommon_mean_diff_b3	subgameID_pitcherCommon_mean_ratio_b3	subgameID_pitcherCommon_mean_totalPitchingCount	subgameID_pitcherCommon_std_totalPitchingCount	subgameID_pitcherCommon_skew_totalPitchingCount	subgameID_pitcherCommon_median_totalPitchingCount	subgameID_pitcherCommon_mean_diff_totalPitchingCount	subgameID_pitcherCommon_mean_ratio_totalPitchingCount	subgameID_batterCommon_mean_b1	subgameID_batterCommon_std_b1	subgameID_batterCommon_skew_b1	subgameID_batterCommon_median_b1	subgameID_batterCommon_mean_diff_b1	subgameID_batterCommon_mean_ratio_b1	subgameID_batterCommon_mean_b2	subgameID_batterCommon_std_b2	subgameID_batterCommon_skew_b2	subgameID_batterCommon_median_b2	subgameID_batterCommon_mean_diff_b2	subgameID_batterCommon_mean_ratio_b2	subgameID_batterCommon_mean_b3	subgameID_batterCommon_std_b3	subgameID_batterCommon_skew_b3	subgameID_batterCommon_median_b3	subgameID_batterCommon_mean_diff_b3	subgameID_batterCommon_mean_ratio_b3	subgameID_batterCommon_mean_totalPitchingCount	subgameID_batterCommon_std_totalPitchingCount	subgameID_batterCommon_skew_totalPitchingCount	
subgameID_batterCommon_median_totalPitchingCount	subgameID_batterCommon_mean_diff_totalPitchingCount	subgameID_batterCommon_mean_ratio_totalPitchingCount	subgameID_pitcherHand_batterHand_mean_b1	subgameID_pitcherHand_batterHand_std_b1	subgameID_pitcherHand_batterHand_skew_b1	subgameID_pitcherHand_batterHand_median_b1	subgameID_pitcherHand_batterHand_mean_diff_b1	subgameID_pitcherHand_batterHand_mean_ratio_b1	subgameID_pitcherHand_batterHand_mean_b2	subgameID_pitcherHand_batterHand_std_b2	subgameID_pitcherHand_batterHand_skew_b2	subgameID_pitcherHand_batterHand_median_b2	subgameID_pitcherHand_batterHand_mean_diff_b2	subgameID_pitcherHand_batterHand_mean_ratio_b2	subgameID_pitcherHand_batterHand_mean_b3	subgameID_pitcherHand_batterHand_std_b3	subgameID_pitcherHand_batterHand_skew_b3	subgameID_pitcherHand_batterHand_median_b3	subgameID_pitcherHand_batterHand_mean_diff_b3	subgameID_pitcherHand_batterHand_mean_ratio_b3	subgameID_pitcherHand_batterHand_mean_totalPitchingCount	subgameID_pitcherHand_batterHand_std_totalPitchingCount	subgameID_pitcherHand_batterHand_skew_totalPitchingCount	subgameID_pitcherHand_batterHand_median_totalPitchingCount	subgameID_pitcherHand_batterHand_mean_diff_totalPitchingCount	subgameID_pitcherHand_batterHand_mean_ratio_totalPitchingCount	pca1_0	pca1_1	pca1_2	pca1_3	pca1_4	pca1_5	pca1_6	pca1_7	pca1_8	pca1_9	pca1_10	pca1_11	pca1_12	pca1_13	pca1_14	pca1_15	pca1_16	pca1_17	pca1_18	pca1_19	batterCommon_mean_batting_order_in_subgameID	batterCommon_std_batting_order_in_subgameID	batterCommon_skew_batting_order_in_subgameID	batterCommon_median_batting_order_in_subgameID	batterCommon_mean_diff_batting_order_in_subgameID	batterCommon_mean_ratio_batting_order_in_subgameID	batterCommon_median_diff_batting_order_in_subgameID	pca2_0	pca2_1	pca2_2	fold_2021	fold_2022	fold_2023	fold_2024	fold_2025
0	16274	1	0	0	0	0	0	0	326	55	1	20202116	0	6	48	8	1.0	9	26	NaN	5	2	0.0	0	5	10	7	1	2	2020-06-30 18:00:00	11	9	0	0	1.0	1	0.0	-3.0	10	1	192	49	0	1	0.0	2.020212e+11	NaN	NaN	0.0	0.0	NaN	NaN	NaN	0.0	0.0	0.0	0	1	1	1	1	0	0	325	0	0	0	0	0	0	0	NaN	NaN	1.0	1.0	NaN	NaN	NaN	0.0	0.0	0.0	NaN	NaN	2.0	3.0	NaN	NaN	NaN	-1.0	-2.0	-1.0	0.0	0.0	0.0	0.312684	0.0	0.00000	0.0	0.0	0.000000	0.0	0.0	0.000000	0.0	0.0	0.0	0.0	0.000000	0.0	0.0	0.379715	0.0	0.0	0.0	0.0	0.000000	0.0	0.0	0.0	0.000000	0.000000	0.000036	0.0	0.000016	0.0	0.0	0.377299	0.091242	0.671743	0.419980	0.105919	0.065265	0.000000	0.067563	0.085841	0.079291	0.048873	0.125847	0.120079	0.032198	0.102958	-0.416435	0.003972	-0.012131	-0.665798	-0.377505	-0.071522	-300.0	-3000.0	-3000.0	NaN	NaN	-199.0	-100.0	NaN	NaN	NaN	-101.0	-200.0	-99.0	0.055556	0.235702	4.242641	0.0	0.055556	0.055556	0.055556	0.235702	4.242641	0.0	0.055556	0.055556	0.055556	0.235702	4.242641	0.0	0.055556	0.055556	2.166667	1.424574	1.045316	2.0	1.166667	1.083333	0.000000	0.00000	NaN	0.0	0.000000	0.000000	0.000000	0.00000	NaN	0.0	0.000000	0.000000	0.000000	0.00000	NaN	0.0	0.000000	0.000000	2.500000	2.121320	NaN	2.5	1.500000	1.250000	0.210526	0.418854	1.544832	0.0	0.210526	0.210526	0.105263	0.315302	2.798440	0.0	0.105263	0.105263	0.105263	0.315302	2.798440	0.0	0.105263	0.105263	2.157895	1.708253	2.491734	2.0	1.157895	1.078947	-0.644311	0.126719	0.359370	-0.008018	-0.264578	0.309850	0.173553	0.056066	-0.196114	-0.055497	0.011138	-0.193440	-0.013806	0.055583	0.082012	-0.100059	-0.139652	-0.105892	-0.037840	-0.070791	3.948571	2.589971	0.512757	4.0	2.948571	1.974286	3.0	-0.013972	0.314364	0.056629	4	1	0	4	3
1	16279	2	1	0	1	0	0	0	326	142	1	20202116	0	6	48	9	8.0	3	26	NaN	5	2	1.0	0	5	10	7	1	2	2020-06-30 18:00:00	11	9	1	101	1.0	1	0.0	-2.0	10	1	192	104	0	1	0.0	2.020212e+11	0.0	NaN	0.0	NaN	0.0	NaN	NaN	0.0	NaN	NaN	0	1	1	1	2	0	0	325	321	0	0	0	0	0	0	1.0	NaN	1.0	1.0	0.0	NaN	NaN	0.0	0.0	0.0	1.0	NaN	3.0	4.0	1.0	NaN	NaN	-1.0	-2.0	-1.0	0.0	0.0	0.0	0.312684	0.0	0.00000	0.0	0.0	0.000000	0.0	0.0	0.000000	0.0	0.0	0.0	0.0	0.000000	0.0	0.0	0.379715	0.0	0.0	0.0	0.0	0.000000	0.0	0.0	0.0	0.000000	0.000000	0.000036	0.0	0.000016	0.0	0.0	0.377299	0.091242	0.671743	0.419980	0.105919	0.065265	0.000000	0.067563	0.085841	0.079291	0.048873	0.125847	0.120079	0.032198	0.102958	-0.416435	0.003972	-0.012131	-0.665798	-0.377505	-0.071522	-199.0	-1999.0	-2000.0	-300.0	NaN	-100.0	NaN	101.0	NaN	NaN	-99.0	NaN	NaN	0.055556	0.235702	4.242641	0.0	0.055556	0.055556	0.055556	0.235702	4.242641	0.0	0.055556	0.055556	0.055556	0.235702	4.242641	0.0	0.055556	0.055556	2.166667	1.424574	1.045316	2.0	0.166667	0.722222	0.250000	0.50000	2.000000	0.0	0.250000	0.250000	0.000000	0.00000	0.000000	0.0	0.000000	0.000000	0.250000	0.50000	2.000000	0.0	0.250000	0.250000	2.000000	0.000000	0.000000	2.0	0.000000	0.666667	0.210526	0.418854	1.544832	0.0	0.210526	0.210526	0.105263	0.315302	2.798440	0.0	0.105263	0.105263	0.105263	0.315302	2.798440	0.0	0.105263	0.105263	2.157895	1.708253	2.491734	2.0	0.157895	0.719298	0.022471	-0.115130	0.580655	-0.785461	-0.198672	-0.000363	0.009649	0.055541	0.113648	-0.131829	-0.134186	0.010176	0.081437	-0.110041	-0.010822	-0.017334	-0.358337	-0.029248	-0.131575	-0.143174	5.296588	2.519130	-0.257588	6.0	3.296588	1.765529	4.0	0.253080	0.286711	0.044343	4	1	0	4	3
2	16281	1	0	0	2	0	0	0	326	105	0	20202116	0	3	24	5	7.0	7	26	NaN	5	2	0.0	0	5	10	7	1	2	2020-06-30 18:00:00	11	9	0	200	1.0	1	0.0	-1.0	10	1	192	79	0	1	0.0	2.020212e+11	0.0	0.0	NaN	NaN	0.0	0.0	0.0	NaN	NaN	NaN	0	1	1	1	3	0	0	325	321	0	0	0	0	0	0	1.0	1.0	1.0	1.0	0.0	0.0	0.0	0.0	0.0	0.0	2.0	1.0	4.0	5.0	1.0	2.0	1.0	-1.0	-2.0	-1.0	0.0	0.0	0.0	0.312684	0.0	0.00000	0.0	0.0	0.000000	0.0	0.0	0.000000	0.0	0.0	0.0	0.0	0.000000	0.0	0.0	0.379715	0.0	0.0	0.0	0.0	0.000000	0.0	0.0	0.0	0.000000	0.000000	0.000036	0.0	0.000016	0.0	0.0	0.377299	0.091242	0.671743	0.419980	0.105919	0.065265	0.000000	0.067563	0.085841	0.079291	0.048873	0.125847	0.120079	0.032198	0.102958	-0.416435	0.003972	-0.012131	-0.665798	-0.377505	-0.071522	-100.0	-1000.0	-1000.0	-199.0	-300.0	NaN	NaN	99.0	200.0	101.0	NaN	NaN	NaN	0.055556	0.235702	4.242641	0.0	0.055556	0.055556	0.055556	0.235702	4.242641	0.0	0.055556	0.055556	0.055556	0.235702	4.242641	0.0	0.055556	0.055556	2.166667	1.424574	1.045316	2.0	1.166667	1.083333	0.000000	0.00000	0.000000	0.0	0.000000	0.000000	0.000000	0.00000	0.000000	0.0	0.000000	0.000000	0.000000	0.00000	0.000000	0.0	0.000000	0.000000	2.000000	2.000000	2.000000	1.0	1.000000	1.000000	0.263158	0.452414	1.170193	0.0	0.263158	0.263158	0.105263	0.315302	2.798440	0.0	0.105263	0.105263	0.052632	0.229416	4.358899	0.0	0.052632	0.052632	2.421053	1.538968	0.523510	2.0	1.421053	1.210526	-0.636816	0.121459	0.354935	-0.017263	-0.300365	0.270106	0.159194	0.041874	-0.220838	-0.030071	-0.017451	-0.361946	-0.112844	0.034198	-0.273963	-0.152709	-0.064833	-0.219234	-0.118912	-0.074632	3.767361	2.265364	1.318728	3.0	0.767361	0.941840	0.0	-0.193972	0.057481	-0.047602	4	1	0	4	3
3	16293	4	0	2	0	0	0	0	0	374	0	20202116	1	7	23	9	21.0	3	3	25.3	2	1	3.0	0	5	10	7	1	2	2020-06-30 18:00:00	11	9	20	20	2.0	0	1.0	0.0	1	10	12	224	0	0	0.0	2.020212e+11	NaN	NaN	0.0	0.0	NaN	NaN	NaN	0.0	0.0	0.0	0	1	1	1	1	0	0	1	264	0	0	0	0	1	1	NaN	NaN	1.0	1.0	NaN	NaN	NaN	0.0	0.0	0.0	NaN	NaN	2.0	3.0	NaN	NaN	NaN	-1.0	-2.0	-1.0	0.0	0.0	0.0	0.000000	0.0	0.00000	0.0	0.0	0.000000	0.0	0.0	0.361212	0.0	0.0	0.0	0.0	0.000000	0.0	0.0	0.000000	0.0	0.0	0.0	0.0	0.000000	0.0	0.0	0.0	0.000000	0.000000	0.627983	0.0	0.000027	0.0	0.0	0.449392	0.381999	0.025985	0.546375	0.137803	0.078208	0.000000	0.073848	0.051464	0.052648	0.041174	0.138234	0.130827	0.077219	0.089527	-0.416435	0.003972	-0.012131	-0.665798	-0.377505	-0.071522	20.0	20.0	0.0	NaN	NaN	121.0	221.0	NaN	NaN	NaN	-101.0	-201.0	-100.0	0.166667	0.389249	2.055237	0.0	0.166667	0.166667	0.000000	0.000000	0.000000	0.0	0.000000	0.000000	0.000000	0.000000	0.000000	0.0	0.000000	0.000000	2.916667	1.164500	-0.639975	3.0	-1.083333	0.583333	0.333333	0.57735	1.732051	0.0	0.333333	0.333333	0.000000	0.00000	0.000000	0.0	0.000000	0.000000	0.000000	0.00000	0.000000	0.0	0.000000	0.000000	3.000000	1.000000	0.000000	3.0	-1.000000	0.600000	0.142857	0.377964	2.645751	0.0	0.142857	0.142857	0.000000	0.000000	0.000000	0.0	0.000000	0.000000	0.000000	0.000000	0.000000	0.0	0.000000	0.000000	3.142857	0.899735	-0.353045	3.0	-0.857143	0.628571	-0.293898	-0.682720	-0.372114	-0.275994	-0.025979	-0.289059	-0.249626	-0.158058	0.358194	-0.393992	0.101684	0.149978	0.134632	-0.000190	0.068536	0.242916	0.194148	0.001904	0.020913	0.189994	2.750000	2.944169	1.202660	1.0	1.750000	1.375000	0.0	-0.356571	0.176083	0.130036	4	1	0	4	3
4	16297	4	1	2	1	0	0	0	0	274	0	20202116	1	6	44	7	21.0	4	26	NaN	5	2	2.0	0	5	10	7	1	2	2020-06-30 18:00:00	11	9	21	121	2.0	0	1.0	1.0	1	10	12	175	0	0	0.0	2.020212e+11	0.0	NaN	0.0	NaN	0.0	NaN	NaN	0.0	NaN	NaN	0	1	1	1	2	0	0	1	264	0	0	0	0	1	1	1.0	NaN	1.0	1.0	0.0	NaN	NaN	0.0	0.0	0.0	1.0	NaN	3.0	4.0	1.0	NaN	NaN	-1.0	-2.0	-1.0	0.0	0.0	0.0	0.000000	0.0	0.00000	0.0	0.0	0.000000	0.0	0.0	0.361212	0.0	0.0	0.0	0.0	0.000000	0.0	0.0	0.000000	0.0	0.0	0.0	0.0	0.000000	0.0	0.0	0.0	0.000000	0.000000	0.627983	0.0	0.000027	0.0	0.0	0.449392	0.381999	0.025985	0.546375	0.137803	0.078208	0.000000	0.073848	0.051464	0.052648	0.041174	0.138234	0.130827	0.077219	0.089527	-0.416435	0.003972	-0.012131	-0.665798	-0.377505	-0.071522	121.0	1021.0	1000.0	20.0	NaN	221.0	NaN	101.0	NaN	NaN	-100.0	NaN	NaN	0.166667	0.389249	2.055237	0.0	0.166667	0.166667	0.000000	0.000000	0.000000	0.0	0.000000	0.000000	0.000000	0.000000	0.000000	0.0	0.000000	0.000000	2.916667	1.164500	-0.639975	3.0	-1.083333	0.583333	0.250000	0.50000	2.000000	0.0	0.250000	0.250000	0.000000	0.00000	0.000000	0.0	0.000000	0.000000	0.000000	0.00000	0.000000	0.0	0.000000	0.000000	2.750000	1.500000	-0.370370	3.0	-1.250000	0.550000	0.142857	0.377964	2.645751	0.0	0.142857	0.142857	0.000000	0.000000	0.000000	0.0	0.000000	0.000000	0.000000	0.000000	0.000000	0.0	0.000000	0.000000	3.142857	0.899735	-0.353045	3.0	-0.857143	0.628571	-0.386338	-0.594067	-0.347844	-0.226383	0.012064	-0.262602	-0.269799	-0.190480	0.328151	-0.368785	0.083654	0.184253	0.157452	0.016523	0.120979	0.254351	0.182308	-0.010740	0.020615	0.179231	2.809741	2.179176	1.824723	2.0	0.809741	0.936580	0.0	-0.350097	0.079449	-0.078957	4	1	0	4	3
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
29915	870	3	0	2	0	0	0	0	3	54	1	20202175	15	7	41	8	6.0	10	7	36.4	2	1	3.0	0	5	6	1	11	5	2020-06-19 18:00:00	5	0	20	20	16.0	0	8.0	21.0	11	6	15	48	0	1	0.0	2.020218e+11	NaN	NaN	1.0	1.0	NaN	NaN	NaN	-1.0	-1.0	0.0	0	1	1	3	5	280	244	0	0	1	1	0	0	0	0	2.0	2.0	3.0	3.0	1.0	1.0	0.0	0.0	0.0	0.0	4.0	9.0	11.0	12.0	1.0	-4.0	-5.0	-6.0	-7.0	-1.0	0.0	0.0	0.0	0.000000	0.0	0.00264	0.0	0.0	0.000000	0.0	0.0	0.000000	0.0	0.0	0.0	0.0	0.282229	0.0	0.0	0.001583	0.0	0.0	0.0	0.0	0.000000	0.0	0.0	0.0	0.000709	0.000000	0.000000	0.0	0.000000	0.0	0.0	0.582731	0.154128	0.551032	0.000000	0.129537	0.113649	0.125419	0.012137	0.097306	0.057493	0.133505	0.093416	0.044535	0.081177	0.043619	0.737536	-0.633337	-0.518812	0.161832	0.021083	0.129890	2120.0	21020.0	21000.0	NaN	NaN	2210.0	2301.0	NaN	NaN	NaN	-90.0	-181.0	-91.0	0.666667	0.577350	-1.732051	1.0	0.666667	0.666667	0.000000	0.000000	0.000000	0.0	0.000000	0.000000	0.000000	0.000000	0.000000	0.0	0.000000	0.000000	2.333333	0.577350	1.732051	2.0	-0.666667	0.583333	0.000000	0.00000	0.000000	0.0	0.000000	0.000000	0.333333	0.57735	1.732051	0.0	0.333333	0.333333	0.333333	0.57735	1.732051	0.0	0.333333	0.333333	3.333333	0.577350	1.732051	3.0	0.333333	0.833333	0.266667	0.457738	1.176354	0.0	0.266667	0.266667	0.066667	0.258199	3.872983	0.0	0.066667	0.066667	0.066667	0.258199	3.872983	0.0	0.066667	0.066667	2.600000	1.549193	0.915046	2.0	-0.400000	0.650000	0.331927	0.525413	0.145993	0.103222	0.685585	-0.883314	0.061444	-1.154701	-0.434544	0.169067	0.024685	0.025687	-0.109969	-0.525101	-0.276035	-0.008975	-0.202277	-0.006182	-0.122699	-0.193467	5.640704	1.969299	0.791253	5.0	0.640704	0.940117	0.0	0.075982	0.009889	-0.105958	1	1	1	1	4
29916	878	2	0	1	1	1	0	0	3	366	0	20202175	15	6	49	6	7.0	1	16	40.6	3	1	3.0	0	5	6	1	11	5	2020-06-19 18:00:00	5	0	10	110	16.0	0	8.0	22.0	11	6	15	219	1	1	0.0	2.020218e+11	0.0	NaN	1.0	NaN	1.0	NaN	NaN	0.0	NaN	NaN	0	1	1	3	11	0	0	0	0	0	0	0	0	0	0	3.0	2.0	3.0	NaN	0.0	1.0	1.0	0.0	NaN	NaN	5.0	4.0	12.0	NaN	6.0	7.0	1.0	-1.0	NaN	NaN	0.0	0.0	0.0	0.000000	0.0	0.00264	0.0	0.0	0.000000	0.0	0.0	0.000000	0.0	0.0	0.0	0.0	0.282229	0.0	0.0	0.001583	0.0	0.0	0.0	0.0	0.000000	0.0	0.0	0.0	0.000709	0.000000	0.000000	0.0	0.000000	0.0	0.0	0.582731	0.154128	0.551032	0.000000	0.129537	0.113649	0.125419	0.012137	0.097306	0.057493	0.133505	0.093416	0.044535	0.081177	0.043619	0.737536	-0.633337	-0.518812	0.161832	0.021083	0.129890	2210.0	22010.0	22001.0	2120.0	NaN	2301.0	NaN	90.0	NaN	NaN	-91.0	NaN	NaN	0.666667	0.577350	-1.732051	1.0	-0.333333	0.333333	0.000000	0.000000	0.000000	0.0	0.000000	0.000000	0.000000	0.000000	0.000000	0.0	0.000000	0.000000	2.333333	0.577350	1.732051	2.0	0.333333	0.777778	1.000000	NaN	NaN	1.0	0.000000	0.500000	0.000000	NaN	NaN	0.0	0.000000	0.000000	0.000000	NaN	NaN	0.0	0.000000	0.000000	2.000000	NaN	NaN	2.0	0.000000	0.666667	0.357143	0.487950	0.630582	0.0	-0.642857	0.178571	0.071429	0.262265	3.519631	0.0	0.071429	0.071429	0.000000	0.000000	0.000000	0.0	0.000000	0.000000	2.607143	1.314852	0.696802	2.0	0.607143	0.869048	0.406377	-1.178263	-0.178429	0.841705	0.737424	0.083415	0.372132	-0.616314	-0.039892	0.089380	0.191970	-0.061905	-0.191084	-0.100138	-0.016807	-0.155647	-0.134428	0.131957	-0.132521	-0.164374	8.514019	3.505695	-0.881026	10.0	-2.485981	0.709502	-1.0	0.658098	-0.209597	0.244050	1	1	1	1	4
29917	880	2	1	0	2	1	0	0	3	280	1	20202175	15	6	50	11	9.0	1	26	NaN	5	2	2.0	0	5	6	1	11	5	2020-06-19 18:00:00	5	0	1	201	16.0	0	8.0	23.0	11	6	15	177	1	1	0.0	2.020218e+11	1.0	0.0	NaN	NaN	0.0	1.0	1.0	NaN	NaN	NaN	0	1	1	3	12	0	0	0	0	0	0	0	0	0	0	3.0	3.0	NaN	NaN	0.0	0.0	0.0	NaN	NaN	NaN	11.0	5.0	NaN	NaN	1.0	7.0	6.0	NaN	NaN	NaN	0.0	0.0	0.0	0.000000	0.0	0.00264	0.0	0.0	0.000000	0.0	0.0	0.000000	0.0	0.0	0.0	0.0	0.282229	0.0	0.0	0.001583	0.0	0.0	0.0	0.0	0.000000	0.0	0.0	0.0	0.000709	0.000000	0.000000	0.0	0.000000	0.0	0.0	0.582731	0.154128	0.551032	0.000000	0.129537	0.113649	0.125419	0.012137	0.097306	0.057493	0.133505	0.093416	0.044535	0.081177	0.043619	0.737536	-0.633337	-0.518812	0.161832	0.021083	0.129890	2301.0	23001.0	23001.0	2210.0	2120.0	NaN	NaN	91.0	181.0	90.0	NaN	NaN	NaN	0.666667	0.577350	-1.732051	1.0	-0.333333	0.333333	0.000000	0.000000	0.000000	0.0	0.000000	0.000000	0.000000	0.000000	0.000000	0.0	0.000000	0.000000	2.333333	0.577350	1.732051	2.0	0.333333	0.777778	1.000000	NaN	NaN	1.0	0.000000	0.500000	0.000000	NaN	NaN	0.0	0.000000	0.000000	0.000000	NaN	NaN	0.0	0.000000	0.000000	2.000000	NaN	NaN	2.0	0.000000	0.666667	0.266667	0.457738	1.176354	0.0	-0.733333	0.133333	0.066667	0.258199	3.872983	0.0	0.066667	0.066667	0.066667	0.258199	3.872983	0.0	0.066667	0.066667	2.600000	1.549193	0.915046	2.0	0.600000	0.866667	0.450837	-1.115967	-0.002437	0.769054	0.722206	0.129326	0.454946	-0.441497	-0.448458	0.030482	0.111036	-0.055247	-0.226670	-0.457149	-0.005288	-0.110743	-0.036442	0.105414	-0.093453	-0.077150	7.036145	2.047590	0.868201	7.0	-4.963855	0.541242	-5.0	0.214718	-0.370890	-0.079173	1	1	1	1	4
29918	886	3	1	1	0	1	0	0	20	27	0	20202175	16	6	54	8	16.0	7	5	33.8	2	1	3.0	0	5	6	1	11	5	2020-06-19 18:00:00	5	0	11	11	17.0	1	8.0	21.0	6	11	28	31	1	1	0.0	2.020218e+11	NaN	NaN	0.0	NaN	NaN	NaN	NaN	1.0	NaN	NaN	0	1	0	3	4	270	236	0	0	1	1	0	0	0	0	2.0	2.0	3.0	NaN	1.0	1.0	0.0	0.0	NaN	NaN	2.0	1.0	5.0	NaN	2.0	3.0	1.0	-1.0	NaN	NaN	0.0	0.0	0.0	0.000000	0.0	0.00000	0.0	0.0	0.001143	0.0	0.0	0.000000	0.0	0.0	0.0	0.0	0.000000	0.0	0.0	0.000000	0.0	0.0	0.0	0.0	0.605946	0.0	0.0	0.0	0.000000	0.000225	0.295174	0.0	0.000000	0.0	0.0	0.642679	0.140488	0.304436	0.000000	0.078729	0.000000	0.000000	0.124228	0.084934	0.078022	0.054950	0.125196	0.002976	0.076973	0.130165	0.737536	-0.633337	-0.518812	0.700592	-0.375337	0.026196	2111.0	21011.0	21001.0	NaN	NaN	2321.0	NaN	NaN	NaN	NaN	-210.0	NaN	NaN	0.500000	0.707107	NaN	0.5	-0.500000	0.250000	0.000000	0.000000	NaN	0.0	0.000000	0.000000	0.000000	0.000000	NaN	0.0	0.000000	0.000000	4.500000	2.121320	NaN	4.5	1.500000	1.125000	0.750000	0.50000	-2.000000	1.0	-0.250000	0.375000	0.000000	0.00000	0.000000	0.0	0.000000	0.000000	0.000000	0.00000	0.000000	0.0	0.000000	0.000000	2.250000	0.957427	-0.854563	2.5	-0.750000	0.562500	0.357143	0.487950	0.630582	0.0	-0.642857	0.178571	0.071429	0.262265	3.519631	0.0	0.071429	0.071429	0.000000	0.000000	0.000000	0.0	0.000000	0.000000	2.607143	1.314852	0.696802	2.0	-0.392857	0.651786	0.495186	-1.298038	-0.252374	0.487661	0.379406	0.211701	0.202830	-0.465027	0.043380	0.080003	-0.014064	0.088538	-0.068858	0.049522	0.201122	0.096353	0.089836	0.161669	-0.084881	-0.182615	4.809917	1.951011	1.671194	4.0	0.809917	0.961983	0.0	-0.105838	0.042613	-0.144475	1	1	1	1	4
29919	892	6	1	2	2	0	0	0	20	315	0	20202175	16	3	35	4	2.0	5	26	NaN	5	2	0.0	0	5	6	1	11	5	2020-06-19 18:00:00	5	0	21	221	17.0	1	8.0	23.0	6	11	28	192	0	1	0.0	2.020218e+11	1.0	NaN	NaN	NaN	-1.0	NaN	NaN	NaN	NaN	NaN	0	1	0	3	5	270	236	0	0	1	1	0	0	0	0	3.0	2.0	NaN	NaN	0.0	1.0	1.0	NaN	NaN	NaN	4.0	2.0	NaN	NaN	1.0	3.0	2.0	NaN	NaN	NaN	0.0	0.0	0.0	0.000000	0.0	0.00000	0.0	0.0	0.001143	0.0	0.0	0.000000	0.0	0.0	0.0	0.0	0.000000	0.0	0.0	0.000000	0.0	0.0	0.0	0.0	0.605946	0.0	0.0	0.0	0.000000	0.000225	0.295174	0.0	0.000000	0.0	0.0	0.642679	0.140488	0.304436	0.000000	0.078729	0.000000	0.000000	0.124228	0.084934	0.078022	0.054950	0.125196	0.002976	0.076973	0.130165	0.737536	-0.633337	-0.518812	0.700592	-0.375337	0.026196	2321.0	23021.0	23000.0	2111.0	NaN	NaN	NaN	210.0	NaN	NaN	NaN	NaN	NaN	0.500000	0.707107	NaN	0.5	0.500000	0.500000	0.000000	0.000000	NaN	0.0	0.000000	0.000000	0.000000	0.000000	NaN	0.0	0.000000	0.000000	4.500000	2.121320	NaN	4.5	-1.500000	0.642857	0.333333	0.57735	1.732051	0.0	0.333333	0.333333	0.000000	0.00000	0.000000	0.0	0.000000	0.000000	0.000000	0.00000	0.000000	0.0	0.000000	0.000000	3.666667	2.081666	1.293343	3.0	-2.333333	0.523810	0.357143	0.487950	0.630582	0.0	0.357143	0.357143	0.071429	0.262265	3.519631	0.0	0.071429	0.071429	0.000000	0.000000	0.000000	0.0	0.000000	0.000000	2.607143	1.314852	0.696802	2.0	-3.392857	0.372449	0.024474	-0.738453	-0.101643	0.249932	-0.097679	-0.813341	-0.198979	-0.552263	0.190887	-0.418871	-0.094543	0.203682	0.047874	-0.018201	-0.269995	-0.067695	0.149479	0.160411	-0.072224	-0.131935	5.380952	2.102252	1.347979	5.0	0.380952	0.896825	0.0	0.011746	0.005797	-0.109157	1	1	1	1	4

29920 rows × 267 columns

# Containers collecting one prediction array per trained model family,
# plus an (initially empty) frame reserved for feature-importance records.
oof_preds_set, test_preds_set = [], []
feature_importance = pd.DataFrame()

##############################################
# LightGBM
##############################################

import lightgbm as lgb
import logging
from sklearn.metrics import f1_score

def get_score(y_true, y_pred):
    """Return the evaluation scores for a set of hard class predictions.

    Args:
        y_true: Ground-truth class labels.
        y_pred: Predicted class labels (not probabilities).

    Returns:
        dict with a single key ``'f1'`` holding the macro-averaged F1 score
        rounded to 5 decimal places.
    """
    macro_f1 = f1_score(y_true, y_pred, average='macro')
    return {'f1': round(macro_f1, 5)}

def feval_f1(y_true, y_pred):
    """Custom LightGBM evaluation metric: macro-averaged F1 (multiclass).

    Args:
        y_true: Ground-truth class labels of the evaluation set.
        y_pred: Raw per-class scores supplied by LightGBM. Two layouts are
            accepted:
            * 1-D, flattened class-major (all rows of class 0, then class 1,
              ...), which is reshaped to ``(CFG.num_class, n_samples)``;
            * 2-D of shape ``(n_samples, n_classes)``.

    Returns:
        Tuple ``(metric_name, value, is_higher_better)`` as expected by the
        ``eval_metric`` callback of the LightGBM scikit-learn API.
    """
    y_pred = np.asarray(y_pred)
    if y_pred.ndim == 2:
        # Row-major (n_samples, n_classes) layout; per LightGBM's docs this is
        # what 4.x passes to sklearn-API metrics. Reshaping it to
        # (num_class, -1) would scramble rows and classes without any error.
        pred_labels = np.argmax(y_pred, axis=1)
    else:
        # Flat class-major layout: one row per class after the reshape.
        pred_labels = np.argmax(y_pred.reshape(CFG.num_class, -1), axis=0)
    return 'f1_macro', f1_score(y_true, pred_labels, average='macro'), True


# Model inputs: every column of `df` that is not explicitly excluded.
train_feats = [f for f in df.columns if f not in drop_feats]

# Accumulators for the averaged class probabilities.
oof_preds = np.zeros((len(train_df), CFG.num_class)).astype(np.float32)
test_preds = np.zeros((len(test_df), CFG.num_class)).astype(np.float32)

# The test matrix is identical for every seed/fold, so build it once
# instead of copying the full test frame on each iteration.
test_x = test_df[train_feats]

for seed in tqdm(CFG.seeds):
    # Hyper-parameters only vary by seed, so build them once per seed.
    params = {
        "objective" : "multiclass", 
        "num_class": CFG.num_class,
        "boosting" : "gbdt",
        "metric" : "None", 
        'class_weight': 'balanced',
        'max_bin': 128,
        'num_leaves': 48,
        'feature_fraction': 0.8,
        'learning_rate': 0.05,
        "seed": seed,
        "verbosity": -1
    }

    # `fold_{seed}` holds the validation-fold id assigned to each row for
    # this seed.
    fold_ids = train_df[f'fold_{seed}'].values

    for fold in range(CFG.n_splits):
        # Boolean *positional* masks: they stay aligned with the NumPy
        # accumulator `oof_preds` even if `train_df` does not carry a default
        # 0..n-1 index (index labels must not be used to index NumPy arrays).
        valid_mask = fold_ids == fold
        train_mask = ~valid_mask

        train_x = train_df.loc[train_mask, train_feats]
        train_y = train_df.loc[train_mask, CFG.target]
        valid_x = train_df.loc[valid_mask, train_feats]
        valid_y = train_df.loc[valid_mask, CFG.target]

        print(f'train_x.shape = {train_x.shape}, train_y.shape = {train_y.shape}')
        print(f'valid_x.shape = {valid_x.shape}, valid_y.shape = {valid_y.shape}')

        # ------- Start Training
        # metric="None" disables built-in metrics, so early stopping is
        # driven solely by the custom macro-F1 callback.
        model = lgb.LGBMClassifier(**params)
        model.fit(
            train_x, train_y,
            eval_set=(valid_x, valid_y),
            eval_metric=feval_f1,
            verbose=False,
            early_stopping_rounds=100,
        )
        best_iter = model.best_iteration_

        # validation prediction: each seed contributes 1/len(seeds) to the
        # out-of-fold probabilities of the rows it validated on.
        preds = model.predict_proba(valid_x, num_iteration=best_iter)
        oof_preds[valid_mask] += preds / len(CFG.seeds)
        fold_score = get_score(valid_y, np.argmax(preds, axis=1))
        print(f'Fold={fold} fold_score = {fold_score}')

        # test prediction: plain average over all seeds x folds models.
        preds = model.predict_proba(test_x, num_iteration=best_iter)
        test_preds[:] += preds / (len(CFG.seeds) * CFG.n_splits)

oof_preds_set.append(oof_preds)
test_preds_set.append(test_preds)

# NOTE: this score is computed on the accumulated (all-seed) OOF
# probabilities; `seed` in the message is simply the last seed iterated.
oof_score = get_score(train_df[CFG.target].values, np.argmax(oof_preds, axis=1))
print(f'LGB seed={seed} oof_score = {oof_score}')

  0%|          | 0/5 [00:00<?, ?it/s]

train_x.shape = (23730, 188), train_y.shape = (23730,)
valid_x.shape = (6190, 188), valid_y.shape = (6190,)
[LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8
[LightGBM] [Warning] boosting is set=gbdt, boosting_type=gbdt will be ignored. Current value: boosting=gbdt
Fold=0 fold_score = {'f1': 0.18114}
train_x.shape = (23890, 188), train_y.shape = (23890,)
valid_x.shape = (6030, 188), valid_y.shape = (6030,)
[LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8
[LightGBM] [Warning] boosting is set=gbdt, boosting_type=gbdt will be ignored. Current value: boosting=gbdt
Fold=1 fold_score = {'f1': 0.1752}
train_x.shape = (23850, 188), train_y.shape = (23850,)
valid_x.shape = (6070, 188), valid_y.shape = (6070,)
[LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8
[LightGBM] [Warning] boosting is set=gbdt, boosting_type=gbdt will be ignored. Current value: boosting=gbdt
Fold=2 fold_score = {'f1': 0.20671}
train_x.shape = (23900, 188), train_y.shape = (23900,)
valid_x.shape = (6020, 188), valid_y.shape = (6020,)
[LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8
[LightGBM] [Warning] boosting is set=gbdt, boosting_type=gbdt will be ignored. Current value: boosting=gbdt
Fold=3 fold_score = {'f1': 0.16425}
train_x.shape = (24310, 188), train_y.shape = (24310,)
valid_x.shape = (5610, 188), valid_y.shape = (5610,)
[LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8
[LightGBM] [Warning] boosting is set=gbdt, boosting_type=gbdt will be ignored. Current value: boosting=gbdt
Fold=4 fold_score = {'f1': 0.19724}
train_x.shape = (23890, 188), train_y.shape = (23890,)
valid_x.shape = (6030, 188), valid_y.shape = (6030,)
[LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8
[LightGBM] [Warning] boosting is set=gbdt, boosting_type=gbdt will be ignored. Current value: boosting=gbdt
Fold=0 fold_score = {'f1': 0.16141}
train_x.shape = (23890, 188), train_y.shape = (23890,)
valid_x.shape = (6030, 188), valid_y.shape = (6030,)
[LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8
[LightGBM] [Warning] boosting is set=gbdt, boosting_type=gbdt will be ignored. Current value: boosting=gbdt
Fold=1 fold_score = {'f1': 0.19556}
train_x.shape = (23790, 188), train_y.shape = (23790,)
valid_x.shape = (6130, 188), valid_y.shape = (6130,)
[LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8
[LightGBM] [Warning] boosting is set=gbdt, boosting_type=gbdt will be ignored. Current value: boosting=gbdt
Fold=2 fold_score = {'f1': 0.18581}
train_x.shape = (23880, 188), train_y.shape = (23880,)
valid_x.shape = (6040, 188), valid_y.shape = (6040,)
[LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8
[LightGBM] [Warning] boosting is set=gbdt, boosting_type=gbdt will be ignored. Current value: boosting=gbdt
Fold=3 fold_score = {'f1': 0.22147}
train_x.shape = (24230, 188), train_y.shape = (24230,)
valid_x.shape = (5690, 188), valid_y.shape = (5690,)
[LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8
[LightGBM] [Warning] boosting is set=gbdt, boosting_type=gbdt will be ignored. Current value: boosting=gbdt
Fold=4 fold_score = {'f1': 0.19657}
train_x.shape = (23890, 188), train_y.shape = (23890,)
valid_x.shape = (6030, 188), valid_y.shape = (6030,)
[LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8
[LightGBM] [Warning] boosting is set=gbdt, boosting_type=gbdt will be ignored. Current value: boosting=gbdt
Fold=0 fold_score = {'f1': 0.18898}
train_x.shape = (23870, 188), train_y.shape = (23870,)
valid_x.shape = (6050, 188), valid_y.shape = (6050,)
[LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8
[LightGBM] [Warning] boosting is set=gbdt, boosting_type=gbdt will be ignored. Current value: boosting=gbdt
Fold=1 fold_score = {'f1': 0.19932}
train_x.shape = (23790, 188), train_y.shape = (23790,)
valid_x.shape = (6130, 188), valid_y.shape = (6130,)
[LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8
[LightGBM] [Warning] boosting is set=gbdt, boosting_type=gbdt will be ignored. Current value: boosting=gbdt
Fold=2 fold_score = {'f1': 0.22339}
train_x.shape = (23770, 188), train_y.shape = (23770,)
valid_x.shape = (6150, 188), valid_y.shape = (6150,)
[LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8
[LightGBM] [Warning] boosting is set=gbdt, boosting_type=gbdt will be ignored. Current value: boosting=gbdt
Fold=3 fold_score = {'f1': 0.21488}
train_x.shape = (24360, 188), train_y.shape = (24360,)
valid_x.shape = (5560, 188), valid_y.shape = (5560,)
[LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8
[LightGBM] [Warning] boosting is set=gbdt, boosting_type=gbdt will be ignored. Current value: boosting=gbdt
Fold=4 fold_score = {'f1': 0.16726}
train_x.shape = (23920, 188), train_y.shape = (23920,)
valid_x.shape = (6000, 188), valid_y.shape = (6000,)
[LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8
[LightGBM] [Warning] boosting is set=gbdt, boosting_type=gbdt will be ignored. Current value: boosting=gbdt
Fold=0 fold_score = {'f1': 0.16977}
train_x.shape = (23860, 188), train_y.shape = (23860,)
valid_x.shape = (6060, 188), valid_y.shape = (6060,)
[LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8
[LightGBM] [Warning] boosting is set=gbdt, boosting_type=gbdt will be ignored. Current value: boosting=gbdt
Fold=1 fold_score = {'f1': 0.19786}
train_x.shape = (23840, 188), train_y.shape = (23840,)
valid_x.shape = (6080, 188), valid_y.shape = (6080,)
[LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8
[LightGBM] [Warning] boosting is set=gbdt, boosting_type=gbdt will be ignored. Current value: boosting=gbdt
Fold=2 fold_score = {'f1': 0.17452}
train_x.shape = (23810, 188), train_y.shape = (23810,)
valid_x.shape = (6110, 188), valid_y.shape = (6110,)
[LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8
[LightGBM] [Warning] boosting is set=gbdt, boosting_type=gbdt will be ignored. Current value: boosting=gbdt
Fold=3 fold_score = {'f1': 0.18927}
train_x.shape = (24250, 188), train_y.shape = (24250,)
valid_x.shape = (5670, 188), valid_y.shape = (5670,)
[LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8
[LightGBM] [Warning] boosting is set=gbdt, boosting_type=gbdt will be ignored. Current value: boosting=gbdt
Fold=4 fold_score = {'f1': 0.18385}
train_x.shape = (23740, 188), train_y.shape = (23740,)
valid_x.shape = (6180, 188), valid_y.shape = (6180,)
[LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8
[LightGBM] [Warning] boosting is set=gbdt, boosting_type=gbdt will be ignored. Current value: boosting=gbdt
Fold=0 fold_score = {'f1': 0.19293}
train_x.shape = (23770, 188), train_y.shape = (23770,)
valid_x.shape = (6150, 188), valid_y.shape = (6150,)
[LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8
[LightGBM] [Warning] boosting is set=gbdt, boosting_type=gbdt will be ignored. Current value: boosting=gbdt
Fold=1 fold_score = {'f1': 0.19136}
train_x.shape = (23830, 188), train_y.shape = (23830,)
valid_x.shape = (6090, 188), valid_y.shape = (6090,)
[LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8
[LightGBM] [Warning] boosting is set=gbdt, boosting_type=gbdt will be ignored. Current value: boosting=gbdt
Fold=2 fold_score = {'f1': 0.20508}
train_x.shape = (23910, 188), train_y.shape = (23910,)
valid_x.shape = (6010, 188), valid_y.shape = (6010,)
[LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8
[LightGBM] [Warning] boosting is set=gbdt, boosting_type=gbdt will be ignored. Current value: boosting=gbdt
Fold=3 fold_score = {'f1': 0.18377}
train_x.shape = (24430, 188), train_y.shape = (24430,)
valid_x.shape = (5490, 188), valid_y.shape = (5490,)
[LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8
[LightGBM] [Warning] boosting is set=gbdt, boosting_type=gbdt will be ignored. Current value: boosting=gbdt
Fold=4 fold_score = {'f1': 0.17416}
LGB seed=2025 oof_score = {'f1': 0.18581}

# Sanity check: show how many prediction arrays were collected in each
# container before they are averaged (presumably one entry per trained
# seed -- confirm against the training loop that fills these containers).
len(oof_preds_set), len(test_preds_set)

(1, 1)

# Blend the collected predictions: element-wise mean across the entries of
# each container (axis 0 indexes the entries of the *_set collections).
oof_preds = np.asarray(oof_preds_set).mean(axis=0)
test_preds = np.asarray(test_preds_set).mean(axis=0)

# Score the blended out-of-fold predictions against the training target,
# using the highest-scoring column of each row as the predicted class.
print('============ mean oof_preds_set ============')
y_true = train_df[CFG.target].values
y_pred = oof_preds.argmax(axis=1)
oof_score = get_score(y_true, y_pred)
print('mean oof_score = {}'.format(oof_score))

============ mean oof_preds_set ============
mean oof_score = {'f1': 0.18581}

##############################################
# SUBMISSION
##############################################
# Write the blended test predictions to ./submission_<oof f1>.csv with the
# columns `id` and CFG.target, ordered by `id`.

# Predicted class per test row, stored on test_df in its current row order
# (assumes test_preds rows line up with test_df rows -- the positional
# assignment below relies on that).
test_df['preds'] = np.argmax(test_preds, axis=1)

# Sort ids and predictions TOGETHER. Sorting only the predictions while
# keeping the ids in test_df's order would pair each id with another row's
# prediction whenever test_df is not already sorted by id.
sample_submission = (
    test_df[['id', 'preds']]
    .sort_values('id')
    .rename(columns={'preds': CFG.target})
    .reset_index(drop=True)
)

# oof_score arrives as the score dict from the scoring cell, but this cell
# rebinds it to a float; accept both so the cell can be re-run safely.
if isinstance(oof_score, dict):
    oof_score = oof_score['f1']
oof_score = round(oof_score, 6)

subm_path = f'./submission_{oof_score}.csv'
sample_submission.to_csv(subm_path, index=False)
print('subm file created: {}'.format(subm_path))

subm file created: ./submission_0.18581.csv

Rauta Private 4th Solution (Private score = 0.19004)

SUMMARY

アプローチ

添付データ

new user