# Function that runs the feature-creation functions
def preprocess(input_df, base_df):
    """Run every feature-building step and return the resulting frame.

    The steps are applied to a copy of ``input_df`` in a fixed order:
    group statistics merged in from ``base_df``, pivot PCA/NMF features,
    next/previous runner-state features, TF-IDF features, skip/move flags
    and finally target encoding of the columns listed below.

    Args:
        input_df: Frame the features are attached to; it is copied first.
        base_df: Frame the group statistics are computed from. It is first
            reduced to the median per ``(gameID, outCount)``.

    Returns:
        Whatever ``reduce_mem_usage`` returns for the feature frame.

    NOTE(review): the helpers called here are defined elsewhere; the
    comments below describe only how they are invoked, not what they do.
    """
    seed_everything(seed=SEED)
    features = input_df.copy()

    # aggregation
    median_df = base_df.groupby(['gameID', 'outCount']).median().reset_index()

    count_and_bases = ('S', 'B', 'b1', 'b2', 'b3')
    # (group keys, value columns), in the order they are merged.
    group_specs = (
        (('gameID',), count_and_bases),
        (('gameID', 'inningHalf'), count_and_bases),
        (('b1_b2_b3',), ('S', 'B')),
        (('B_S_O',), ('b1', 'b2', 'b3')),
        (('ballIdx',), ('O', 'b1', 'b2', 'b3')),
        (('baseIdx',), ('B', 'S')),
        (('inningNumber',), count_and_bases),
        (('outCount',), count_and_bases),
    )
    for keys, values in group_specs:
        # Fresh lists on every call, so a helper can never share state
        # between iterations through its arguments.
        stats_df = aggregation(
            median_df, list(keys), list(values), ['mean', 'std']
        )
        features = pd.merge(features, stats_df, on=list(keys), how='left')

    for player_col in ('pitcher', 'batter'):
        stats_df = smooth_aggregation(
            median_df, [player_col], ['baseIdx', 'ballIdx'], ['mean', 'std']
        )
        features = pd.merge(features, stats_df, on=[player_col], how='left')
        # Rows without a match in the left join get 0 instead of NaN.
        merged_cols = stats_df.columns
        features[merged_cols] = features[merged_cols].fillna(0)

    # pivot
    pivot_steps = (
        (get_pivot_PCA18_features, 4),
        (get_pivot_PCA27_features, 6),
        (get_pivot_PCA54_features, 8),
        (get_pivot_NMF18_features, 2),
        (get_pivot_NMF27_features, 2),
        (get_pivot_NMF54_features, 2),
    )
    for pivot_fn, n_components in pivot_steps:
        for idx_col in ('ballIdx', 'baseIdx'):
            features = pivot_fn(features, n=n_components, value_col=idx_col)

    # next/previous
    for shift_fn in (get_next_data, get_next_diff, get_prev_data, get_prev_diff):
        features = shift_fn(features, value_col='b1_b2_b3', nan_value=8)
    # Single code combining the current and the next b1_b2_b3 value.
    features['runnerCombi'] = (
        features['b1_b2_b3'] + 8 * features['next_b1_b2_b3']
    )

    # TF-IDF
    for term in ('batter', 'b1_b2_b3', 'B_S_O'):
        features = get_tfidf(features, term_col=term, document_col='subGameID')

    # skip
    features = get_skip(features)
    next_runners = features['next_b1_b2_b3']
    next_skip_is_two = features['next_skip'] == 2
    features['move4'] = (next_runners == 0) & next_skip_is_two
    features['move3'] = (next_runners == 4) & next_skip_is_two
    features['move2'] = (next_runners % 4 == 2) & next_skip_is_two
    features['move1'] = (next_runners % 2 == 1) & next_skip_is_two

    # target encoding
    enc_cols = [
        'bottomTeam', 'topTeam', 'pitcherCommon', 'batterCommon',
        'B', 'S', 'O', 'b1', 'b2', 'b3',
        'b1_b2_b3', 'B_S_O',
        'ballIdx', 'baseIdx',
        'next_b1_b2_b3', 'next_diff_b1_b2_b3',
        'prev_b1_b2_b3', 'prev_diff_b1_b2_b3',
        'runnerCombi',
    ]
    for enc_col in enc_cols:
        for class_idx in range(N_CLASS):
            features = target_encoding(
                features, enc_col, 'gameID', f'y{class_idx}'
            )

    return reduce_mem_usage(features)