def get_pit_features(input_df, test_df):
    """Attach per-pitcher aggregate statistics to the train and test frames.

    The statistics are computed from ``input_df`` only and then joined onto
    both frames by the ``pitcher`` column.

    Added columns: ``NumPitchType`` (distinct pitch types), ``maxSpeed``,
    ``medianSpeed``, ``minSpeed``, ``stdSpeed``, ``size`` (rows per pitcher),
    ``hitCount`` (rows with ``y >= 4``) and ``hitRatio``
    (``hitCount / size``).

    Args:
        input_df: Training frame with ``pitcher``, ``pitchType``, ``speed``
            and ``y`` columns. Its ``speed`` column is converted to integers
            in place, as in the original implementation.
        test_df: Frame with a ``pitcher`` column.

    Returns:
        ``(output_df, output_test)``: ``input_df`` and ``test_df`` with the
        per-pitcher columns appended. Both keep one row per input row, in
        input order; pitchers absent from ``input_df`` get NaN statistics.
    """
    # Speeds arrive as strings such as "145km/h"; "-" is mapped to 0.
    # The conversion is skipped when the column is already numeric, so calling
    # this function twice on the same frame no longer fails on slicing ints.
    if not pd.api.types.is_numeric_dtype(input_df["speed"]):
        input_df["speed"] = input_df["speed"].replace(
            "-", "0km/h").apply(lambda x: int(x[:-4]))

    by_pitcher = input_df.groupby("pitcher")
    speed = by_pitcher["speed"]
    # The original comment describes y >= 4 as "hit or better (hit or
    # homerun)". Summing the boolean mask per pitcher yields 0 for pitchers
    # without any such row.
    hit_count = (input_df["y"] >= 4).groupby(input_df["pitcher"]).sum()
    stats = pd.DataFrame({
        "NumPitchType": by_pitcher["pitchType"].nunique(),
        "maxSpeed": speed.max(),
        "medianSpeed": speed.median(),
        "minSpeed": speed.min(),
        "stdSpeed": speed.std(),
        "size": by_pitcher.size(),
        "hitCount": hit_count.astype("int64"),
    })
    stats["hitRatio"] = stats["hitCount"] / stats["size"]
    features = stats.rename_axis("pitcher").reset_index()

    # A left join keeps every row of the left frame in its original order.
    # The previous inner join dropped test rows whose pitcher was never seen
    # in training, which breaks any later positional concatenation.
    output_df = pd.merge(input_df, features, on="pitcher", how="left")
    output_test = pd.merge(test_df, features, on="pitcher", how="left")
    return output_df, output_test
def get_bat_features(input_df, test_df):
    """Attach per-batter aggregate statistics to the train and test frames.

    The statistics are computed from ``input_df`` only and then joined onto
    both frames by the ``batter`` column.

    Added columns: ``bat_size`` (rows per batter), ``bat_hitCount`` (rows
    with ``y >= 4``) and ``bat_hitRatio`` (``bat_hitCount / bat_size``).

    Args:
        input_df: Training frame with ``batter`` and ``y`` columns.
        test_df: Frame with a ``batter`` column.

    Returns:
        ``(output_df, output_test)``: ``input_df`` and ``test_df`` with the
        per-batter columns appended. Both keep one row per input row, in
        input order; batters absent from ``input_df`` get NaN statistics.
    """
    # The original comment describes y >= 4 as "hit or better (hit or
    # homerun)". Summing the boolean mask per batter yields 0 for batters
    # without any such row.
    hit_count = (input_df["y"] >= 4).groupby(input_df["batter"]).sum()
    stats = pd.DataFrame({
        "bat_size": input_df.groupby("batter").size(),
        "bat_hitCount": hit_count.astype("int64"),
    })
    stats["bat_hitRatio"] = stats["bat_hitCount"] / stats["bat_size"]
    features = stats.rename_axis("batter").reset_index()

    # A left join keeps every row of the left frame in its original order;
    # an inner join would drop test rows whose batter was never seen in
    # training.
    output_df = pd.merge(input_df, features, on="batter", how="left")
    output_test = pd.merge(test_df, features, on="batter", how="left")
    # The second element used to be output_df again, so callers silently
    # received train features in place of test features.
    return output_df, output_test
def get_gameinfo_features(input_df):
    """Build two combined categorical keys from base and handedness columns.

    Args:
        input_df: Frame with ``b1``, ``b2``, ``b3``, ``pitcherHand`` and
            ``batterHand`` columns. It is not modified.

    Returns:
        A new frame on the same index with exactly two string columns:
        ``b1-b2-b3`` (the three base columns cast to int, then joined with
        "-") and ``pitcherHand-batterHand`` (missing hands replaced by the
        literal text "NaN" before joining).
    """
    # Cast through int first so that values such as 1.0 or True render as "1".
    bases = input_df[["b1", "b2", "b3"]].astype(int).astype(str)
    pitcher_hand = input_df["pitcherHand"].fillna("NaN")
    batter_hand = input_df["batterHand"].fillna("NaN")
    return pd.DataFrame({
        "b1-b2-b3": bases["b1"] + "-" + bases["b2"] + "-" + bases["b3"],
        "pitcherHand-batterHand": pitcher_hand + "-" + batter_hand,
    })
def get_homeaway_features(input_df):
    """Flag whether the pitcher's team is playing at one of its home venues.

    The team name is taken as the text after the first "@" in the
    ``pitcher`` value (the whole value when there is no "@") and looked up
    in a fixed team-to-stadiums table.

    Args:
        input_df: Frame with ``pitcher`` and ``place`` columns. It is not
            modified.

    Returns:
        A new frame on ``input_df``'s index with two boolean columns:
        ``pit_home`` (``place`` is one of the team's home stadiums) and
        ``bat_home`` (the logical negation of ``pit_home``).

    Raises:
        KeyError: If a team name is not present in the table.
    """
    home_dic = {
        "日本ハム": ["札幌ドーム"],
        "楽天": ["楽天生命パーク"],
        "西武": ["メットライフ"],
        "ロッテ": ["ZOZOマリン"],
        "巨人": ["東京ドーム"],
        "ヤクルト": ["神宮"],
        "DeNA": ["横浜"],
        "中日": ["ナゴヤドーム"],
        "オリックス": ["京セラD大阪", "ほっと神戸"],
        "阪神": ["甲子園"],
        "広島": ["マツダスタジアム"],
        "ソフトバンク": ["PayPayドーム"]
    }
    # split("@", 1)[-1] matches the old x[x.find("@") + 1:] slice: the text
    # after the first "@", or the whole string when "@" is absent.
    teams = input_df["pitcher"].apply(lambda x: x.split("@", 1)[-1])
    pit_home = [
        place in home_dic[team]
        for team, place in zip(teams, input_df["place"])
    ]
    # Build the flags on the input's own index. A bare pd.Series(list) gets
    # a fresh 0..n-1 index, which misaligns (or yields NaN) when the input
    # index is anything else. The explicit bool dtype also keeps `~` valid
    # for an empty frame.
    output_df = pd.DataFrame({"pit_home": pit_home},
                             index=input_df.index,
                             dtype=bool)
    output_df["bat_home"] = ~output_df["pit_home"]
    return output_df
def get_numerical_raw_features(input_df):
    """Return a copy of the columns that are used as features without encoding.

    Args:
        input_df: Frame containing at least the columns listed below.

    Returns:
        An independent copy of those columns, in the listed order.
    """
    raw_columns = (
        "totalPitchingCount",
        "B",
        "S",
        "O",
        "b1",
        "b2",
        "b3",
        "pit_home",
        "bat_home",
    )
    return input_df[list(raw_columns)].copy()
def get_oe_features(input_df):
    """Ordinal-encode the categorical columns.

    A fresh ``ce.OrdinalEncoder`` is fitted on, and applied to, the same
    frame.

    Args:
        input_df: Frame containing the categorical columns listed below.

    Returns:
        The encoder's output with every column name prefixed by ``OE_``.
    """
    categorical_columns = [
        'pitcher',
        'pitcherHand',
        'batter',
        'batterHand',
        'inning',
        'b1-b2-b3',
        "pitcherHand-batterHand",
        "bottomTeam",
        "topTeam",
        "place",
        "startTime",
    ]
    encoded = ce.OrdinalEncoder().fit_transform(input_df[categorical_columns])
    return encoded.add_prefix("OE_")
def get_ce_features(input_df):
    """Count-encode the categorical columns.

    A fresh ``ce.CountEncoder`` is fitted on, and applied to, the same
    frame.

    Args:
        input_df: Frame containing the categorical columns listed below.

    Returns:
        The encoder's output with every column name prefixed by ``CE_``.
    """
    categorical_columns = [
        'pitcher',
        'pitcherHand',
        'batter',
        'batterHand',
        'inning',
        'b1-b2-b3',
        "pitcherHand-batterHand",
        "bottomTeam",
        "topTeam",
        "place",
        "startTime",
    ]
    encoded = ce.CountEncoder().fit_transform(input_df[categorical_columns])
    return encoded.add_prefix("CE_")
def get_tews_features(input_df, test_df):
    """Implementation of target encoding with smoothing.

    For each categorical column ``c`` and each one-hot target indicator
    ``TE_<c><i>`` (i = 1..8) the encoded value of a category is

        lambda * n_iy / n_i + (1 - lambda) * n_y / n_tr

    with ``lambda = 1 / (1 + exp(-n_i / Config.k))``, where ``n_i`` is the
    category's row count, ``n_iy`` the indicator sum inside the category,
    ``n_y`` the indicator sum overall and ``n_tr`` the number of rows.

    ``test_df`` receives encodings fitted on all of ``input_df``;
    ``input_df`` receives encodings fitted per fold on ``idx_1`` rows and
    applied to ``idx_2`` rows of each split produced by ``gkf``.

    Args:
        input_df: Frame with the columns in ``cols`` plus ``y``; the final
            merge additionally requires an ``id`` column.
        test_df: Frame with the columns in ``cols``.

    Returns:
        ``(output_df, test_df)`` with the ``TE_*`` columns merged in.

    NOTE(review): ``gkf``, ``Config`` and ``train`` are not parameters and
    are not defined in this function; they must exist in an enclosing scope.
    """
    cols = [
        'pitcher', 'pitcherHand', 'batter', 'batterHand', 'inning', 'b1-b2-b3',
        "pitcherHand-batterHand", "bottomTeam", "topTeam", "place", "startTime"
    ]
    for c in cols:
        # One indicator column per distinct y value, named TE_<c><y + 1>.
        tmp_df = pd.DataFrame({c: input_df[c], 'y': input_df["y"]})
        dum = pd.get_dummies(
            tmp_df["y"]).rename(columns=lambda x: "TE" + "_" + c + str(x + 1))
        tmp_df = pd.concat([tmp_df, dum], axis=1).drop("y", axis=1)
        # --- Encoding fitted on the full input, used for the test frame ---
        df_li = []
        # Presumably y takes the integer values 0..7 so that TE_<c>1..8 all
        # exist; a missing one would raise KeyError below -- TODO confirm.
        for i in range(1, 9):
            col = "TE" + "_" + c + str(i)
            # Starts as the plain per-category mean; every entry is then
            # overwritten with the smoothed value.
            target_mean = tmp_df.groupby(c)[col].mean()
            ni_dict = dict(tmp_df[c].value_counts())
            for key, value in zip(ni_dict.keys(), ni_dict.values()):
                lambda_ni = 1 / (1 + np.exp(-value / Config.k))
                n_iy = tmp_df[tmp_df[c] == key][col].sum()
                n_y = tmp_df[col].sum()
                n_tr = len(tmp_df)
                target_mean[key] = (lambda_ni*n_iy)/value + \
                    (1 - lambda_ni)*n_y/n_tr
            df_li.append(target_mean)
        target_df = pd.concat(df_li, axis=1)
        # Left join: categories absent from input_df get NaN encodings.
        test_df = pd.merge(test_df, target_df, on=c, how="left")
        # --- Per-fold encoding for the training rows ---
        # NOTE(review): groups come from the outer-scope name `train`, not
        # from input_df, and the already-merged test_df is passed as `y`;
        # confirm both against gkf's definition.
        cv = gkf(X=input_df,
                 y=test_df,
                 n_splits=Config.n_folds,
                 random_state=46,
                 shuffle=True,
                 group=train["gameID"])
        train_li = []
        for idx_1, idx_2 in cv:
            # reset_index() keeps the previous index as an "index" column.
            tmp_df2 = tmp_df.iloc[idx_1].reset_index()
            df_li = []
            col_list = []  # filled below but never read in this function
            for i in range(1, 9):
                col = "TE" + "_" + c + str(i)
                target_mean = tmp_df2.groupby(c)[col].mean()
                ni_dict = dict(tmp_df2[c].value_counts())
                for key, value in zip(ni_dict.keys(), ni_dict.values()):
                    lambda_ni = 1 / (1 + np.exp(-value / Config.k))
                    n_iy = tmp_df2[tmp_df2[c] == key][col].sum()
                    n_y = tmp_df2[col].sum()
                    n_tr = len(tmp_df2)
                    target_mean[key] = (lambda_ni*n_iy) / \
                        value + (1 - lambda_ni)*n_y/n_tr
                df_li.append(target_mean)
                col_list.append(col)
            target_df2 = pd.concat(df_li, axis=1)
            tmp_df3 = tmp_df.iloc[idx_2].reset_index()
            # Assuming exactly eight indicator columns, tmp_df3 has 10
            # columns ("index", c, 8 indicators), so iloc[:, 10:] keeps only
            # the columns contributed by target_df2, which the merge
            # presumably suffixes "_y" because their names clash with
            # tmp_df3's indicators.
            append_df = pd.merge(tmp_df3, target_df2, on=c).iloc[:, 10:]
            # NOTE(review): assigned by index label; this looks like it
            # relies on the default inner merge keeping every tmp_df3 row in
            # its original order -- confirm for categories missing from the
            # fitting fold.
            append_df["id__"] = tmp_df3["index"]
            # Drop the last two characters of every column name: this turns
            # "id__" into "id" and removes a two-character merge suffix.
            for i in range(len(append_df.columns)):
                append_df.rename(
                    columns={append_df.columns[i]: append_df.columns[i][:-2]},
                    inplace=True)
            train_li.append(append_df)
        merge_df = pd.concat(train_li, axis=0)
        merge_df.sort_values("id", inplace=True)
        # Joins on "id": the original index labels of input_df's rows must
        # match the values of its "id" column for this to line up.
        input_df = pd.merge(input_df, merge_df, on="id", how="left")
    output_df = input_df
    return output_df, test_df
def preprocess(train, test):
    """Assemble the full feature matrices for the train and test frames.

    Three groups of features are concatenated column-wise:

    1. Row-wise features (raw numeric, ordinal-encoded, count-encoded)
       computed on train and test stacked together, then split back by
       ``len(train)``.
    2. Per-pitcher and per-batter statistics from ``get_pit_features`` and
       ``get_bat_features``, keeping only columns not already in ``train``.
    3. Target-encoding columns from ``get_tews_features``.

    Args:
        train: Training frame.
        test: Test frame.

    Returns:
        ``(train_x, test_x)`` feature frames.
    """
    # --- 1. Row-wise features on the stacked frame ---
    combined = pd.concat([train, test]).reset_index(drop=True)
    combined = pd.concat([combined, get_gameinfo_features(combined)], axis=1)
    combined = pd.concat([combined, get_homeaway_features(combined)], axis=1)

    rowwise_builders = (
        get_numerical_raw_features,
        get_oe_features,
        get_ce_features,
    )
    rowwise = pd.concat([build(combined) for build in rowwise_builders],
                        axis=1)
    n_train = len(train)
    pre_train_x = rowwise.iloc[:n_train]
    pre_test_x = rowwise.iloc[n_train:].reset_index(drop=True)

    # --- 2. Per-pitcher / per-batter statistics ---
    # Pitcher features are computed first, then batter features.
    pit_train, pit_test = get_pit_features(train, test)
    bat_train, bat_test = get_bat_features(train, test)

    # Take from the pitcher frame only what the batter frame lacks, so the
    # shared input columns are not duplicated by the concat.
    pit_only_train = list(pit_train.columns.difference(bat_train.columns))
    stats_train = pd.concat([pit_train[pit_only_train], bat_train], axis=1)
    pit_only_test = list(pit_test.columns.difference(bat_test.columns))
    stats_test = pd.concat([pit_test[pit_only_test], bat_test], axis=1)

    # Both sides are filtered against train's columns, as before.
    new_in_train = list(stats_train.columns.difference(train.columns))
    new_in_test = list(stats_test.columns.difference(train.columns))
    train_x = pd.concat([pre_train_x, stats_train[new_in_train]], axis=1)
    test_x = pd.concat([pre_test_x, stats_test[new_in_test]], axis=1)

    # --- 3. Target-encoding features ---
    te_train, te_test = get_tews_features(
        pd.concat([train, get_gameinfo_features(train)], axis=1),
        pd.concat([test, get_gameinfo_features(test)], axis=1),
    )
    te_only_train = list(te_train.columns.difference(train.columns))
    te_only_test = list(te_test.columns.difference(test.columns))

    # The combined string keys were only inputs to the encoders; drop them
    # from the final matrices.
    helper_columns = ["b1-b2-b3", "pitcherHand-batterHand"]
    train_x = pd.concat([train_x, te_train[te_only_train]],
                        axis=1).drop(helper_columns, axis=1)
    test_x = pd.concat([test_x, te_test[te_only_test]],
                       axis=1).drop(helper_columns, axis=1)
    return train_x, test_x