[LB: 0.600671] doi_citesだけ利用してLightGBMでモデリング
trainおよびtestの全てのサンプルに対してdoi_citesが与えられています。
doi_citesは目的変数のcites予測に大いに貢献すると考えられるので、doi_citesだけでcitesを予測した時の精度を確認します。
# Load the competition data; each file is JSON Lines (one record per line).
df_train = pd.read_json(
    "/home/shogosu/probspace_article_cite/train_data.json",
    lines=True,
)
df_test = pd.read_json(
    "/home/shogosu/probspace_article_cite/test_data.json",
    lines=True,
)

# Keep only the training rows whose target `cites` is present.
df_train_exists_cites = df_train[df_train["cites"].notna()]
def target_stratify_kfold(df, n_bins=500, folds=5, seed=2021):
    """Assign every row of ``df`` to a validation fold, stratified on ``cites``.

    The continuous ``cites`` column is discretised with ``pd.cut`` into
    ``n_bins`` bins, and the bin ids are used as the class labels for a
    shuffled ``StratifiedKFold``.

    Args:
        df: DataFrame exposing a ``cites`` column.
        n_bins: Number of bins passed to ``pd.cut``.
        folds: Number of folds to create.
        seed: Random state for the shuffled split.

    Returns:
        Integer ndarray of length ``len(df)``; element ``i`` is the id of the
        fold in which row ``i`` (by position) is held out for validation.
    """
    binned = pd.cut(df.cites, n_bins, labels=False)
    assignment = np.zeros(len(binned), dtype=int)
    splitter = StratifiedKFold(n_splits=folds, shuffle=True, random_state=seed)
    # Only the held-out positions matter; the training side is ignored.
    for fold_no, (_, holdout) in enumerate(splitter.split(binned, binned)):
        assignment[holdout] = fold_no
    return assignment
# Per-row validation fold ids for the labelled training rows.
fold_idx = target_stratify_kfold(df_train_exists_cites)

# Single feature: doi_cites. The target is modelled on the log1p scale.
train_x = df_train_exists_cites["doi_cites"]
train_y = np.log1p(df_train_exists_cites["cites"])
test_x = df_test["doi_cites"]
class FoldsAverageLGBM:
    """Train one LightGBM booster per fold and average their predictions."""

    def __init__(self, folds_idx, n_folds):
        """Store the fold assignment used by ``fit``.

        Args:
            folds_idx: Array-like of per-row fold ids; row ``i`` is held out
                for validation in fold ``folds_idx[i]``.
            n_folds: Number of folds to train (fold ids ``0 .. n_folds - 1``).
        """
        # Stored as an ndarray so that `folds_idx == i` is element-wise even
        # when a plain list is passed in.
        self.folds_idx = np.asarray(folds_idx)
        self.n_folds = n_folds
        # One trained booster per fold, filled by fit().
        self.models = []

    def fit(self, lgb_params, train_x, train_y):
        """Train one booster per fold and collect out-of-fold predictions.

        Sets ``self.train_x``, ``self.train_y``, ``self.models`` and
        ``self.oof_preds``. Calling ``fit`` again discards the boosters of the
        previous call.

        Args:
            lgb_params: Parameter dict forwarded to ``lgb.train``.
            train_x: Features, either 1-D (a single feature, e.g. a Series)
                or 2-D ``(n_samples, n_features)``.
            train_y: Targets, 1-D with one value per row of ``train_x``.

        Raises:
            ValueError: If ``folds_idx``, ``train_x`` and ``train_y`` do not
                all have the same number of rows.
        """
        features = np.asarray(train_x)
        if features.ndim == 1:
            # A single feature column: LightGBM expects a 2-D matrix.
            features = features.reshape(-1, 1)
        targets = np.asarray(train_y)
        if not (len(self.folds_idx) == len(features) == len(targets)):
            raise ValueError(
                "folds_idx, train_x and train_y must have the same length "
                f"(got {len(self.folds_idx)}, {len(features)}, {len(targets)})"
            )

        self.train_x = features
        self.train_y = targets
        # Start from a clean slate so a re-fit does not average stale models.
        self.models = []
        # Always float: an integer target must not truncate the predictions.
        oof_preds = np.zeros(len(targets), dtype=np.float64)

        for i_fold in range(self.n_folds):
            is_valid = self.folds_idx == i_fold
            tr_idx = np.flatnonzero(~is_valid)
            va_idx = np.flatnonzero(is_valid)
            tr_x, va_x = features[tr_idx], features[va_idx]
            tr_y, va_y = targets[tr_idx], targets[va_idx]

            lgb_train_dataset = lgb.Dataset(tr_x, tr_y)
            lgb_valid_dataset = lgb.Dataset(va_x, va_y)
            # NOTE(review): `verbose_eval` is a LightGBM 3.x argument; newer
            # releases expect the `lgb.log_evaluation` callback instead.
            # Confirm against the installed version before upgrading.
            model = lgb.train(
                lgb_params,
                lgb_train_dataset,
                valid_sets=[lgb_valid_dataset],
                verbose_eval=10,
            )
            self.models.append(model)
            oof_preds[va_idx] = model.predict(va_x)

        self.oof_preds = oof_preds

    def predict(self, test_x):
        """Return the element-wise mean of all fold models' predictions.

        Args:
            test_x: Feature matrix passed unchanged to each model's
                ``predict``.

        Returns:
            ndarray holding the average prediction over ``self.models``.

        Raises:
            RuntimeError: If no model has been trained yet.
        """
        if not self.models:
            raise RuntimeError("fit() must be called before predict()")
        return np.mean([model.predict(test_x) for model in self.models], axis=0)
# LightGBM settings: RMSE on the log1p target, with early stopping.
lgb_params = {
    "seed": 2021,
    "objective": "rmse",
    "verbosity": 1,
    "learning_rate": 0.01,
    "num_iterations": 1000,
    "early_stopping_round": 100,
}

# One booster per fold, trained on the 5 stratified folds.
folds_average_lgbm = FoldsAverageLGBM(folds_idx=fold_idx, n_folds=5)
folds_average_lgbm.fit(
    lgb_params=lgb_params,
    train_x=train_x,
    train_y=train_y,
)

# Out-of-fold RMSE on the log1p scale, i.e. RMSLE of `cites`.
np.sqrt(
    mean_squared_error(train_y, folds_average_lgbm.oof_preds)
)
> 0.596959085291851
# Average the fold models on the test feature, then undo the log1p transform.
ys_pred = np.expm1(
    folds_average_lgbm.predict(test_x.values.astype(np.float32)[:, None])
)

# Submission frame: test ids alongside the predicted citation counts.
df_sub = pd.DataFrame({"id": df_test["id"], "cites": ys_pred})
| 5-fold OOF RMSLE | Public LB RMSLE |
| --- | --- |
| 0.596959085291851 | 0.600671 |