train: 100%|██████████| 851524/851524 [00:25<00:00, 33582.50it/s]
test: 100%|██████████| 59084/59084 [00:27<00:00, 2147.35it/s]
(15117, 301)
(15117,)
(59084, 301)
########################################################
### Definition of the function that trains and predicts with LGB (LightGBM)
########################################################
def Train_and_Pred(train, target, test):
    """Train LightGBM with K-fold CV, write a submission CSV, return the OOF RMSE.

    For each fold an ``LGBMRegressor`` is fitted on the training split with
    early stopping against the validation split. Out-of-fold predictions are
    collected for scoring and the test predictions of all folds are averaged.

    Relies on module-level names defined elsewhere in the file: ``SEED``,
    ``NFOLDS``, ``test_index``, ``KFold``, ``lgb``, ``mean_squared_error``,
    ``np`` and ``pd``.

    Args:
        train: Training feature matrix. Rows are selected with
            ``train[idx]`` using integer index arrays, so this is presumably
            a NumPy array -- confirm against the caller.
        target: Training target, indexed the same way as ``train``.
            Presumably ``log1p``-transformed, since predictions are mapped
            back with ``np.expm1`` before being written -- confirm.
        test: Test feature matrix passed to ``model.predict``.

    Returns:
        The root mean squared error of the out-of-fold predictions against
        ``target``, computed over all rows.

    Side effects:
        Prints per-fold and overall scores and writes ``submission.csv``
        (columns ``id`` and ``cites``) to the current working directory.
    """
    # --------------------------------------
    # Parameter definition
    # --------------------------------------
    # NOTE(review): several entries below are documented LightGBM aliases of
    # one another (colsample_bytree/feature_fraction, subsample/bagging_fraction,
    # subsample_freq/bagging_freq, reg_alpha/lambda_l1, reg_lambda/lambda_l2)
    # and carry different values. LightGBM only honours one value per
    # parameter; confirm which set is intended and drop the other.
    lgb_params = {
        'objective': 'root_mean_squared_error',
        'boosting_type': 'gbdt',
        'n_estimators': 50000,
        'colsample_bytree': 0.5,
        'subsample': 0.5,
        'subsample_freq': 3,
        'reg_alpha': 8,
        'reg_lambda': 2,
        'random_state': SEED,
        "bagging_fraction": 0.5520399476847848,
        "bagging_freq": 1,
        "feature_fraction": 0.4436319472771827,
        "lambda_l1": 0.01113869595673112,
        "lambda_l2": 8.706009358617911e-07,
        "learning_rate": 0.012307412937706345,
        "min_child_samples": 18,
        "num_leaves": 8,
    }

    # --------------------------------------
    # Training and prediction
    # --------------------------------------
    kf = KFold(n_splits=NFOLDS, shuffle=True, random_state=SEED)
    lgb_oof = np.zeros(train.shape[0])
    lgb_pred = np.zeros(test.shape[0])

    for fold, (trn_idx, val_idx) in enumerate(kf.split(X=train)):
        X_train, y_train = train[trn_idx], target[trn_idx]
        X_valid, y_valid = train[val_idx], target[val_idx]

        # LightGBM
        # NOTE(review): the `verbose` and `early_stopping_rounds` fit keywords
        # were removed in LightGBM 4.0; when upgrading, pass
        # `callbacks=[lgb.early_stopping(500), lgb.log_evaluation(0)]` instead.
        model = lgb.LGBMRegressor(**lgb_params)
        model.fit(
            X_train, y_train,
            eval_set=(X_valid, y_valid),
            eval_metric='rmse',
            verbose=False,
            early_stopping_rounds=500,
        )

        lgb_oof[val_idx] = model.predict(X_valid)
        # Average the test predictions over the folds.
        lgb_pred += model.predict(test) / NFOLDS

        # np.sqrt(MSE) rather than `squared=False`: that keyword was
        # deprecated in scikit-learn 1.4 and removed in 1.6.
        fold_rmse = np.sqrt(mean_squared_error(y_valid, lgb_oof[val_idx]))
        print(f"fold {fold} lgb score: {fold_rmse}")

    rmsle = np.sqrt(mean_squared_error(target, lgb_oof))
    print("+-" * 40)
    print(f"score: {rmsle}")
    # This scores only the model from the last fold, on the full training set
    # (which includes the rows it was fitted on), so it is not a CV estimate.
    print(f"model score: {model.score(train, target)}")

    # ------------------------------------------------------------------------------
    # Create the submission file
    # ------------------------------------------------------------------------------
    test_predicted = np.expm1(lgb_pred)
    submit_df = pd.DataFrame({'id': test_index})
    # Negative values are replaced by zero before writing.
    submit_df['cites'] = np.clip(test_predicted, 0, None)
    submit_df.to_csv("submission.csv", index=False)
    return rmsle