LightGBM Baseline (CV: 0.560154 / LB: 0.564024)
Sharing a LightGBM baseline that combines the 'abstract' text, vectorized with gensim, with the 'doi_cites' feature. I hope you find it useful.
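Note: the script assumes the pretrained GoogleNews-vectors-negative300.bin word2vec model has already been downloaded and placed under the models/ directory (MODEL_PATH below).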
# ------------------------------------------------------------------------------
# ProbSpace: Predicting the number of citations to a paper
# ------------------------------------------------------------------------------
import pandas as pd
import numpy as np
import json
import os
import random  # used by set_seed below
from pathlib import Path
from tqdm import tqdm
import gensim
import lightgbm as lgb
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
# ------------------------------------------------------------------------------
# Start-up
# ------------------------------------------------------------------------------
NFOLDS = 5
SEED = 42
def set_seed(seed=42):
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)

set_seed(SEED)
# ------------------------------------------------------------------------------
# File and model path definition
# ------------------------------------------------------------------------------
DATA_PATH = Path("../../input/Predicting_the_number_of_citations_to_a_paper/")
FEATURE_PATH = Path("../../features/")
MODEL_PATH = Path("../../models/")
train_file = DATA_PATH / "train_data.json"
test_file = DATA_PATH / "test_data.json"
# ------------------------------------------------------------------------------
# Define gensim model
# ------------------------------------------------------------------------------
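# Pretrained Google News word2vec model (3 million words and phrases, 300
# dimensions); loading the multi-gigabyte binary takes a while and needs
# several GB of RAM.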
emb_model = gensim.models.KeyedVectors.load_word2vec_format(MODEL_PATH / "GoogleNews-vectors-negative300.bin", binary=True)
# ------------------------------------------------------------------------------
# Define data iterator
# ------------------------------------------------------------------------------
def get_data_iter(file_path):
    """Yield one raw JSON record per line (the data files are JSON Lines)."""
    with open(file_path, 'r') as f:
        for json_line in f:
            yield json_line
# ------------------------------------------------------------------------------
# Read train and test data
# ------------------------------------------------------------------------------
train = []
train_feat = []
target = []
train_iter = get_data_iter(train_file)
for line in tqdm(train_iter, desc="train", total=851_524):
    data = json.loads(line)
    if 'cites' in data:  # skip records whose 'cites' is missing (NaN)
        abstract = data['abstract']
        doi_cites = [np.log1p(int(data['doi_cites']))]
        cites = int(data['cites'])
        # Vectorize 'abstract' with gensim: mean of the word2vec word vectors
        emb_abstract = np.mean([emb_model[w] for w in abstract.split(' ') if w in emb_model], axis=0)
        train.append(emb_abstract)
        train_feat.append(doi_cites)
        target.append(cites)
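# Note: if none of an abstract's tokens appear in the word2vec vocabulary,
# np.mean over an empty list returns NaN; substituting a zero vector
# (np.zeros(300)) would be a safe fallback.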
test = []
test_feat = []
test_index = []
test_iter = get_data_iter(test_file)
for line in tqdm(test_iter, desc="test", total=59_084):
    data = json.loads(line)
    abstract = data['abstract']
    doi_cites = [np.log1p(int(data['doi_cites']))]
    emb_abstract = np.mean([emb_model[w] for w in abstract.split(' ') if w in emb_model], axis=0)
    test.append(emb_abstract)
    test_feat.append(doi_cites)
    test_index.append(data['id'])
train = np.concatenate([np.array(train), np.array(train_feat)], axis=1)
target = np.array(np.log1p(target))
test = np.concatenate([np.array(test), np.array(test_feat)], axis=1)
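# Final design matrix: a 300-dim mean word2vec embedding plus one column of
# log1p(doi_cites), i.e. 301 features. The target is log1p(cites), so
# predictions are mapped back with expm1 at submission time.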
# ------------------------------------------------------------------------------
# Check the data
# ------------------------------------------------------------------------------
print(train.shape)
print(target.shape)
print(test.shape)
# --------------------------------------
# LightGBM parameters
# --------------------------------------
lgb_params = {'objective': 'root_mean_squared_error',
              'boosting_type': 'gbdt',
              'n_estimators': 50000,
              'learning_rate': 0.001,
              'num_leaves': 64,
              'min_child_samples': 30,
              'colsample_bytree': 0.5,
              'subsample': 0.5,
              'subsample_freq': 3,
              'reg_alpha': 8,
              'reg_lambda': 2,
              'random_state': SEED
              }
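# The very low learning rate (0.001) paired with a large n_estimators (50000)
# leaves the effective number of boosting rounds to early stopping.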
# --------------------------------------
# Training and prediction
# --------------------------------------
kf = KFold(n_splits=NFOLDS, shuffle=True, random_state=SEED)
lgb_oof = np.zeros(train.shape[0])
lgb_pred = 0
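# Out-of-fold predictions give the CV score; test predictions are averaged
# over the NFOLDS fold models.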
for fold, (trn_idx, val_idx) in enumerate(kf.split(X=train)):
    X_train, y_train = train[trn_idx], target[trn_idx]
    X_valid, y_valid = train[val_idx], target[val_idx]
    X_test = test
    # LightGBM
    model = lgb.LGBMRegressor(**lgb_params)
    model.fit(X_train, y_train,
              eval_set=[(X_valid, y_valid)],
              eval_metric='rmse',
              # verbose=False / early_stopping_rounds=500 as fit() keyword
              # arguments were removed in LightGBM 4.0; the equivalent
              # callbacks below also work on 3.3+
              callbacks=[lgb.early_stopping(stopping_rounds=500, verbose=False),
                         lgb.log_evaluation(period=0)])
    lgb_oof[val_idx] = model.predict(X_valid)
    lgb_pred += model.predict(X_test) / NFOLDS
    rmsle = mean_squared_error(y_valid, lgb_oof[val_idx], squared=False)
    print(f"fold {fold} lgb score: {rmsle}")
rmsle = mean_squared_error(target, lgb_oof, squared=False)
print("+-" * 40)
print(f"score: {rmsle}")
print(f"model score: {model.score(train, target)}")
# ------------------------------------------------------------------------------
# submit the results
# ------------------------------------------------------------------------------
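# Map predictions back from log1p space; expm1 of a negative prediction falls
# in (-1, 0), so such values are clipped to 0.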
test_predicted = np.expm1(lgb_pred)
submit_df = pd.DataFrame({'id': test_index})
submit_df['cites'] = np.where(test_predicted < 0, 0, test_predicted)
submit_df.to_csv("submission.csv", index=False)