Oregin
BIZENさんのBase lineを参考にさせていただき、'abstract'を前処理(Stopwordや句読点などの削除)を行ったサンプルコードです。ご参考までご活用ください。
BIZENさんのBaseLine→https://prob.space/competitions/citation_prediction/discussions/BIZEN-Post1c5e6bf922379f4e5831
CV= 0.55577 LB= 0.557464 でした。
ディレクトリ構成
※GoogleNews-vectors-negative300.binは、こちらからダウンロード→https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit
# ------------------------------------------------------------------------------
# 各種ライブラリのインポート
# ------------------------------------------------------------------------------
import pandas as pd
import numpy as np
import json
import os
import random
import string
import re
from pathlib import Path
from tqdm import tqdm
import gensim
import lightgbm as lgb
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error,mean_squared_log_error
from nltk.corpus import stopwords
from bs4 import BeautifulSoup
# Stopwordsのダウンロード
import nltk
nltk.download('stopwords')
[nltk_data] Downloading package stopwords to /root/nltk_data... [nltk_data] Package stopwords is already up-to-date!
True
# StopWord の再定義
stop = set(stopwords.words("english"))
# 句読点の追加。string.punctuation = ['!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~']
punctuation = list(string.punctuation)
# 手動で追加
org_stop = ["Subject"]
# stopwordsの定義更新
add_stop = punctuation + org_stop
stop.update(add_stop)
# htmlの分割
def strip_html(text):
soup = BeautifulSoup(text, "html.parser")
return soup.get_text()
# []で囲まれた文章の削除(脚注、linkなど)
def remove_between_square_brackets(text):
return re.sub('\[[^]]*\]', '', text)
# URLの削除
def remove_between_square_brackets(text):
return re.sub(r'http\S+', '', text)
# stopwordsの削除
def remove_stopwords(text):
final_text = []
for i in text.split():
if i.strip().lower() not in stop:
if i.strip().isalpha():
final_text.append(i.strip())
return " ".join(final_text)
# 上記の関数をまとめて適用する関数を定義
def denoise_text(text):
text = strip_html(text)
text = remove_between_square_brackets(text)
text = remove_stopwords(text)
return text
# ------------------------------------------------------------------------------
# 各種定数の設定
# ------------------------------------------------------------------------------
NFOLDS = 5
SEED = 42
def set_seed(seed=42):
random.seed(seed)
os.environ["PYTHONHASHSEED"] = str(seed)
np.random.seed(seed)
set_seed(SEED)
# ------------------------------------------------------------------------------
# 各種パスの設定
# ------------------------------------------------------------------------------
DATA_PATH = Path("../data/")
FEATURE_PATH = Path("../features/")
MODEL_PATH = Path("../data/")
train_file = DATA_PATH / "train_data.json"
test_file = DATA_PATH / "test_data.json"
# ------------------------------------------------------------------------------
# ベクトル化のモデルインスタンス作成
# ------------------------------------------------------------------------------
emb_model = gensim.models.KeyedVectors.load_word2vec_format(MODEL_PATH / "GoogleNews-vectors-negative300.bin", binary=True)
# ------------------------------------------------------------------------------
# JSONを読み込むためのイテレータの定義
# ------------------------------------------------------------------------------
def get_data_iter(file_path):
with open(file_path, 'r') as f:
for jason_line in f:
yield jason_line
# ------------------------------------------------------------------------------
# 訓練データとテストデータの読み込み(特徴量作成)
# ------------------------------------------------------------------------------
train = []
train_feat = []
target = []
train_iter = get_data_iter(train_file)
for line in tqdm(train_iter, desc="train", total=851_524):
data = json.loads(line)
if 'cites' in data: # 'cites' が nan のデータは除外
# 前処理の関数の適用(追加した部分)
data['abstract']=denoise_text(data['abstract'])
###############################################
abstract = data['abstract']
doi_cites = [np.log1p(int(data['doi_cites']))]
cites = int(data['cites'])
# 'abstract' を gensim でベクトル化
emb_abstract = np.mean([emb_model[w] for w in abstract.split(' ') if w in emb_model], axis=0)
train.append(emb_abstract)
train_feat.append(doi_cites)
target.append(cites)
test = []
test_feat = []
test_index = []
test_iter = get_data_iter(test_file)
for line in tqdm(test_iter, desc="test", total=59_084):
data = json.loads(line)
# 前処理の関数の適用(追加した部分)
data['abstract']=denoise_text(data['abstract'])
###############################################
abstract = data['abstract']
doi_cites = [np.log1p(int(data['doi_cites']))]
emb_abstract = np.mean([emb_model[w] for w in abstract.split(' ') if w in emb_model], axis=0)
test.append(emb_abstract)
test_feat.append(doi_cites)
test_index.append(data['id'])
train = np.concatenate([np.array(train), np.array(train_feat)], axis=1)
target = np.array(np.log1p(target))
test = np.concatenate([np.array(test), np.array(test_feat)], axis=1)
# ------------------------------------------------------------------------------
# 各データのサイズ表示
# ------------------------------------------------------------------------------
print(train.shape)
print(target.shape)
print(test.shape)
train: 100%|██████████| 851524/851524 [00:25<00:00, 33582.50it/s] test: 100%|██████████| 59084/59084 [00:27<00:00, 2147.35it/s] (15117, 301) (15117,) (59084, 301)
#####################################################3
### LGBで学習、予測する関数の定義
########################################################
def Train_and_Pred(train,target,test):
# --------------------------------------
# パラメータ定義
# --------------------------------------
lgb_params = {
'objective': 'root_mean_squared_error',
'boosting_type': 'gbdt',
'n_estimators': 50000,
'colsample_bytree': 0.5,
'subsample': 0.5,
'subsample_freq': 3,
'reg_alpha': 8,
'reg_lambda': 2,
'random_state': SEED,
"bagging_fraction": 0.5520399476847848,
"bagging_freq": 1,
"feature_fraction": 0.4436319472771827,
"lambda_l1": 0.01113869595673112,
"lambda_l2": 8.706009358617911e-07,
"learning_rate": 0.012307412937706345,
"min_child_samples": 18,
"num_leaves": 8,
}
# --------------------------------------
# 学習と予測
# --------------------------------------
kf = KFold(n_splits=NFOLDS, shuffle=True, random_state=SEED)
lgb_oof = np.zeros(train.shape[0])
lgb_pred = 0
for fold, (trn_idx, val_idx) in enumerate(kf.split(X=train)):
X_train, y_train = train[trn_idx], target[trn_idx]
X_valid, y_valid = train[val_idx], target[val_idx]
X_test = test
# LightGBM
model = lgb.LGBMRegressor(**lgb_params)
model.fit(X_train, y_train,
eval_set=(X_valid, y_valid),
eval_metric='rmse',
verbose=False,
early_stopping_rounds=500
)
lgb_oof[val_idx] = model.predict(X_valid)
lgb_pred += model.predict(X_test) / NFOLDS
rmsle = mean_squared_error(y_valid, lgb_oof[val_idx], squared=False)
print(f"fold {fold} lgb score: {rmsle}")
rmsle = mean_squared_error(target, lgb_oof, squared=False)
print("+-" * 40)
print(f"score: {rmsle}")
print(f"model score: {model.score(train, target)}")
# ------------------------------------------------------------------------------
# 提出ファイルの作成
# ------------------------------------------------------------------------------
test_predicted = np.expm1(lgb_pred)
submit_df = pd.DataFrame({'id': test_index})
submit_df['cites'] = np.where(test_predicted < 0, 0, test_predicted)
submit_df.to_csv("submission.csv", index=False)
return rmsle
#学習と予測の実行
Train_and_Pred(train,target,test)
fold 0 lgb score: 0.5704310496466114 fold 1 lgb score: 0.5395853081455577 fold 2 lgb score: 0.5702297016021272 fold 3 lgb score: 0.5493465172531439 fold 4 lgb score: 0.5485768946230044 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+- score: 0.5557739827807111 model score: 0.843415098095516
0.5557739827807111
baukmilz
ものすごく参考にさせていただいております。
In [4] のテキスト前処理のところで、URLを削除する関数の定義名が、一個前のカッコを消す関数と同じになっています。 結果としてカッコが消去できていないようです。
すでにお気づきかもしれませんが、一応の形でここにポストしておきます。