研究価値を最大化させるワーディングとは
Oregin
BIZENさんのBase lineを参考にさせていただき、'abstract'を前処理(Stopwordや句読点などの削除)を行ったサンプルコードです。ご参考までご活用ください。BIZENさんのBaseLine→https://prob.space/competitions/citation_prediction/discussions/BIZEN-Post1c5e6bf922379f4e5831
CV= 0.55577 LB= 0.557464 でした。
ディレクトリ構成
※GoogleNews-vectors-negative300.binは、こちらからダウンロード→https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit
# ------------------------------------------------------------------------------ # 各種ライブラリのインポート # ------------------------------------------------------------------------------ import pandas as pd import numpy as np import json import os import random import string import re from pathlib import Path from tqdm import tqdm import gensim import lightgbm as lgb from sklearn.model_selection import KFold from sklearn.metrics import mean_squared_error,mean_squared_log_error from nltk.corpus import stopwords from bs4 import BeautifulSoup
# Stopwordsのダウンロード import nltk nltk.download('stopwords')
[nltk_data] Downloading package stopwords to /root/nltk_data... [nltk_data] Package stopwords is already up-to-date!
True
# StopWord の再定義 stop = set(stopwords.words("english")) # 句読点の追加。string.punctuation = ['!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'] punctuation = list(string.punctuation) # 手動で追加 org_stop = ["Subject"] # stopwordsの定義更新 add_stop = punctuation + org_stop stop.update(add_stop)
# htmlの分割 def strip_html(text): soup = BeautifulSoup(text, "html.parser") return soup.get_text() # []で囲まれた文章の削除(脚注、linkなど) def remove_between_square_brackets(text): return re.sub('\[[^]]*\]', '', text) # URLの削除 def remove_between_square_brackets(text): return re.sub(r'http\S+', '', text) # stopwordsの削除 def remove_stopwords(text): final_text = [] for i in text.split(): if i.strip().lower() not in stop: if i.strip().isalpha(): final_text.append(i.strip()) return " ".join(final_text) # 上記の関数をまとめて適用する関数を定義 def denoise_text(text): text = strip_html(text) text = remove_between_square_brackets(text) text = remove_stopwords(text) return text
# ------------------------------------------------------------------------------ # 各種定数の設定 # ------------------------------------------------------------------------------ NFOLDS = 5 SEED = 42 def set_seed(seed=42): random.seed(seed) os.environ["PYTHONHASHSEED"] = str(seed) np.random.seed(seed) set_seed(SEED) # ------------------------------------------------------------------------------ # 各種パスの設定 # ------------------------------------------------------------------------------ DATA_PATH = Path("../data/") FEATURE_PATH = Path("../features/") MODEL_PATH = Path("../data/") train_file = DATA_PATH / "train_data.json" test_file = DATA_PATH / "test_data.json" # ------------------------------------------------------------------------------ # ベクトル化のモデルインスタンス作成 # ------------------------------------------------------------------------------ emb_model = gensim.models.KeyedVectors.load_word2vec_format(MODEL_PATH / "GoogleNews-vectors-negative300.bin", binary=True) # ------------------------------------------------------------------------------ # JSONを読み込むためのイテレータの定義 # ------------------------------------------------------------------------------ def get_data_iter(file_path): with open(file_path, 'r') as f: for jason_line in f: yield jason_line # ------------------------------------------------------------------------------ # 訓練データとテストデータの読み込み(特徴量作成) # ------------------------------------------------------------------------------ train = [] train_feat = [] target = [] train_iter = get_data_iter(train_file) for line in tqdm(train_iter, desc="train", total=851_524): data = json.loads(line) if 'cites' in data: # 'cites' が nan のデータは除外 # 前処理の関数の適用(追加した部分) data['abstract']=denoise_text(data['abstract']) ############################################### abstract = data['abstract'] doi_cites = [np.log1p(int(data['doi_cites']))] cites = int(data['cites']) # 'abstract' を gensim でベクトル化 emb_abstract = np.mean([emb_model[w] for w in abstract.split(' ') if w in emb_model], axis=0) train.append(emb_abstract) train_feat.append(doi_cites) target.append(cites) test = [] test_feat = [] test_index = [] test_iter = get_data_iter(test_file) for line in tqdm(test_iter, desc="test", total=59_084): data = json.loads(line) # 前処理の関数の適用(追加した部分) data['abstract']=denoise_text(data['abstract']) ############################################### abstract = data['abstract'] doi_cites = [np.log1p(int(data['doi_cites']))] emb_abstract = np.mean([emb_model[w] for w in abstract.split(' ') if w in emb_model], axis=0) test.append(emb_abstract) test_feat.append(doi_cites) test_index.append(data['id']) train = np.concatenate([np.array(train), np.array(train_feat)], axis=1) target = np.array(np.log1p(target)) test = np.concatenate([np.array(test), np.array(test_feat)], axis=1) # ------------------------------------------------------------------------------ # 各データのサイズ表示 # ------------------------------------------------------------------------------ print(train.shape) print(target.shape) print(test.shape)
train: 100%|██████████| 851524/851524 [00:25<00:00, 33582.50it/s] test: 100%|██████████| 59084/59084 [00:27<00:00, 2147.35it/s] (15117, 301) (15117,) (59084, 301)
#####################################################3 ### LGBで学習、予測する関数の定義 ######################################################## def Train_and_Pred(train,target,test): # -------------------------------------- # パラメータ定義 # -------------------------------------- lgb_params = { 'objective': 'root_mean_squared_error', 'boosting_type': 'gbdt', 'n_estimators': 50000, 'colsample_bytree': 0.5, 'subsample': 0.5, 'subsample_freq': 3, 'reg_alpha': 8, 'reg_lambda': 2, 'random_state': SEED, "bagging_fraction": 0.5520399476847848, "bagging_freq": 1, "feature_fraction": 0.4436319472771827, "lambda_l1": 0.01113869595673112, "lambda_l2": 8.706009358617911e-07, "learning_rate": 0.012307412937706345, "min_child_samples": 18, "num_leaves": 8, } # -------------------------------------- # 学習と予測 # -------------------------------------- kf = KFold(n_splits=NFOLDS, shuffle=True, random_state=SEED) lgb_oof = np.zeros(train.shape[0]) lgb_pred = 0 for fold, (trn_idx, val_idx) in enumerate(kf.split(X=train)): X_train, y_train = train[trn_idx], target[trn_idx] X_valid, y_valid = train[val_idx], target[val_idx] X_test = test # LightGBM model = lgb.LGBMRegressor(**lgb_params) model.fit(X_train, y_train, eval_set=(X_valid, y_valid), eval_metric='rmse', verbose=False, early_stopping_rounds=500 ) lgb_oof[val_idx] = model.predict(X_valid) lgb_pred += model.predict(X_test) / NFOLDS rmsle = mean_squared_error(y_valid, lgb_oof[val_idx], squared=False) print(f"fold {fold} lgb score: {rmsle}") rmsle = mean_squared_error(target, lgb_oof, squared=False) print("+-" * 40) print(f"score: {rmsle}") print(f"model score: {model.score(train, target)}") # ------------------------------------------------------------------------------ # 提出ファイルの作成 # ------------------------------------------------------------------------------ test_predicted = np.expm1(lgb_pred) submit_df = pd.DataFrame({'id': test_index}) submit_df['cites'] = np.where(test_predicted < 0, 0, test_predicted) submit_df.to_csv("submission.csv", index=False) return rmsle
#学習と予測の実行 Train_and_Pred(train,target,test)
fold 0 lgb score: 0.5704310496466114 fold 1 lgb score: 0.5395853081455577 fold 2 lgb score: 0.5702297016021272 fold 3 lgb score: 0.5493465172531439 fold 4 lgb score: 0.5485768946230044 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+- score: 0.5557739827807111 model score: 0.843415098095516
0.5557739827807111
ものすごく参考にさせていただいております。
In [4] のテキスト前処理のところで、URLを削除する関数の定義名が、一個前のカッコを消す関数と同じになっています。 結果としてカッコが消去できていないようです。
すでにお気づきかもしれませんが、一応の形でここにポストしておきます。
ご指摘ありがとうございました。 確かにその通りですね。以下のような修正が必要になりますね。 大変失礼いたしました。 def remove_between_square_brackets(text): → remove_URL(text):