kotrying
今回は自然言語処理がテーマとなるため、デファクトスタンダードであるBERTを使用したいところです。しかしBERTの学習に用いるGPUを用意するのがネックになります。(Oreginさんの紹介されているBERTモデルはGoogle Colab上で動かしていますが、現在のColabではGPU利用に課金が必要となり、以前のように簡単に計算資源の確保ができません。)
そこでBERTを用いて抽出した特徴量を学習に利用することで、GPUが無くてもBERTの恩恵を得られるような方法を考えます。具体的には事前学習済みモデルへテキストデータを入力し、出力層の前の層からEmbedding特徴を抽出します。これを様々な事前学習済みモデルについて行い、多様な特徴を抽出します。最後に抽出した全特徴量を用いて何らかの学習器で予測を行います。この方法のメリットは以下にあります。
このアイデアはKaggleの「Feedback Prize - English Language Learning」で知ったもので、本記事の内容も下記ノートブックの手法をそのまま試したものです。
RAPIDS SVR - CV 0.450 - LB 0.44x
https://www.kaggle.com/code/cdeotte/rapids-svr-cv-0-450-lb-0-44x
上記の紹介では他のBERTモデルに匹敵するほどの精度を出せており、アンサンブル時にも活用できたようです。
!nvidia-smi
Sun Feb 26 22:13:47 2023 +-----------------------------------------------------------------------------+ | NVIDIA-SMI 510.47.03 Driver Version: 510.47.03 CUDA Version: 11.6 | |-------------------------------+----------------------+----------------------+ | GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC | | Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. | | | | MIG M. | |===============================+======================+======================| | 0 Tesla T4 Off | 00000000:00:04.0 Off | 0 | | N/A 71C P8 14W / 70W | 0MiB / 15360MiB | 0% Default | | | | N/A | +-------------------------------+----------------------+----------------------+ +-----------------------------------------------------------------------------+ | Processes: | | GPU GI CI PID Type Process name GPU Memory | | ID ID Usage | |=============================================================================| | No running processes found | +-----------------------------------------------------------------------------+
%%capture
!echo "deb http://packages.cloud.google.com/apt gcsfuse-`lsb_release -c -s` main" | sudo tee /etc/apt/sources.list.d/gcsfuse.list
!curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | sudo apt-key add -
!apt-get update
!apt-get install gcsfuse
!mkdir -p /content/gcs
!gcsfuse bucket-paper-acception /content/gcs
%%capture
!pip install polars
! pip install transformers
! pip install sentencepiece
!pip install fontstyle
# ----------
# ライブラリ
# ----------
import os
import random
import numpy as np
import torch
from psutil import virtual_memory
import polars as pl
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F
from tqdm.auto import tqdm
import xgboost as xgb
import scipy.stats as stats
import lightgbm as lgbm
from sklearn.metrics import accuracy_score
import fontstyle
import warnings
warnings.simplefilter('ignore')
GPUを使用しない場合は `DEVICE = 'cpu'` に変更します。
# ----------
# Settings
# ----------
num_fold = 5  # number of cross-validation folds
seed = 0  # seed handed to set_seed() before preprocessing
DEVICE = "cuda" # "cpu" or "cuda"
tokenizer = None  # placeholder; get_embeddings() assigns the loaded tokenizer to this global
BATCH_SIZE = 16  # batch size of the embedding DataLoaders
MAX_LEN = 768  # padding/truncation length used when tokenizing
# Columns concatenated into the single text feature
txt_columns = ['title', 'keywords', 'abstract']
# ----------
# Functions
# ----------
def set_seed(seed=42):
    """Seed the Python, NumPy and PyTorch RNGs and print runtime diagnostics.

    Args:
        seed: Integer seed applied to every random number generator.

    Side effects:
        Sets ``PYTHONHASHSEED``, switches cuDNN to deterministic mode when
        CUDA is available, and prints the CUDA device name plus the RAM
        size. The RAM figure is ``virtual_memory().total`` even though the
        message says "available".
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)

    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        # Trade cuDNN autotuning speed for run-to-run reproducibility.
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False
        device_index = torch.cuda.current_device()
        print("Device:", torch.cuda.get_device_name(device_index))

    total_ram_gb = virtual_memory().total / 1e9
    print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(total_ram_gb))
def get_stratifiedkfold(train, target_col, n_splits, seed):
    """Assign every row of ``train`` to a stratified validation fold.

    Args:
        train: Frame exposing ``get_column(name).to_numpy()`` and ``len()``.
        target_col: Name of the column whose values are stratified on.
        n_splits: Number of folds.
        seed: ``random_state`` of the shuffled ``StratifiedKFold``.

    Returns:
        Float NumPy array of length ``len(train)``; element ``i`` is the
        index (0 .. n_splits-1) of the fold in which row ``i`` is validated.
        The float dtype of the original implementation is kept so callers
        see the same column type.
    """
    strat_labels = train.get_column(target_col).to_numpy()
    kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)

    # The splitter only needs X for its number of samples, so pass a cheap
    # placeholder instead of converting the whole frame (text columns and,
    # later on, hundreds of embedding columns) to a NumPy array.
    placeholder_x = np.zeros(len(strat_labels))

    fold_array = np.zeros(len(strat_labels))
    for fold, (_, idx_valid) in enumerate(kf.split(placeholder_x, strat_labels)):
        fold_array[idx_valid] = fold
    return fold_array
学習用の特徴として、title, abstract, keywordsそれぞれの単語数(空白区切りのトークン数)を追加しています。
# ----------
# Data
# ----------
# Read the train / test / sample-submission CSV files from /content/gcs.
train = pl.read_csv('/content/gcs/train.csv')
test = pl.read_csv('/content/gcs/test.csv')
sub = pl.read_csv('/content/gcs/submission.csv')
# ----------
# Preprocessing / feature generation
# ----------
set_seed(seed)


def _token_count_expr(column):
    """Build the ``num_<column>`` expression: whitespace-token count of a text column.

    Values equal (case-insensitively) to '', 'nan', '0' or 'blank' count as 0;
    anything else counts as (number of spaces + 1).
    """
    lowered = pl.col(column).str.to_lowercase()
    return (
        pl.when(lowered.is_in(['', 'nan', '0', 'blank']))
        .then(0)
        .otherwise(lowered.str.count_match(' ') + 1)
        .alias(f'num_{column}')
    )


def _text_feature_exprs():
    """Expressions shared by train and test: joined text plus per-column counts."""
    joined_text = pl.concat_str(txt_columns, sep='. ').alias('txt_feat')
    counts = [_token_count_expr(c) for c in ('title', 'abstract', 'keywords')]
    return [joined_text] + counts


# Train additionally gets a "<year>-<y>" group label, used later to stratify folds.
train = train.with_columns(
    _text_feature_exprs()
    + [pl.concat_str(['year', 'y'], sep='-').alias('group')]
)
test = test.with_columns(_text_feature_exprs())

display(train.head(3))
display(test.head(3))
Device: Tesla T4 Your runtime has 13.6 gigabytes of available RAM
id | title | year | abstract | keywords | y | txt_feat | num_title | num_abstract | num_keywords | group |
---|---|---|---|---|---|---|---|---|---|---|
i64 | str | i64 | str | str | i64 | str | u32 | u32 | u32 | str |
1 | "Hierarchical A... | 2018 | "We propose a n... | "generative, hi... | 0 | "Hierarchical A... | 4 | 155 | 7 | "2018-0" |
2 | "Learning to Co... | 2018 | "Words in natur... | "NLU, word embe... | 0 | "Learning to Co... | 8 | 130 | 5 | "2018-0" |
3 | "Graph2Seq: Sca... | 2018 | "Neural network... | "" | 0 | "Graph2Seq: Sca... | 6 | 143 | 0 | "2018-0" |
id | title | year | abstract | keywords | txt_feat | num_title | num_abstract | num_keywords |
---|---|---|---|---|---|---|---|---|
i64 | str | i64 | str | str | str | u32 | u32 | u32 |
1 | "StyleAlign: An... | 2022 | "In this paper,... | "StyleGAN, tran... | "StyleAlign: An... | 8 | 209 | 11 |
2 | "Embedding a ra... | 2021 | "We develop a t... | "Graph neural n... | "Embedding a ra... | 16 | 272 | 11 |
3 | "BBRefinement: ... | 2021 | "We present a c... | "object detecti... | "BBRefinement: ... | 11 | 152 | 6 |
# ----------
# BERT
# ----------
# Dataset
class EmbedDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes the ``txt_feat`` column row by row.

    Args:
        df: Frame supporting ``len(df)`` and ``df[row_index, 'txt_feat']``.
        tokenizer: Optional tokenizer callable. When omitted, the
            module-level ``tokenizer`` is looked up each time an item is
            requested, so it may be assigned after the dataset is built.
        max_len: Optional padding/truncation length. When omitted, the
            module-level ``MAX_LEN`` is used.

    Each item is the tokenizer's output dict with the leading batch
    dimension removed from every tensor.
    """

    def __init__(self, df, tokenizer=None, max_len=None):
        self.df = df
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        # Prefer the explicitly injected tokenizer; otherwise fall back to
        # the module-level one, resolved lazily at access time.
        active_tokenizer = self.tokenizer
        if active_tokenizer is None:
            active_tokenizer = globals().get('tokenizer')
        if active_tokenizer is None:
            raise RuntimeError(
                "No tokenizer available: pass `tokenizer=` to EmbedDataset or "
                "assign the module-level `tokenizer` before reading items."
            )
        max_length = self.max_len if self.max_len is not None else MAX_LEN

        text = self.df[idx, 'txt_feat']
        tokens = active_tokenizer(
            text,
            None,
            add_special_tokens=True,
            padding='max_length',
            truncation=True,
            max_length=max_length,
            return_tensors="pt",
        )
        # return_tensors="pt" yields tensors with a leading batch dim of 1;
        # drop it so the DataLoader's collation adds the real batch dim.
        return {k: v.squeeze(0) for k, v in tokens.items()}
# Pooling
def mean_pooling(model_output, attention_mask):
    """Average token embeddings over the positions selected by the mask.

    Args:
        model_output: Object whose ``last_hidden_state`` attribute is the
            tensor of token embeddings; it is detached and moved to CPU.
        attention_mask: Mask tensor, broadcast over the hidden dimension;
            positions with value 0 do not contribute to the average.

    Returns:
        Tensor with dimension 1 (the token axis) reduced away. The divisor
        is clamped to 1e-9 so a fully masked row yields zeros rather than NaN.
    """
    hidden = model_output.last_hidden_state.detach().cpu()
    mask = attention_mask.unsqueeze(-1).expand(hidden.size()).float()
    masked_sum = torch.sum(hidden * mask, 1)
    token_counts = torch.clamp(mask.sum(1), min=1e-9)
    return masked_sum / token_counts
def get_embeddings(MODEL_NM='', MAX_LEN=512, BATCH_SIZE=4, verbose=True):
    """Embed the train and test texts with a pretrained transformer.

    Loads the model and tokenizer named by ``MODEL_NM``, runs every batch of
    the module-level ``embed_dataloader_tr`` and ``embed_dataloader_te``
    through the model without gradients, mean-pools the last hidden state
    with the attention mask, L2-normalizes each vector, and saves both
    matrices with ``np.save``.

    Args:
        MODEL_NM: Model name or path passed to ``from_pretrained``. The part
            after the last "/" is the prefix of the saved files
            (``<prefix>_train`` / ``<prefix>_test``).
        MAX_LEN: Accepted for call compatibility but never read here; the
            sequence length is decided by the dataset behind the dataloaders.
        BATCH_SIZE: Accepted for call compatibility but never read here; the
            batch size is decided by the dataloaders built outside this
            function.
        verbose: When true, print the shape of each embedding matrix.

    Returns:
        Tuple ``(train_embeddings, test_embeddings)`` of NumPy arrays with
        one row per sample.

    Side effects:
        Rebinds the module-level ``tokenizer`` and writes two files into the
        current working directory.
    """
    # The dataset reads the module-level tokenizer lazily, so it must be
    # rebound before the dataloaders are iterated.
    global tokenizer

    model = AutoModel.from_pretrained(MODEL_NM)
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NM)
    model = model.to(DEVICE)
    model.eval()

    def _embed_all(dataloader):
        """Return the normalized sentence embeddings of every sample in order."""
        feats = []
        for batch in tqdm(dataloader, total=len(dataloader)):
            input_ids = batch["input_ids"].to(DEVICE)
            attention_mask = batch["attention_mask"].to(DEVICE)
            with torch.no_grad():
                model_output = model(input_ids=input_ids, attention_mask=attention_mask)
            sentence_embeddings = mean_pooling(model_output, attention_mask.detach().cpu())
            # Normalize the embeddings
            sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)
            # Keep the batch dimension: squeezing dim 0 would turn a final
            # batch of one sample into a flat vector, and extend() would
            # then append its scalars instead of one embedding row.
            feats.extend(sentence_embeddings.detach().cpu().numpy())
        return np.array(feats)

    # train
    all_train_text_feats = _embed_all(embed_dataloader_tr)
    if verbose:
        print('Train embeddings shape',all_train_text_feats.shape)

    # test
    te_text_feats = _embed_all(embed_dataloader_te)
    if verbose:
        print('Test embeddings shape',te_text_feats.shape)

    # save feat
    file_prefix = MODEL_NM.split('/')[-1]
    np.save(f"{file_prefix}_train", all_train_text_feats)
    np.save(f"{file_prefix}_test", te_text_feats)
    return all_train_text_feats, te_text_feats
# Wrap both frames in datasets/dataloaders. shuffle=False keeps the row order,
# so each embedding row lines up with the frame row it came from.
ds_tr = EmbedDataset(train)
embed_dataloader_tr = torch.utils.data.DataLoader(
    ds_tr, batch_size=BATCH_SIZE, shuffle=False
)
ds_te = EmbedDataset(test)
embed_dataloader_te = torch.utils.data.DataLoader(
    ds_te, batch_size=BATCH_SIZE, shuffle=False
)
ここでは事前学習済みモデルに'deberta-v3-base'を使用しました。本来は'deberta-v3-large', 'deberta-v2-xlarge'など他のモデルからも特徴を抽出します。GPUと比べてCPUではかなり時間を要しますが、実行可能です。
%%time
# Checkpoint whose embeddings are extracted.
MODEL_NM = 'microsoft/deberta-v3-base'
# get_embeddings returns the (train, test) embedding arrays.
train_emb, test_emb = get_embeddings(MODEL_NM, MAX_LEN, BATCH_SIZE)
Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2Model: ['lm_predictions.lm_head.dense.bias', 'mask_predictions.classifier.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.LayerNorm.weight', 'lm_predictions.lm_head.bias', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.dense.weight', 'mask_predictions.dense.bias', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight'] - This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). - This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained. Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
0%| | 0/311 [00:00<?, ?it/s]
Train embeddings shape (4974, 768)
0%| | 0/400 [00:00<?, ?it/s]
Test embeddings shape (6393, 768) CPU times: user 16min 24s, sys: 40 s, total: 17min 4s Wall time: 17min 27s
得られたEmbedding特徴と追加で作成した特徴から学習を行います。今回はXGBoost(LightGBM)を使用しました。
# One column name per embedding dimension: emb0, emb1, ...
emb_col = [f'emb{i}' for i in range(train_emb.shape[1])]
train_emb_df, test_emb_df = (
    pl.DataFrame(emb_matrix, schema=emb_col) for emb_matrix in (train_emb, test_emb)
)
# Append the embedding columns to the right of the existing features.
train = pl.concat([train, train_emb_df], how='horizontal')
test = pl.concat([test, test_emb_df], how='horizontal')
# Run XGBoost
# Multi-seed, stratified K-fold training on the count features plus embeddings.
# Out-of-fold and test predictions are combined by majority vote.
use_col = ['num_title', 'num_abstract', 'num_keywords'] + emb_col
test_x = test.select(use_col).to_numpy()
# Features and labels are identical for every seed and fold: convert once.
train_x = train.select(use_col).to_numpy()
train_y = train.get_column('y').to_numpy()

xgb_params = {
    'objective': 'binary:logistic',
    'n_estimators': 10000,
    'learning_rate': 0.01,
    'max_depth': 8,
    'colsample_bytree': 1.0,
    'colsample_bylevel': 0.5,
    'subsample': 0.9,
    'gamma': 0,
    'lambda': 1,
    'alpha': 0,
    'min_child_weight': 1,
    # 'gpu_hist' requires CUDA; follow the DEVICE setting so the notebook
    # also runs when DEVICE is switched to "cpu".
    'tree_method': 'gpu_hist' if DEVICE == 'cuda' else 'hist',
}

whole_va_preds = []
whole_test_preds = []
# A dedicated loop variable keeps the module-level `seed` setting intact.
for run_seed in range(3):
    print(fontstyle.apply(f'< Seed : {run_seed} >', 'BLACK/BOLD'))
    set_seed(run_seed)

    folds = get_stratifiedkfold(train, 'group', num_fold, run_seed)
    train = train.with_columns(pl.Series(folds).alias('folds'))

    oof_preds = np.zeros((len(train), ), dtype=np.float32)
    preds = []
    for fold in range(num_fold):
        is_valid = folds == fold
        tr_x, tr_y = train_x[~is_valid], train_y[~is_valid]
        va_x, va_y = train_x[is_valid], train_y[is_valid]

        # Seed the booster per run: with a constant random_state the row and
        # column subsampling would be identical across seeds, and only the
        # fold split would change.
        clf = xgb.XGBClassifier(**xgb_params, random_state=run_seed)
        clf.fit(
            tr_x, tr_y,
            eval_set=[(va_x, va_y)],
            early_stopping_rounds=100,
            verbose=100)

        va_preds_p = clf.predict_proba(va_x)[:, 1]
        oof_preds[is_valid] = va_preds_p
        va_preds = (va_preds_p > 0.5).astype(int)
        score = accuracy_score(va_y, va_preds)
        print(f'Fold : {fold+1} Accuracy score: {score}')
        print()

        test_preds_p = clf.predict_proba(test_x)[:, 1]
        preds.append(test_preds_p)

    score_s = accuracy_score(train_y, oof_preds > 0.5)
    print(fontstyle.apply(f'Seed{run_seed} Accuracy score : {score_s}', 'BLACK/BOLD'))
    print()
    whole_va_preds.append(oof_preds)
    whole_test_preds.append(preds)

# Alternative kept from the original (soft voting: average probabilities, then threshold):
# preds_va_p = np.mean(whole_va_preds, axis=0)
# whole_score = accuracy_score(train.select('y').to_numpy(), preds_va_p > 0.5)
# preds_test = (np.mean(np.mean(whole_test_preds, axis=0), axis=0) > 0.5).astype(int)

# Hard voting: threshold each model's probabilities, then take the per-sample mode.
preds_va = np.array([np.where(p > 0.5, 1, 0) for p in whole_va_preds])
whole_score = accuracy_score(train_y, stats.mode(preds_va, axis=0).mode.flatten())

# Flatten (seed, fold, sample) into (seed * fold, sample) so every fold model votes.
test_preds_array = np.array(whole_test_preds)
test_preds_array = test_preds_array.reshape(test_preds_array.shape[0]*test_preds_array.shape[1], -1)
preds_test = np.array([np.where(p > 0.5, 1, 0) for p in test_preds_array])
preds_test = stats.mode(preds_test, axis=0).mode.flatten()

print()
print(fontstyle.apply(f'whole Accuracy score: {whole_score}', 'BLACK/BOLD'))
print()
display(pl.Series(preds_test).value_counts())
< Seed : 0 > Device: Tesla T4 Your runtime has 13.6 gigabytes of available RAM [0] validation_0-error:0.350754 Will train until validation_0-error hasn't improved in 100 rounds. [100] validation_0-error:0.296482 [200] validation_0-error:0.295477 Stopping. Best iteration: [115] validation_0-error:0.294472 Fold : 1 Accuracy score: 0.7055276381909548 [0] validation_0-error:0.384925 Will train until validation_0-error hasn't improved in 100 rounds. [100] validation_0-error:0.311558 Stopping. Best iteration: [38] validation_0-error:0.303518 Fold : 2 Accuracy score: 0.6964824120603015 [0] validation_0-error:0.377889 Will train until validation_0-error hasn't improved in 100 rounds. [100] validation_0-error:0.298492 Stopping. Best iteration: [23] validation_0-error:0.288442 Fold : 3 Accuracy score: 0.7115577889447237 [0] validation_0-error:0.359799 Will train until validation_0-error hasn't improved in 100 rounds. [100] validation_0-error:0.293467 Stopping. Best iteration: [38] validation_0-error:0.287437 Fold : 4 Accuracy score: 0.7125628140703517 [0] validation_0-error:0.360161 Will train until validation_0-error hasn't improved in 100 rounds. [100] validation_0-error:0.296781 Stopping. Best iteration: [70] validation_0-error:0.285714 Fold : 5 Accuracy score: 0.7142857142857143 Seed0 Accuracy score : 0.7080820265379976 < Seed : 1 > Device: Tesla T4 Your runtime has 13.6 gigabytes of available RAM [0] validation_0-error:0.366834 Will train until validation_0-error hasn't improved in 100 rounds. [100] validation_0-error:0.303518 Stopping. Best iteration: [22] validation_0-error:0.291457 Fold : 1 Accuracy score: 0.7085427135678392 [0] validation_0-error:0.39397 Will train until validation_0-error hasn't improved in 100 rounds. [100] validation_0-error:0.301508 Stopping. Best iteration: [37] validation_0-error:0.295477 Fold : 2 Accuracy score: 0.7045226130653266 [0] validation_0-error:0.356784 Will train until validation_0-error hasn't improved in 100 rounds. 
[100] validation_0-error:0.306533 Stopping. Best iteration: [42] validation_0-error:0.299497 Fold : 3 Accuracy score: 0.700502512562814 [0] validation_0-error:0.371859 Will train until validation_0-error hasn't improved in 100 rounds. [100] validation_0-error:0.288442 Stopping. Best iteration: [26] validation_0-error:0.280402 Fold : 4 Accuracy score: 0.7195979899497488 [0] validation_0-error:0.363179 Will train until validation_0-error hasn't improved in 100 rounds. [100] validation_0-error:0.300805 Stopping. Best iteration: [88] validation_0-error:0.295775 Fold : 5 Accuracy score: 0.704225352112676 Seed1 Accuracy score : 0.7074788902291917 < Seed : 2 > Device: Tesla T4 Your runtime has 13.6 gigabytes of available RAM [0] validation_0-error:0.366834 Will train until validation_0-error hasn't improved in 100 rounds. [100] validation_0-error:0.292462 [200] validation_0-error:0.300503 Stopping. Best iteration: [104] validation_0-error:0.290452 Fold : 1 Accuracy score: 0.7095477386934673 [0] validation_0-error:0.354774 Will train until validation_0-error hasn't improved in 100 rounds. [100] validation_0-error:0.307538 Stopping. Best iteration: [22] validation_0-error:0.302513 Fold : 2 Accuracy score: 0.6974874371859296 [0] validation_0-error:0.378894 Will train until validation_0-error hasn't improved in 100 rounds. [100] validation_0-error:0.302513 Stopping. Best iteration: [52] validation_0-error:0.295477 Fold : 3 Accuracy score: 0.7045226130653266 [0] validation_0-error:0.369849 Will train until validation_0-error hasn't improved in 100 rounds. [100] validation_0-error:0.294472 [200] validation_0-error:0.289447 [300] validation_0-error:0.287437 [400] validation_0-error:0.281407 Stopping. Best iteration: [384] validation_0-error:0.280402 Fold : 4 Accuracy score: 0.7195979899497488 [0] validation_0-error:0.366197 Will train until validation_0-error hasn't improved in 100 rounds. [100] validation_0-error:0.297787 Stopping. 
Best iteration: [11] validation_0-error:0.295775 Fold : 5 Accuracy score: 0.704225352112676 Seed2 Accuracy score : 0.7070767993566546 whole Accuracy score: 0.7143144350623241
counts | |
---|---|
i64 | u32 |
0 | 6132 |
1 | 261 |
# # Run LightGBM
# use_col = ['num_title', 'num_abstract', 'num_keywords'] + emb_col
# test_x = test.select(use_col).to_numpy()
# whole_va_preds = []
# whole_test_preds = []
# for seed in range(3):
# print(fontstyle.apply(f'< Seed : {seed} >', 'BLACK/BOLD'))
# set_seed(seed)
# train = train.with_columns(
# pl.Series(get_stratifiedkfold(train, 'group', num_fold, seed))
# .alias('folds')
# )
# oof_preds = np.zeros((len(train), ), dtype=np.float32)
# preds = []
# for fold in range(num_fold):
# tr_x = train.filter(pl.col('folds')!=fold).select(use_col).to_numpy()
# tr_y = train.filter(pl.col('folds')!=fold).select('y').to_numpy()
# va_x = train.filter(pl.col('folds')==fold).select(use_col).to_numpy()
# va_y = train.filter(pl.col('folds')==fold).select('y').to_numpy()
# params = {
# 'n_estimators' : 10000,
# 'learning_rate': 0.01,
# 'random_seed': seed,
# }
# clf = lgbm.LGBMClassifier(**params)
# clf.fit(
# tr_x, tr_y,
# eval_set=[(va_x, va_y)],
# early_stopping_rounds=100,
# verbose=100)
# va_preds_p = clf.predict_proba(va_x)[:, 1]
# oof_preds[
# train.select(
# pl.when(pl.col('folds')==fold).then(True).otherwise(False)
# ).to_numpy().reshape(-1)
# ] = va_preds_p
# va_preds = (va_preds_p > 0.5).astype(int)
# score = accuracy_score(va_y, va_preds)
# print(f'Fold : {fold+1} Accuracy score: {score}')
# print()
# test_preds_p = clf.predict_proba(test_x)[:, 1]
# preds.append(test_preds_p)
# score_s = accuracy_score(train.select('y').to_numpy(), oof_preds > 0.5)
# print(fontstyle.apply(f'Seed{seed} Accuracy score : {score_s}', 'BLACK/BOLD'))
# print()
# whole_va_preds.append(oof_preds)
# whole_test_preds.append(preds)
# # preds_va_p = np.mean(whole_va_preds, axis=0)
# # whole_score = accuracy_score(train.select('y').to_numpy(), preds_va_p > 0.5)
# # preds_test = (np.mean(np.mean(whole_test_preds, axis=0), axis=0) > 0.5).astype(int)
# preds_va = np.array([np.where(preds > 0.5, 1, 0) for preds in whole_va_preds])
# whole_score = accuracy_score(train.select('y').to_numpy(), stats.mode(preds_va, axis=0).mode.flatten())
# test_preds_array = np.array(whole_test_preds)
# test_preds_array = test_preds_array.reshape(test_preds_array.shape[0]*test_preds_array.shape[1], -1)
# preds_test = np.array([np.where(preds > 0.5, 1, 0) for preds in test_preds_array])
# preds_test = stats.mode(preds_test, axis=0).mode.flatten()
# print()
# print(fontstyle.apply(f'whole Accuracy score: {whole_score}', 'BLACK/BOLD'))
# print()
# display(pl.Series(preds_test).value_counts())