yshr10ic
# Mount Google Drive into the Colab runtime at /content/drive so that the
# notebook can read and write files stored on Drive.
from google.colab import drive
drive.mount('/content/drive')
Mounted at /content/drive
# Install the required libraries (quiet mode, stdout discarded).
!pip install -q transformers > /dev/null
# Change the current working directory to the competition folder on Drive and
# print it for confirmation; Config.root_path ('./') is relative to this.
import os
os.chdir('/content/drive/My Drive/Colab Notebooks/probspace/kiva/')
print(os.getcwd())
/content/drive/My Drive/Colab Notebooks/probspace/kiva
class Config:
    """Notebook-wide settings shared by every cell."""

    # Base directory; relative, so it depends on the current working directory.
    root_path = './'
    # Folder the CSV inputs are read from.
    input_path = os.path.join(root_path, 'data')
    # Folder the submission file is written to.
    output_path = os.path.join(root_path, 'output')
    # Checkpoint name handed to the BERT vectorizer.
    bert_model_name = 'bert-base-uncased'
    # Seed used for the RNGs, the K-fold split and the model.
    seed = 42
    # When True, the train/test frames are cut to their first 1000 rows.
    debug = False
# Create the output directories up front (no error if they already exist).
# The loop variable is deliberately not called ``dir``: at notebook top level
# that name would shadow the ``dir`` builtin for every later cell.
for dir_path in [Config.output_path]:
    os.makedirs(dir_path, exist_ok=True)
import datetime
import itertools
import matplotlib.pylab as plt
import numpy as np
import pandas as pd
import random
import pickle
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import KFold
import torch
from tqdm import tqdm
# NLP
from sklearn.decomposition import PCA
import transformers
import ssl
# NOTE(review): this swaps the default HTTPS context factory for the
# *unverified* one, so code relying on ssl's default HTTPS context stops
# validating server certificates for the rest of the session. Presumably
# added to work around a download failure; confirm it is still needed and
# prefer fixing the certificate store instead.
ssl._create_default_https_context = ssl._create_unverified_context
# Model
import lightgbm as lgb
# Show every column when displaying DataFrames. The fully-qualified option
# name is used because the abbreviated 'max_columns' pattern matches more than
# one option on newer pandas releases and raises OptionError there.
pd.set_option('display.max_columns', None)
# Render floats with five decimal places.
pd.options.display.float_format = '{:.5f}'.format
[nltk_data] Downloading package stopwords to /root/nltk_data... [nltk_data] Unzipping corpora/stopwords.zip.
def seed_everything(seed=2021):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and current CUDA
    device), exports ``PYTHONHASHSEED`` and switches cuDNN to its
    deterministic, non-benchmarking mode.

    Args:
        seed: Integer seed applied to all generators.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)

    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False
# Fix all random seeds before anything stochastic runs.
seed_everything(Config.seed)

# Load the training data; in debug mode keep only the first 1000 rows.
train_csv_path = os.path.join(Config.input_path, 'train.csv')
train_df = pd.read_csv(train_csv_path)
if Config.debug:
    train_df = train_df.iloc[:1000]

print(train_df.shape)
display(train_df)
(91333, 18)
LOAN_ID | ORIGINAL_LANGUAGE | DESCRIPTION | DESCRIPTION_TRANSLATED | LOAN_AMOUNT | IMAGE_ID | ACTIVITY_NAME | SECTOR_NAME | LOAN_USE | COUNTRY_CODE | COUNTRY_NAME | TOWN_NAME | CURRENCY_POLICY | CURRENCY_EXCHANGE_COVERAGE_RATE | CURRENCY | TAGS | REPAYMENT_INTERVAL | DISTRIBUTION_MODEL | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1733169 | English | Teodora is a 50-year-old married woman from th... | Teodora is a 50-year-old married woman from th... | 100 | 3115271 | Weaving | Arts | to purchase materials like nipa palm, bamboo ... | PH | Philippines | Maribojoc, Bohol | shared | 0.10000 | PHP | #Elderly | monthly | field_partner |
1 | 1546998 | English | Diego is 32 years old and lives in the municip... | Diego is 32 years old and lives in the municip... | 1350 | 2870403 | Barber Shop | Services | to buy two hair clippers, a new barber chair, ... | CO | Colombia | Apartadó | shared | 0.10000 | COP | user_favorite, user_favorite | monthly | field_partner |
2 | 1808517 | Spanish | Osman, es un joven de 27 años de edad, soltero... | Osman is a young man, 27 years old, single, an... | 225 | 3215705 | Farming | Agriculture | to purchase sacks of fertilizers to care for a... | HN | Honduras | Nueva Frontera, Santa Barbara. | shared | 0.10000 | HNL | NaN | bullet | field_partner |
3 | 1452940 | English | His name is Nino, 31 years old, married to Che... | His name is Nino, 31 years old, married to Che... | 350 | 2745031 | Motorcycle Transport | Transportation | to pay for fuel, tires and change oil for his ... | PH | Philippines | Silang, Cavite | shared | 0.10000 | PHP | user_favorite | monthly | field_partner |
4 | 1778420 | English | Pictured above is Teresa, often described as a... | Pictured above is Teresa, often described as a... | 625 | 3083800 | Farming | Agriculture | to purchase hybrid seeds and fertilizer to imp... | KE | Kenya | Mumias | shared | 0.10000 | KES | #Eco-friendly, #Sustainable Ag, #Parent, #Elde... | bullet | field_partner |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
91328 | 1688789 | Spanish | Rider tiene 20 años de edad, vive en San Javie... | Rider is 20 years old. He lives in San Javier,... | 775 | 3054018 | Poultry | Agriculture | to buy chickens to raise and sell. | EC | Ecuador | San Javier | shared | 0.10000 | USD | volunteer_like, #Animals, #Supporting Family | monthly | field_partner |
91329 | 1878119 | English | Carmelita works hard to support four children.... | Carmelita works hard to support four children.... | 100 | 3311100 | Personal Housing Expenses | Housing | to build a sanitary toilet for her family | PH | Philippines | Danao Cebu | standard | nan | PHP | volunteer_like | monthly | field_partner |
91330 | 1639680 | English | Orn, 60 years of age, appears in the photo. Sh... | Orn, 60 years of age, appears in the photo. Sh... | 1500 | 2990352 | Grocery Store | Food | to pay for additional groceries to stock the s... | KH | Cambodia | Takeo province | shared | 0.10000 | USD | user_favorite, #Elderly, user_favorite | monthly | field_partner |
91331 | 1495391 | Spanish | Walter, a sus 27 años de edad, vive en unión l... | At 27 years of age, Walter is in a live-in rel... | 1750 | 2805390 | Farming | Agriculture | to buy agricultural supplies, such as fertiliz... | CO | Colombia | El Carmen de Viboral | shared | 0.10000 | COP | #Sustainable Ag, #Eco-friendly, user_favorite | monthly | field_partner |
91332 | 1602898 | English | Greetings from Uganda! This is Godfrey. He is ... | Greetings from Uganda! This is Godfrey. He is ... | 275 | 2943724 | Education provider | Education | to purchase a water-filtration system to provi... | UG | Uganda | Isingiro | shared | 0.10000 | UGX | #Health and Sanitation, user_favorite, #School... | irregular | field_partner |
91333 rows × 18 columns
# Load the test data; in debug mode keep only the first 1000 rows.
test_csv_path = os.path.join(Config.input_path, 'test.csv')
test_df = pd.read_csv(test_csv_path)
if Config.debug:
    test_df = test_df.iloc[:1000]

print(test_df.shape)
display(test_df)
(91822, 17)
LOAN_ID | ORIGINAL_LANGUAGE | DESCRIPTION | DESCRIPTION_TRANSLATED | IMAGE_ID | ACTIVITY_NAME | SECTOR_NAME | LOAN_USE | COUNTRY_CODE | COUNTRY_NAME | TOWN_NAME | CURRENCY_POLICY | CURRENCY_EXCHANGE_COVERAGE_RATE | CURRENCY | TAGS | REPAYMENT_INTERVAL | DISTRIBUTION_MODEL | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2041445 | English | Marcela is 69 years old and married with ten c... | Marcela is 69 years old and married with ten c... | 4051101 | General Store | Retail | to buy items to sell like canned goods and per... | PH | Philippines | Cauayan, Negros Occidental | standard | nan | PHP | NaN | monthly | field_partner |
1 | 1944435 | English | Roselia is 48 years old and has five children.... | Roselia is 48 years old and has five children.... | 3410523 | Pigs | Agriculture | to buy feeds and other supplies to raise her pig | PH | Philippines | Guihulngan, Negros Oriental | standard | nan | PHP | #Animals, #Repeat Borrower, #Schooling, #Woman... | monthly | field_partner |
2 | 2083354 | English | Ma. Marebil is a single woman, 40 years old wi... | Ma. Marebil is a single woman, 40 years old wi... | 4146690 | Clothing Sales | Clothing | to buy additional stock of clothes and dresses... | PH | Philippines | Santa Barbara, Iloilo | standard | nan | PHP | #Parent, #Single Parent, #Woman-Owned Business | monthly | field_partner |
3 | 1993565 | English | Good day, lenders! Meet one of KBMI’s clients,... | Good day, lenders! Meet one of KBMI’s clients,... | 3945982 | Food | Food | to buy more foods to grow her business. | ID | Indonesia | Pandeglang | shared | 0.10000 | IDR | #Woman-Owned Business, #Schooling, #Elderly, #... | monthly | field_partner |
4 | 2064272 | English | Rosemarie is a married woman with two children... | Rosemarie is a married woman with two children... | 4114040 | Food | Food | to buy ingredients for her food production bus... | PH | Philippines | Sogod Cebu | standard | nan | PHP | NaN | monthly | field_partner |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
91817 | 1993862 | English | Marjorie is a resident of Tubigon, Bohol. She ... | Marjorie is a resident of Tubigon, Bohol. She ... | 3946629 | Fishing | Food | to buy fishing nets. | PH | Philippines | Tubigon, Bohol | shared | 0.00000 | PHP | #Parent, #Biz Durable Asset | monthly | field_partner |
91818 | 2015070 | English | Hello, Kiva community! Meet Janeth, a mother e... | Hello, Kiva community! Meet Janeth, a mother e... | 4006025 | Home Energy | Personal Use | to buy a solar lantern to provide adequate lig... | KE | Kenya | Nandi Hills | shared | 0.00000 | KES | #Technology, #Eco-friendly, #Parent | monthly | field_partner |
91819 | 1950349 | French | Agé de 32 ans, Komi est marié .C'est un bouche... | Komi is 32 years old and married. He is a reno... | 3423123 | Butcher Shop | Food | to buy two cows. | TG | Togo | Vakpossito | shared | 0.00000 | XOF | #Biz Durable Asset, user_favorite, #Animals | monthly | field_partner |
91820 | 1921580 | Russian | Калбубу, 56 лет, вдова, есть взрослые дети. У ... | Kalbubu is 56 years old, a widow, and she has ... | 3373358 | Dairy | Agriculture | to buy dairy cows to increase her headcount of... | KG | Kyrgyzstan | Min-Bulak village, Talas region | shared | 0.00000 | KGS | #Animals, #Widowed, #Biz Durable Asset, #Woman... | irregular | field_partner |
91821 | 1976733 | English | Hinrilyn is 31 years old and has two children.... | Hinrilyn is 31 years old and has two children.... | 3841884 | Pigs | Agriculture | to buy feeds and other supplies to raise her l... | PH | Philippines | Coron, Palawan | standard | nan | PHP | user_favorite, #Animals, #Parent, user_favorit... | monthly | field_partner |
91822 rows × 17 columns
columbia2131さんが以前に投稿されていたトピックを参考にしています。
[SciBERTを用いたtextデータの特徴量抽出](https://comp.probspace.com/competitions/citation_prediction/discussions/columbia2131-Post0cf3bc9feaa1640eee20)
class BertSequenceVectorizer:
    """Text feature extraction with a pretrained BERT model.

    Each sentence is tokenised, truncated/padded to ``max_len`` tokens and
    passed through the model; the hidden state of the first token of the last
    layer is returned as the sentence vector.
    """

    def __init__(self, model_name='bert-base-uncased', max_len=128):
        """Load tokenizer and model and move the model to the GPU if present.

        Args:
            model_name: Checkpoint name passed to ``from_pretrained``.
            max_len: Fixed token length every input is truncated/padded to.
        """
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.model_name = model_name
        self.tokenizer = transformers.AutoTokenizer.from_pretrained(self.model_name)
        self.model = transformers.AutoModel.from_pretrained(self.model_name)
        self.model = self.model.to(self.device)
        # Inference only: make sure dropout and friends are disabled.
        self.model.eval()
        self.max_len = max_len

    def vectorize(self, sentence: str) -> np.ndarray:
        """Return the first-token embedding of ``sentence`` as a 1-D array.

        Args:
            sentence: Raw text to embed.

        Returns:
            The last-layer hidden state of the first token, on the CPU.
        """
        # Let the tokenizer truncate: it keeps the closing special token and
        # avoids encoding sequences longer than the model supports.
        token_ids = self.tokenizer.encode(
            sentence, truncation=True, max_length=self.max_len)
        # Defensive guard in case a tokenizer ignores ``max_length``.
        token_ids = token_ids[:self.max_len]

        pad_id = self.tokenizer.pad_token_id
        if pad_id is None:
            # Fall back to the id the padding was previously hard-coded to.
            pad_id = 0

        n_tokens = len(token_ids)
        n_pad = self.max_len - n_tokens
        inputs = token_ids + [pad_id] * n_pad
        # 1 for real tokens, 0 for padding.
        masks = [1] * n_tokens + [0] * n_pad

        inputs_tensor = torch.tensor([inputs], dtype=torch.long, device=self.device)
        masks_tensor = torch.tensor([masks], dtype=torch.long, device=self.device)

        # No gradients are needed for feature extraction; skipping the autograd
        # graph saves memory and time.
        with torch.no_grad():
            output = self.model(inputs_tensor, attention_mask=masks_tensor)

        seq_out = output['last_hidden_state']
        # ``.cpu()`` is a no-op for tensors that already live on the CPU.
        return seq_out[0][0].cpu().numpy()
def get_bert_feature(input_df, pca=None, return_pca=False):
    """Build PCA-compressed BERT features from ``DESCRIPTION_TRANSLATED``.

    Args:
        input_df: Frame with a ``DESCRIPTION_TRANSLATED`` text column; missing
            texts are treated as empty strings.
        pca: An already fitted PCA to reuse. When ``None`` a new 64-component
            PCA is fitted on ``input_df``. Pass the PCA fitted on the training
            data when transforming the test data, otherwise the two feature
            sets are projected into different spaces.
        return_pca: When True, also return the PCA object that was used.

    Returns:
        A DataFrame of PCA components indexed like ``input_df``; or the tuple
        ``(DataFrame, pca)`` when ``return_pca`` is True.
    """
    vectorizer = BertSequenceVectorizer(model_name=Config.bert_model_name)
    texts = input_df['DESCRIPTION_TRANSLATED'].fillna('')
    text_vecs = np.array([vectorizer.vectorize(x) for x in texts])

    if pca is None:
        # Seeded so that a randomized SVD solver gives reproducible components.
        pca = PCA(n_components=64, random_state=Config.seed)
        text_vecs = pca.fit_transform(text_vecs)
    else:
        text_vecs = pca.transform(text_vecs)

    output_df = pd.DataFrame(
        text_vecs,
        columns=[f'bert_pca_vecs={i:03}' for i in range(text_vecs.shape[1])],
        index=input_df.index,
    )
    if return_pca:
        return output_df, pca
    return output_df


# Fit the PCA on the training texts only and reuse it for the test texts so
# that both feature sets share the same projection.
train_bert, bert_pca = get_bert_feature(train_df, return_pca=True)
test_bert = get_bert_feature(test_df, pca=bert_pca)
Downloading: 0%| | 0.00/28.0 [00:00<?, ?B/s]
Downloading: 0%| | 0.00/570 [00:00<?, ?B/s]
Downloading: 0%| | 0.00/226k [00:00<?, ?B/s]
Downloading: 0%| | 0.00/455k [00:00<?, ?B/s]
Downloading: 0%| | 0.00/420M [00:00<?, ?B/s]
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias'] - This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). - This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). Token indices sequence length is longer than the specified maximum sequence length for this model (570 > 512). Running this sequence through the model will result in indexing errors Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias'] - This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). - This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). Token indices sequence length is longer than the specified maximum sequence length for this model (515 > 512). 
Running this sequence through the model will result in indexing errors
# Keyword arguments for the model constructor.
model_params = dict(
    n_estimators=1000,
    objective='mae',
    learning_rate=0.1,
    random_state=Config.seed,
    n_jobs=-1,
)

# Extra keyword arguments forwarded to ``fit`` for every fold.
# NOTE(review): newer LightGBM releases appear to expect early stopping and
# logging to be configured through ``callbacks`` instead of these keywords;
# confirm against the installed version.
fit_params = dict(
    early_stopping_rounds=100,
    verbose=False,
)
def make_kf(X, y, n_splits=5):
    """Return shuffled K-fold ``(train_idx, valid_idx)`` pairs for ``X``.

    Args:
        X: Feature matrix to split.
        y: Unused; accepted so all CV factories share one signature.
        n_splits: Number of folds.

    Returns:
        A list with one ``(train_idx, valid_idx)`` tuple per fold.
    """
    splitter = KFold(
        n_splits=n_splits,
        shuffle=True,
        random_state=Config.seed,
    )
    folds = splitter.split(X)
    return list(folds)
def train_cv(X, y, model, model_params, fit_params, cv, folds):
    """Train one model per CV fold on a log1p-transformed target.

    Args:
        X: Feature DataFrame.
        y: Target Series (original scale).
        model: Estimator class, instantiated as ``model(**model_params)``.
        model_params: Constructor keyword arguments.
        fit_params: Extra keyword arguments for ``fit``.
        cv: Callable ``cv(X, y, n_splits=...)`` yielding index pairs.
        folds: Number of folds passed to ``cv``.

    Returns:
        ``(oof, models)``: out-of-fold predictions on the original scale,
        re-sorted by validation index, and a dict of fitted estimators keyed
        ``'LGBM_FOLD{n}'``.
    """
    features, target = X.values, y.values
    splits = cv(features, target, n_splits=folds)

    models = {}
    fold_preds = []
    fold_valid_idx = []

    for fold, (tr_idx, va_idx) in enumerate(splits):
        x_fit, x_val = features[tr_idx], features[va_idx]
        y_fit, y_val = target[tr_idx], target[va_idx]

        # Fit on log1p(target); the validation fold is the eval set.
        est = model(**model_params)
        est.fit(
            x_fit, np.log1p(y_fit),
            eval_set=[[x_val, np.log1p(y_val)]],
            **fit_params,
        )
        models[f'LGBM_FOLD{fold}'] = est

        # Map predictions back to the original scale before scoring.
        val_pred = np.expm1(est.predict(x_val))
        fold_score = mean_absolute_error(y_val, val_pred)
        print(f'FOLD: {fold}, SCORE: {fold_score:.5f}')

        fold_valid_idx.append(va_idx)
        fold_preds.append(val_pred)

    # Put the concatenated fold predictions back in ascending index order.
    row_order = np.argsort(np.concatenate(fold_valid_idx))
    oof = np.concatenate(fold_preds)[row_order]

    oof_score = mean_absolute_error(target, oof)
    print(f'oof score: {oof_score:.5f}\n')
    return oof, models
# Run 5-fold training on the BERT features against the loan amount.
oof, models = train_cv(
    X=train_bert,
    y=train_df['LOAN_AMOUNT'],
    model=lgb.LGBMModel,
    model_params=model_params,
    fit_params=fit_params,
    cv=make_kf,
    folds=5,
)
FOLD: 0, SCORE: 359.82291 FOLD: 1, SCORE: 368.42584 FOLD: 2, SCORE: 362.28032 FOLD: 3, SCORE: 374.44010 FOLD: 4, SCORE: 362.43647 oof score: 365.48106
def predict(test_x, models):
    """Average the fold models' predictions and map them back with expm1.

    Args:
        test_x: Feature DataFrame for the rows to predict.
        models: Mapping of model name to fitted estimator; each name is
            printed as its model is applied.

    Returns:
        ``expm1`` of the element-wise mean of the models' raw predictions.
    """
    fold_outputs = []
    for name, est in models.items():
        print(f'{name}')
        fold_outputs.append(est.predict(test_x.values))
    # Average on the (log) model scale first, then invert the transform.
    return np.expm1(np.mean(fold_outputs, axis=0))
# Predict the test set with the fold models trained above.
preds = predict(test_bert, models)
LGBM_FOLD0 LGBM_FOLD1 LGBM_FOLD2 LGBM_FOLD3 LGBM_FOLD4
# Assemble the submission table: one row per test loan with its prediction.
loan_ids = test_df['LOAN_ID']
loan_amounts = preds.reshape(-1)
submit_df = pd.DataFrame({'LOAN_ID': loan_ids, 'LOAN_AMOUNT': loan_amounts})
display(submit_df)
LOAN_ID | LOAN_AMOUNT | |
---|---|---|
0 | 2041445 | 408.82997 |
1 | 1944435 | 516.58871 |
2 | 2083354 | 366.54677 |
3 | 1993565 | 1370.62896 |
4 | 2064272 | 331.34300 |
... | ... | ... |
91817 | 1993862 | 586.39110 |
91818 | 2015070 | 588.63327 |
91819 | 1950349 | 623.28754 |
91820 | 1921580 | 976.51682 |
91821 | 1976733 | 433.89723 |
91822 rows × 2 columns
# Write the submission file into the output directory, without the index.
submission_path = os.path.join(Config.output_path, 'submission_bert_tutorial.csv')
submit_df.to_csv(submission_path, index=False)
maruyama
CVとLBが乖離しているのは、PCAをtrainとtestで別々にかけているからではないでしょうか。 trainでfitしたPCAのモデルを使ってtestをtransformしないと、trainとtestで異なる空間へ写像されてしまうと思います。
goukaisei
貴重なトピックありがとうございます! Bert の max_len ってとりあえず文章の最大に合わせておけばいいのかと思ったんですが、そうでもないんですかね?