import warnings
warnings.filterwarnings('ignore')
import os
import re
import gc
import sys
import time
import glob
import math
import json
import pickle
import joblib
import random
import string
import psutil
from pathlib import Path
from datetime import datetime
from tqdm.autonotebook import tqdm
from contextlib import contextmanager
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None)
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold
from texthero import preprocessing as hero_prep
import texthero as hero
import matplotlib.pyplot as plt
plt.style.use('ggplot')
import japanize_matplotlib
import seaborn as sns
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from transformers import BertTokenizer, BertModel
def seed_everything(seed: int = 42):
random.seed(seed)
os.environ['PYTHONHASHSEED'] = str(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
torch.backends.cudnn.deterministic = True
INPUT_DIR = '/mnt/work/data/ProbSpace'
OUTPUT_DIR = f'../data/interim/{datetime.now():%Y_%m_%d}/'
MODEL_DIR = f'../models/{datetime.now():%Y_%m_%d}/'
# DATA_DIR = '/mnt/work/data/ProbSpace/shimizu/'
os.makedirs(OUTPUT_DIR, exist_ok=True)
os.makedirs(MODEL_DIR, exist_ok=True)
print(f'Output dir: {OUTPUT_DIR}')
print(f'Model dir: {MODEL_DIR}')
Output dir: ../data/interim/2021_03_04/ Model dir: ../models/2021_03_04/
@contextmanager
def timer(name:str, slack:bool=True):
t0 = time.time()
p = psutil.Process(os.getpid())
m0 = p.memory_info()[0] / 2. ** 30
print(f'<< {name} >> Start')
yield
m1 = p.memory_info()[0] / 2. ** 30
delta = m1 - m0
sign = '+' if delta >= 0 else '-'
delta = math.fabs(delta)
print(f"<< {name} >> {m1:.1f}GB({sign}{delta:.1f}GB):{time.time() - t0:.1f}sec", file=sys.stderr)
class CONFIG:
SEED = 42
NFOLDS = 7
DEVICE = ('cuda' if torch.cuda.is_available() else 'cpu')
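## Note: this export does not show seed_everything() being called anywhere. Assuming the
## intent is to fix all RNG state before any model work, a minimal call would be:
seed_everything(CONFIG.SEED)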
def Load_rowjson(file_path:str, cols:list, train:bool=True) -> pd.DataFrame:
df = pd.DataFrame(columns = cols)
with open(file_path, 'r') as f:
if train:
for i, json_line in tqdm(enumerate(f)):
data = json.loads(json_line)
if 'cites' in data:
for col in cols:
df.at[i, col] = data[col]
else:
continue
else:
cols.remove('cites')
for i, json_line in tqdm(enumerate(f)):
data = json.loads(json_line)
for col in cols:
df.at[i, col] = data[col]
df = df.reset_index(drop=True)
return df
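## For reference only (a sketch, not what was run): collecting rows as a list of dicts and
## building the DataFrame once is usually much faster than the row-wise df.at writes above.
## load_raw_json_fast is a hypothetical helper, not part of the original notebook.
def load_raw_json_fast(file_path: str, cols: list, train: bool = True) -> pd.DataFrame:
    use_cols = cols if train else [c for c in cols if c != 'cites']
    rows = []
    with open(file_path, 'r') as f:
        for line in f:
            data = json.loads(line)
            if train and 'cites' not in data:
                continue  # training rows without a cites label are skipped, as above
            rows.append({c: data[c] for c in use_cols})
    return pd.DataFrame(rows, columns=use_cols)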
with timer('Load Data'):
df_cols = ["title", "abstract", "doi_cites", "cites"]
train_df = Load_rowjson(os.path.join(INPUT_DIR, 'train_data.json'), df_cols, train=True)
test_df = pd.read_json(os.path.join(INPUT_DIR, 'test_data.json'), lines=True)
test_df = test_df[["title", "abstract", "doi_cites"]]
print('train_df', train_df.shape)
print('test_df', test_df.shape)
<< Load Data >> Start
train_df (15117, 4) test_df (59084, 3) << Load Data >> 1.6GB(+0.5GB):61.1sec
# log1p-transform cites and doi_cites
train_df['cites'] = np.log1p(train_df['cites'].astype('float'))
train_df['doi_cites'] = np.log1p(train_df['doi_cites'].astype('float'))
test_df['doi_cites'] = np.log1p(test_df['doi_cites'].astype('float'))
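## Illustrative sanity check (not in the original run): np.expm1 exactly inverts the log1p
## transform applied here, which is why predictions are mapped back with expm1 before submission.
_vals = np.array([0.0, 3.0, 125.0])
assert np.allclose(np.expm1(np.log1p(_vals)), _vals)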
with timer('Data Merge'):
# add a flag column so train/test can be split apart again later
train_df['flag'] = 0
test_df['flag'] = 1
whole_df = pd.concat([train_df,
test_df],axis=0).reset_index(drop=True)
<< Data Merge >> Start << Data Merge >> 1.6GB(+0.0GB):0.0sec
def title_embedding(df:pd.DataFrame, pipeline:list, col:str='title') -> np.array:
title_transformer_mean = []
title_transformer_max = []
## Text cleaning with texthero
tmp = pd.DataFrame(hero.clean(df[col], pipeline))
# BERT Model
## Faster with a GPU
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f'device: {DEVICE}')
model = BertModel.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model.to(DEVICE)
for idx, rows in tqdm(tmp.iterrows(), total=len(tmp)):
title = rows[col]
# Transformer
input_ids = torch.tensor(tokenizer.encode(title, add_special_tokens=True)).unsqueeze(0)
outputs = model(input_ids.to(DEVICE))
emb_transformer = outputs[0].cpu().detach().numpy()
emb_transformer_mean = np.mean(emb_transformer, axis=1)[0, :]
emb_transformer_max = np.max(emb_transformer, axis=1)[0, :]
title_transformer_mean.append(emb_transformer_mean)
title_transformer_max.append(emb_transformer_max)
title_transformer_mean = np.array(title_transformer_mean)
title_transformer_max = np.array(title_transformer_max)
print(title_transformer_mean.shape, title_transformer_max.shape)
title_feat = np.concatenate([title_transformer_mean,
title_transformer_max],
axis=1)
del title_transformer_mean, title_transformer_max
_ = gc.collect()
return title_feat
title_pipeline = [
hero_prep.remove_diacritics,
hero_prep.lowercase,
hero_prep.remove_stopwords,
hero_prep.tokenize
]
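## Side note (a sketch, not the code that produced the results below): the per-row forward
## passes in title_embedding / abstract_embedding are not wrapped in torch.no_grad(), so
## autograd buffers are retained during feature extraction. An inference-only helper could
## look like this (encode_text is hypothetical):
def encode_text(text: str, tokenizer, model, device):
    """Return (mean-pooled, max-pooled) BERT embeddings for one string without building an autograd graph."""
    ids = torch.tensor(tokenizer.encode(text, add_special_tokens=True)).unsqueeze(0)
    with torch.no_grad():  # inference only: no gradients tracked, so memory stays flat
        out = model(ids.to(device))[0].cpu().numpy()
    return out.mean(axis=1)[0], out.max(axis=1)[0]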
with timer('Title Embedding'):
print('---- Train ----')
train_title = title_embedding(train_df,
title_pipeline,
'title')
print('---- Test ----')
test_title = title_embedding(test_df,
title_pipeline,
'title')
<< Title Embedding >> Start ---- Train ---- device: cuda
(15117, 768) (15117, 768) ---- Test ---- device: cuda
(59084, 768) (59084, 768) << Title Embedding >> 4.1GB(+0.6GB):2102.7sec
def abstract_embedding(df:pd.DataFrame, pipe_list:list, col:str='abstract') -> np.array:
abstract_transformer_mean = []
abstract_transformer_max = []
## Text cleaning with texthero
tmp = pd.DataFrame(hero.clean(df[col], pipe_list))
# BERT Model
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f'device: {DEVICE}')
model = BertModel.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model.to(DEVICE)
# Feature extraction
for idx, rows in tqdm(tmp.iterrows(), total=len(tmp)):
abstract = rows[col]
# Transformer
input_ids = torch.tensor(tokenizer.encode(abstract, add_special_tokens=True)).unsqueeze(0)
outputs = model(input_ids.to(DEVICE))
emb_transformer = outputs[0].cpu().detach().numpy()
emb_transformer_mean = np.mean(emb_transformer, axis=1)[0, :]
emb_transformer_max = np.max(emb_transformer, axis=1)[0, :]
abstract_transformer_mean.append(emb_transformer_mean)
abstract_transformer_max.append(emb_transformer_max)
abstract_transformer_mean = np.array(abstract_transformer_mean)
abstract_transformer_max = np.array(abstract_transformer_max)
print(abstract_transformer_mean.shape,
abstract_transformer_max.shape)
abstract_feat = np.concatenate([abstract_transformer_mean,
abstract_transformer_max],
axis=1)
del abstract_transformer_mean, abstract_transformer_max
_ = gc.collect()
return abstract_feat
abstract_pipeline = [
hero_prep.remove_diacritics,
hero_prep.lowercase,
hero_prep.remove_stopwords,
hero_prep.tokenize
]
with timer('abstract Embedding'):
print('---- Train ----')
train_abstract = abstract_embedding(train_df,
abstract_pipeline,
'abstract')
print('---- Test ----')
test_abstract = abstract_embedding(test_df,
abstract_pipeline,
'abstract')
print(train_abstract.shape, test_abstract.shape)
<< abstract Embedding >> Start ---- Train ---- device: cuda
(15117, 768) (15117, 768) ---- Test ---- device: cuda
(59084, 768) (59084, 768) (15117, 1536) (59084, 1536) << abstract Embedding >> 4.5GB(+0.4GB):2486.3sec
train_doi = train_df['doi_cites'].astype('float').values
test_doi = test_df['doi_cites'].astype('float').values
target = train_df['cites'].astype('float').values
train_doi = np.reshape(train_doi, (-1, 1))
test_doi = np.reshape(test_doi, (-1, 1))
target = np.reshape(target, (-1, 1))
print('----- Train -----')
print(f'title:{train_title.shape} | abstract:{train_abstract.shape} | doi:{train_doi.shape} | target:{target.shape}')
print('----- Test -----')
print(f'title:{test_title.shape} | abstract:{test_abstract.shape} | doi:{test_doi.shape}')
----- Train ----- title:(15117, 1536) | abstract:(15117, 1536) | doi:(15117, 1) | target:(15117, 1) ----- Test ----- title:(59084, 1536) | abstract:(59084, 1536) | doi:(59084, 1)
from sklearn.decomposition import PCA
def detect_component(train_array:np.array, test_array:np.array, th:float=0.9) -> int:
whole_array = np.concatenate([train_array, test_array], axis=0)
pca = PCA(n_components=whole_array.shape[1])
pca.fit(whole_array)
plt.figure(figsize=(20, 5))
plt.bar([n for n in range(1, len(pca.explained_variance_ratio_)+1)], pca.explained_variance_ratio_)
plt.show()
tmp = pd.DataFrame(pca.explained_variance_ratio_, columns=['explained_variance_ratio'])
tmp = tmp.cumsum()
tmp = tmp[tmp['explained_variance_ratio'] > th]
print(f'Number of components to exceed {th*100:.1f}% cumulative explained variance: {tmp.index.tolist()[0]}')
return tmp.index.tolist()[0]
title_component = detect_component(train_title, test_title, 0.9)
abs_component = detect_component(train_abstract, test_abstract, 0.9)
Number of components to exceed 90.0% cumulative explained variance: 377
Number of components to exceed 90.0% cumulative explained variance: 539
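## For reference (not what the notebook ran): scikit-learn can also choose the dimensionality
## directly from a variance fraction by passing a float n_components.
pca_auto = PCA(n_components=0.9, svd_solver='full', random_state=CONFIG.SEED)
pca_auto.fit(np.concatenate([train_title, test_title], axis=0))
print(pca_auto.n_components_)  # components needed to reach 90% cumulative explained variance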
def pca_trans(train:np.array, test:np.array, comp:int) -> (np.array, np.array):
pca = PCA(n_components=comp, random_state=CONFIG.SEED)
pca.fit(np.concatenate([train, test], axis=0))
train_pca = pca.transform(train)
test_pca = pca.transform(test)
return train_pca, test_pca
with timer('PCA Transform'):
train_title_pca, test_title_pca = pca_trans(train_title, test_title, title_component)
train_abs_pca, test_abs_pca = pca_trans(train_abstract, test_abstract, abs_component)
print(train_title_pca.shape, test_title_pca.shape)
print(train_abs_pca.shape, test_abs_pca.shape)
<< PCA Transform >> Start (15117, 377) (59084, 377) (15117, 539) (59084, 539) << PCA Transform >> 5.3GB(+0.3GB):70.2sec
class TrainDataset:
def __init__(self, features_1, features_2, target_doi, target_cites):
self.features_1 = features_1
self.features_2 = features_2
self.target_doi = target_doi
self.target_cites = target_cites
def __len__(self):
return self.features_1.shape[0]
def __getitem__(self, idx):
dct = {
'x_1' : torch.tensor(self.features_1[idx, :],
dtype=torch.float),
'x_2' : torch.tensor(self.features_2[idx, :],
dtype=torch.float),
'doi' : torch.tensor(self.target_doi[idx, :],
dtype=torch.float),
'cites' : torch.tensor(self.target_cites[idx, :],
dtype=torch.float),
}
return dct
class TestDataset:
def __init__(self, features_1, features_2, target_doi):
self.features_1 = features_1
self.features_2 = features_2
self.target_doi = target_doi
def __len__(self):
return self.features_1.shape[0]
def __getitem__(self, idx):
dct = {
'x_1' : torch.tensor(self.features_1[idx, :],
dtype=torch.float),
'x_2' : torch.tensor(self.features_2[idx, :],
dtype=torch.float),
'doi' : torch.tensor(self.target_doi[idx, :],
dtype=torch.float),
}
return dct
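## Illustrative only: each dataset item is a dict of float tensors keyed by 'x_1', 'x_2',
## 'doi' (and 'cites' for training), which is what the training/validation loops unpack below.
_sample = TrainDataset(train_title_pca, train_abs_pca, train_doi, target)[0]
print({k: tuple(v.shape) for k, v in _sample.items()})  # {'x_1': (377,), 'x_2': (539,), 'doi': (1,), 'cites': (1,)}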
def train_func(model, optimizer, scheduler, loss_fn, dataloader, device, epoch):
model.train()
final_loss = 0
total_steps = EPOCHS * len(dataloader)
for step, data in enumerate(dataloader):
optimizer.zero_grad()
title = data['x_1'].to(device)
abstract = data['x_2'].to(device)
doi = data['doi'].to(device)
targets = data['cites'].to(device)
output_cites = model(title, abstract, doi)
loss = loss_fn(output_cites, targets)
loss.backward()
optimizer.step()
scheduler.step()
final_loss += loss.item()
final_loss /= len(dataloader)
return final_loss
def valid_func(model, loss_fn, dataloader, device):
model.eval()
final_loss = 0
valid_cites_preds = []
for step, data in enumerate(dataloader):
title = data['x_1'].to(device)
abstract = data['x_2'].to(device)
doi = data['doi'].to(device)
targets = data['cites'].to(device)
output_cites = model(title, abstract, doi)
loss = loss_fn(output_cites, targets)
final_loss += loss.item()
valid_cites_preds.append(output_cites.detach().cpu().numpy())
final_loss /= len(dataloader)
valid_cites_preds = np.concatenate(valid_cites_preds)
return final_loss, valid_cites_preds
def test_func(model, dataloader, device):
model.eval()
preds = []
for data in dataloader:
title = data['x_1'].to(device)
abstract = data['x_2'].to(device)
doi = data['doi'].to(device)
with torch.no_grad():
outputs = model(title, abstract, doi)
preds.append(outputs.detach().cpu().numpy())
preds = np.concatenate(preds)
return preds
class TwoHeadModel(nn.Module):
def __init__(self, num_features_1, num_features_2, num_doi, num_cites, hidden_size):
super(TwoHeadModel, self).__init__()
self.batch_norm_1 = nn.BatchNorm1d(num_features_1)
self.dense_1 = nn.utils.weight_norm(nn.Linear(num_features_1,
hidden_size))
self.activation_1 = torch.nn.PReLU(num_parameters=hidden_size,
init=1.0)
self.batch_norm_2 = nn.BatchNorm1d(num_features_2)
self.dense_2 = nn.utils.weight_norm(nn.Linear(num_features_2,
hidden_size))
self.activation_2 = torch.nn.PReLU(num_parameters=hidden_size, init=1.0)
self.batch_norm_3 = nn.BatchNorm1d(hidden_size*2 + num_doi)
self.dropout_3 = nn.Dropout(0.3)
self.dense_3 = nn.utils.weight_norm(nn.Linear(hidden_size*2 + num_doi,
num_cites))
def forward(self, x1, x2, x3):
x1 = self.batch_norm_1(x1)
x1 = self.activation_1(self.dense_1(x1))
x2 = self.batch_norm_2(x2)
x2 = self.activation_2(self.dense_2(x2))
x = torch.cat([x1, x2, x3], dim=1)
x = self.batch_norm_3(x)
x = self.dropout_3(x)
x = self.dense_3(x)
return x
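## Quick shape check (illustrative, not part of the original run); the feature sizes match the
## PCA output above: a batch of 4 samples maps to a single predicted cites value per sample.
_m = TwoHeadModel(num_features_1=377, num_features_2=539, num_doi=1, num_cites=1, hidden_size=128).eval()
with torch.no_grad():
    _out = _m(torch.randn(4, 377), torch.randn(4, 539), torch.randn(4, 1))
print(_out.shape)  # torch.Size([4, 1])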
## Train with RMSE on the log1p-transformed target (i.e. RMSLE on the raw citation counts)
def RMSELoss(yhat,y):
return torch.sqrt(torch.mean((yhat-y)**2))
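## Because cites was log1p-transformed above, RMSE on these targets equals RMSLE on the raw
## counts. A tiny numeric check (illustrative, values are made up):
_y_raw, _yhat_raw = torch.tensor([3.0, 10.0, 50.0]), torch.tensor([4.0, 8.0, 60.0])
_rmsle = torch.sqrt(torch.mean((torch.log1p(_yhat_raw) - torch.log1p(_y_raw)) ** 2))
assert torch.isclose(RMSELoss(torch.log1p(_yhat_raw), torch.log1p(_y_raw)), _rmsle)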
## Cross-validation: StratifiedKFold on the integer part of the (log1p) cites target (note: despite the DOI_INT name, this bins cites, not doi_cites)
DOI_INT = pd.Series(train_df['cites'].astype('float')).astype(int)
plt.hist(DOI_INT, bins=DOI_INT.nunique())
plt.show()
DEVICE = ('cuda' if torch.cuda.is_available() else 'cpu')
EPOCHS = 20
BATCH_SIZE = 64
LEARNING_RATE = 1e-3
WEIGHT_DECAY = 1e-5
NFOLDS = 7
num_features_1 = train_title_pca.shape[1]
num_features_2 = train_abs_pca.shape[1]
num_doi = train_doi.shape[1]
num_cites = target.shape[1]
print(f'num_features_1: {num_features_1} | num_features_2: {num_features_2}')
print(f'num_doi: {num_doi} | num_cites: {num_cites}')
hidden_size = 128
CV = StratifiedKFold(n_splits=CONFIG.NFOLDS,
shuffle=True,
random_state=CONFIG.SEED)
num_features_1: 377 | num_features_2: 539 num_doi: 1 | num_cites: 1
oof = np.zeros((train_df.shape[0]))
preds = np.zeros((test_df.shape[0]))
for fold, (tr, te) in enumerate(CV.split(train_df, DOI_INT)):
print('★'*40)
print(f'Fold: {fold+1}')
X_title_tr = train_title_pca[tr]
X_title_te = train_title_pca[te]
X_abstract_tr = train_abs_pca[tr]
X_abstract_te = train_abs_pca[te]
X_doi_tr = train_doi[tr]
X_doi_te = train_doi[te]
y_tr = target[tr]
y_te = target[te]
train_dataset = TrainDataset(X_title_tr, X_abstract_tr, X_doi_tr, y_tr)
valid_dataset = TrainDataset(X_title_te, X_abstract_te, X_doi_te, y_te)
trainloader = torch.utils.data.DataLoader(train_dataset,
batch_size=BATCH_SIZE,
shuffle=True)
validloader = torch.utils.data.DataLoader(valid_dataset,
batch_size=BATCH_SIZE,
shuffle=False)
model = TwoHeadModel(
num_features_1 = num_features_1,
num_features_2 = num_features_2,
num_doi = num_doi,
num_cites = num_cites,
hidden_size = hidden_size,
)
model.to(DEVICE)
optimizer = torch.optim.Adam(model.parameters(),
lr=LEARNING_RATE,
weight_decay=WEIGHT_DECAY)
scheduler = optim.lr_scheduler.OneCycleLR(optimizer=optimizer,
pct_start=0.1,
div_factor=1e3,
max_lr=1e-2,
epochs=EPOCHS,
steps_per_epoch=len(trainloader))
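# Note: OneCycleLR is stepped once per batch inside train_func; with div_factor=1e3 the
# learning rate warms up from max_lr/div_factor = 1e-5 to max_lr = 1e-2 over the first 10%
# (pct_start) of the EPOCHS * len(trainloader) steps, then anneals back down.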
loss_fn = RMSELoss
best_loss = np.inf
for epoch in range(EPOCHS):
start_time = time.time()
train_loss = train_func(model,
optimizer,
scheduler,
loss_fn,
trainloader,
DEVICE,
epoch)
valid_loss, valid_cites_preds = valid_func(model,
loss_fn,
validloader,
DEVICE)
end_time = time.time()
print(f"FOLD: {fold+1} | EPOCH:{epoch+1:02d} | train_loss:{train_loss:.6f} | valid_loss:{valid_loss:.6f} | time:{end_time-start_time:.1f}s ")
if valid_loss < best_loss:
best_loss = valid_loss
oof[te] = valid_cites_preds[:, 0]
torch.save(model.state_dict(),
f"{MODEL_DIR}SimpleMLP_{fold+1}.pth")
else:
continue
testdataset = TestDataset(test_title_pca, test_abs_pca, test_doi)
testloader = torch.utils.data.DataLoader(testdataset,
batch_size=BATCH_SIZE,
shuffle=False)
model = TwoHeadModel(
num_features_1 = num_features_1,
num_features_2 = num_features_2,
num_doi = num_doi,
num_cites = num_cites,
hidden_size = hidden_size,
)
model.load_state_dict(torch.load(f"{MODEL_DIR}SimpleMLP_{fold+1}.pth"))
model.to(DEVICE)
preds += test_func(model, testloader, DEVICE)[:, 0]/NFOLDS
★ Training log (abridged) ★ 7 folds × 20 epochs. Best valid_loss per fold — Fold 1: 0.6468 (epoch 05), Fold 2: 0.6589 (epoch 08), Fold 3: 0.6656 (epoch 06), Fold 4: 0.6664 (epoch 03), Fold 5: 0.6651 (epoch 05), Fold 6: 0.6850 (epoch 06), Fold 7: 0.6516 (epoch 11). In every fold valid_loss stops improving within roughly the first ten epochs and then drifts upward while train_loss keeps falling; only the best checkpoint per fold is saved.
rmse = np.sqrt(mean_squared_error(target, oof))
print(f'CV : {rmse:.6f} ')
CV : 0.668040
fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(target[:, 0], label='Ground truth', kde=True, color='orange',
stat="density", common_norm=False, alpha=0.3)
sns.histplot(oof, label='OOF', kde=True,
stat="density", common_norm=False, alpha=0.3)
ax.legend()
ax.grid()
plt.show()
### Check summary statistics of the predictions
pd.DataFrame(np.expm1(preds)).describe().T
|   | count   | mean      | std       | min      | 25%     | 50%       | 75%       | max        |
|---|---------|-----------|-----------|----------|---------|-----------|-----------|------------|
| 0 | 59084.0 | 16.914278 | 17.470612 | 0.941772 | 6.85601 | 12.137255 | 21.098681 | 690.938788 |
def make_oof_sub(oof:np.array, pred:np.array, name:str):
oof_res = np.expm1(oof)
oof_res = pd.DataFrame(oof_res, columns=['OOF'])
oof_res.to_csv(f'{OUTPUT_DIR}{name}_oof.csv', index=False)
print(f'{OUTPUT_DIR}{name}_oof.csv is saved.')
preds_res = np.expm1(pred)
test_df = pd.read_json(os.path.join(INPUT_DIR, 'test_data.json'), lines=True)
preds_res = pd.concat([pd.DataFrame({ 'id': test_df['id'] }),
pd.DataFrame({ 'cites': preds_res })],
axis=1)
preds_res.to_csv(f'{OUTPUT_DIR}{name}_sub.csv', index=False)
print(f'{OUTPUT_DIR}{name}_sub.csv is saved.')
with timer('make oof sub'):
make_oof_sub(oof, preds, 'Simple_NN')
<< make oof sub >> Start ../data/interim/2021_03_04/Simple_NN_oof.csv is saved. ../data/interim/2021_03_04/Simple_NN_sub.csv is saved. << make oof sub >> 5.4GB(+0.0GB):3.7sec