NN Baseline (CV: 0.668040 | LB: 0.656322)

・ NN trained on title/abstract embeddings plus the doi_cites value
・ Embeddings are computed with huggingface/transformers
・ CV: 0.668040 LB: 0.656322

Setting

import warnings
warnings.filterwarnings('ignore')

import os
import re
import gc
import sys
import time
import glob
import math
import json
import pickle
import joblib
import random
import string
import psutil

from pathlib import Path
from datetime import datetime
from tqdm.autonotebook import tqdm
from contextlib import contextmanager

import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None)
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold
from texthero import preprocessing as hero_prep
import texthero as hero

import matplotlib.pyplot as plt
plt.style.use('ggplot')
import japanize_matplotlib
import seaborn as sns

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from transformers import BertTokenizer, BertModel

Fixing seeds

def seed_everything(seed: int = 42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
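Note: seed_everything is defined but never invoked in this notebook; presumably it is meant to be called once before training, e.g. (hypothetical call):

seed_everything(seed=42)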

Directory

INPUT_DIR = '/mnt/work/data/ProbSpace'
OUTPUT_DIR = f'../data/interim/{datetime.now():%Y_%m_%d}/'
MODEL_DIR = f'../models/{datetime.now():%Y_%m_%d}/'
# DATA_DIR = '/mnt/work/data/ProbSpace/shimizu/'

os.makedirs(OUTPUT_DIR, exist_ok=True)
os.makedirs(MODEL_DIR, exist_ok=True)

print(f'Output dir: {OUTPUT_DIR}')
print(f'Model dir: {MODEL_DIR}')
Output dir: ../data/interim/2021_03_04/
Model dir: ../models/2021_03_04/

timer

@contextmanager
def timer(name:str, slack:bool=True):
    t0 = time.time()
    p = psutil.Process(os.getpid())
    m0 = p.memory_info()[0] / 2. ** 30
    print(f'<< {name} >> Start')
    yield
    
    m1 = p.memory_info()[0] / 2. ** 30
    delta = m1 - m0
    sign = '+' if delta >= 0 else '-'
    delta = math.fabs(delta)
    
    print(f"<< {name} >> {m1:.1f}GB({sign}{delta:.1f}GB):{time.time() - t0:.1f}sec", file=sys.stderr)

Params

class CONFIG:
    SEED = 42
    NFOLDS = 7
    DEVICE = ('cuda' if torch.cuda.is_available() else 'cpu')

Data Load

def Load_rowjson(file_path:str, cols:list, train:bool=True) -> pd.DataFrame:
    
    df = pd.DataFrame(columns = cols)
    
    with open(file_path, 'r') as f:
        if train:
            for i, json_line in tqdm(enumerate(f)):
                data = json.loads(json_line)
                if 'cites' in data:
                    for col in cols:
                        df.at[i, col] = data[col]

                else:
                    continue

        else:
            cols.remove('cites')
            for i, json_line in tqdm(enumerate(f)):
                data = json.loads(json_line)
                for col in cols:
                    df.at[i, col] = data[col]

    df = df.reset_index(drop=True)
    
    return df
with timer('Load Data'):
    
    
    df_cols = ["title", "abstract", "doi_cites", "cites"]
    
    train_df = Load_rowjson(os.path.join(INPUT_DIR, 'train_data.json'), df_cols, train=True)
    test_df = pd.read_json(os.path.join(INPUT_DIR, 'test_data.json'), lines=True)
    test_df = test_df[["title", "abstract", "doi_cites"]]
    
    print('train_df', train_df.shape)
    print('test_df', test_df.shape)
<< Load Data >> Start
0it [00:00, ?it/s]
train_df (15117, 4)
test_df (59084, 3)
<< Load Data >> 1.6GB(+0.5GB):61.1sec
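Appending cell by cell with df.at grows the DataFrame one element at a time, which is slow for tens of thousands of rows. A minimal sketch of an alternative (hypothetical load_raw_json, assuming the same filtering behaviour as Load_rowjson above): collect one dict per record and build the frame in a single call.

import json
import pandas as pd

def load_raw_json(file_path: str, cols: list, train: bool = True) -> pd.DataFrame:
    # Drop the target column for test data, as Load_rowjson does above
    use_cols = cols if train else [c for c in cols if c != 'cites']
    records = []
    with open(file_path, 'r') as f:
        for json_line in f:
            data = json.loads(json_line)
            if train and 'cites' not in data:
                continue  # skip training rows that have no target
            records.append({c: data[c] for c in use_cols})
    return pd.DataFrame(records, columns=use_cols)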

Feature Engineering

Log transform

# Log1p-transform cites and doi_cites
train_df['cites'] = np.log1p(train_df['cites'].astype('float'))
train_df['doi_cites'] = np.log1p(train_df['doi_cites'].astype('float'))
test_df['doi_cites'] = np.log1p(test_df['doi_cites'].astype('float'))

train, test -> Merge

with timer('Data Merge'):
    
    # Add a flag so train/test can be separated again later
    train_df['flag'] = 0
    test_df['flag'] = 1

    whole_df = pd.concat([train_df,
                          test_df],axis=0).reset_index(drop=True)
<< Data Merge >> Start
<< Data Merge >> 1.6GB(+0.0GB):0.0sec

Title Embedding

def title_embedding(df:pd.DataFrame, pipeline:list, col:str='title') -> np.array:

    title_transformer_mean = []
    title_transformer_max = []
    

    ## Text cleaning (texthero pipeline)
    tmp = pd.DataFrame(hero.clean(df[col], pipeline))

    # BERT Model
    ## Much faster on a GPU
    DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f'device: {DEVICE}')
    model = BertModel.from_pretrained('bert-base-uncased')
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model.to(DEVICE)
    
    for idx, rows in tqdm(tmp.iterrows(), total=len(tmp)):
        
        title = rows[col]
        
        # Transformer
        input_ids = torch.tensor(tokenizer.encode(title, add_special_tokens=True)).unsqueeze(0)
        outputs = model(input_ids.to(DEVICE))
        
        emb_transformer = outputs[0].cpu().detach().numpy()
        emb_transformer_mean = np.mean(emb_transformer, axis=1)[0, :]
        emb_transformer_max = np.max(emb_transformer, axis=1)[0, :]
        
        title_transformer_mean.append(emb_transformer_mean)
        title_transformer_max.append(emb_transformer_max)
        
        
    title_transformer_mean = np.array(title_transformer_mean)
    title_transformer_max = np.array(title_transformer_max)
    
    print(title_transformer_mean.shape, title_transformer_max.shape)
    
    title_feat = np.concatenate([title_transformer_mean,
                                 title_transformer_max],
                                axis=1)

    del title_transformer_mean, title_transformer_max
    _ = gc.collect()
    
    return title_feat
title_pipeline = [
    hero_prep.remove_diacritics,
    hero_prep.lowercase,
    hero_prep.remove_stopwords,
    hero_prep.tokenize
]


with timer('Title Embedding'):
    print('---- Train ----')
    train_title = title_embedding(train_df, 
                                  title_pipeline,
                                  'title')
    print('---- Test ----')
    test_title = title_embedding(test_df,
                                 title_pipeline,
                                 'title')
<< Title Embedding >> Start
---- Train ----
device: cuda
  0%|          | 0/15117 [00:00<?, ?it/s]
(15117, 768) (15117, 768)
---- Test ----
device: cuda
  0%|          | 0/59084 [00:00<?, ?it/s]
(59084, 768) (59084, 768)
<< Title Embedding >> 4.1GB(+0.6GB):2102.7sec
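The per-row loop above takes roughly 35 minutes on a GPU for the ~74k title rows, because each sentence gets its own forward pass with gradients enabled. Below is a minimal sketch of a batched variant (hypothetical batched_embedding; it assumes truncating to BERT's 512-token limit and masked mean/max pooling are acceptable, which the loop above does not do), not the code that produced the numbers above.

import numpy as np
import torch
from transformers import BertTokenizer, BertModel

def batched_embedding(texts, batch_size=32, device='cuda'):
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertModel.from_pretrained('bert-base-uncased').to(device).eval()

    means, maxs = [], []
    with torch.no_grad():
        for i in range(0, len(texts), batch_size):
            enc = tokenizer(list(texts[i:i + batch_size]),
                            padding=True, truncation=True, max_length=512,
                            return_tensors='pt').to(device)
            hidden = model(**enc)[0]                    # (batch, tokens, 768)
            mask = enc['attention_mask'].unsqueeze(-1)  # (batch, tokens, 1)
            mean = (hidden * mask).sum(1) / mask.sum(1)                    # masked mean pooling
            maxv = hidden.masked_fill(mask == 0, -1e9).max(dim=1).values   # masked max pooling
            means.append(mean.cpu().numpy())
            maxs.append(maxv.cpu().numpy())
    return np.concatenate([np.vstack(means), np.vstack(maxs)], axis=1)

Usage would look like batched_embedding(tmp['title'].tolist()); the per-row version above is what was actually run here.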

Abstract Embedding

def abstract_embedding(df:pd.DataFrame, pipe_list:list, col:str='abstract') -> np.array:

    abstract_transformer_mean = []
    abstract_transformer_max = []
    
    ## Text cleaning (texthero pipeline)
    tmp = pd.DataFrame(hero.clean(df[col], pipe_list))
    
    # BERT Model
    DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f'device: {DEVICE}')
    model = BertModel.from_pretrained('bert-base-uncased')
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model.to(DEVICE)
    
    # Feature extraction
    for idx, rows in tqdm(tmp.iterrows(), total=len(tmp)):
        
        abstract = rows[col]

        # Transformer
        input_ids = torch.tensor(tokenizer.encode(abstract, add_special_tokens=True)).unsqueeze(0)
        outputs = model(input_ids.to(DEVICE))
        emb_transformer = outputs[0].cpu().detach().numpy()
        emb_transformer_mean = np.mean(emb_transformer, axis=1)[0, :]
        emb_transformer_max = np.max(emb_transformer, axis=1)[0, :]
        
        abstract_transformer_mean.append(emb_transformer_mean)
        abstract_transformer_max.append(emb_transformer_max)
        
        
    abstract_transformer_mean = np.array(abstract_transformer_mean)
    abstract_transformer_max = np.array(abstract_transformer_max)
    
    print(abstract_transformer_mean.shape, 
          abstract_transformer_max.shape)
    
    abstract_feat = np.concatenate([abstract_transformer_mean,
                                    abstract_transformer_max],
                                   axis=1)

    del abstract_transformer_mean, abstract_transformer_max
    _ = gc.collect()
    
    return abstract_feat
abstract_pipeline = [
    hero_prep.remove_diacritics,
    hero_prep.lowercase,
    hero_prep.remove_stopwords,
    hero_prep.tokenize
]


with timer('abstract Embedding'):
    print('---- Train ----')
    train_abstract = abstract_embedding(train_df, 
                                        abstract_pipeline,
                                        'abstract')
    
    print('---- Test ----')
    test_abstract = abstract_embedding(test_df,
                                       abstract_pipeline,
                                       'abstract')
    
    print(train_abstract.shape, test_abstract.shape)
<< abstract Embedding >> Start
---- Train ----
device: cuda
  0%|          | 0/15117 [00:00<?, ?it/s]
(15117, 768) (15117, 768)
---- Test ----
device: cuda
  0%|          | 0/59084 [00:00<?, ?it/s]
(59084, 768) (59084, 768)
(15117, 1536) (59084, 1536)
<< abstract Embedding >> 4.5GB(+0.4GB):2486.3sec
train_doi = train_df['doi_cites'].astype('float').values
test_doi = test_df['doi_cites'].astype('float').values
target = train_df['cites'].astype('float').values

train_doi = np.reshape(train_doi, (-1, 1))
test_doi = np.reshape(test_doi, (-1, 1))
target = np.reshape(target, (-1, 1))

print('----- Train -----')
print(f'title:{train_title.shape} | abstract:{train_abstract.shape} | doi:{train_doi.shape} | target:{target.shape}')
print('----- Test -----')
print(f'title:{test_title.shape} | abstract:{test_abstract.shape} | doi:{test_doi.shape}')
----- Train -----
title:(15117, 1536) | abstract:(15117, 1536) | doi:(15117, 1) | target:(15117, 1)
----- Test -----
title:(59084, 1536) | abstract:(59084, 1536) | doi:(59084, 1)

Dimensionality reduction of the embedding features with PCA

Searching for the number of components

  • Search for the number of components at which the cumulative explained variance ratio exceeds 0.9
from sklearn.decomposition import PCA

def detect_component(train_array:np.array, test_array:np.array, th:float=0.9) -> int:
    
    whole_array = np.concatenate([train_array, test_array], axis=0)
    
    pca = PCA(n_components=whole_array.shape[1])
    pca.fit(whole_array)
    
    plt.figure(figsize=(20, 5))
    plt.bar([n for n in range(1, len(pca.explained_variance_ratio_)+1)], pca.explained_variance_ratio_)
    plt.show()
    
    tmp = pd.DataFrame(pca.explained_variance_ratio_, columns=['evr'])
    tmp = tmp.cumsum()
    tmp = tmp[tmp['evr'] > th]
    
    print(f'Number of components where the cumulative explained variance exceeds {th*100:.1f}%: {tmp.index.tolist()[0]}')

    return tmp.index.tolist()[0]
title_component = detect_component(train_title, test_title, 0.9)
abs_component = detect_component(train_abstract, test_abstract, 0.9)
Number of components where the cumulative explained variance exceeds 90.0%: 377
Number of components where the cumulative explained variance exceeds 90.0%: 539
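For reference, the same value can be read directly off the cumulative explained-variance curve without the intermediate DataFrame. A sketch, assuming access to the fitted pca object from detect_component (note that tmp.index.tolist()[0] is the zero-based position of the first component crossing the threshold):

import numpy as np

evr_cumsum = np.cumsum(pca.explained_variance_ratio_)      # cumulative explained variance
idx = int(np.searchsorted(evr_cumsum, 0.9, side='right'))  # first index where the cumulative ratio exceeds 0.9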

Re-run PCA with the chosen number of components

def pca_trans(train:np.array, test:np.array, comp:int) -> (np.array, np.array):
    
    pca = PCA(n_components=comp, random_state=CONFIG.SEED)
    pca.fit(np.concatenate([train, test], axis=0))
    
    train_pca = pca.transform(train)
    test_pca = pca.transform(test)
    
    return train_pca, test_pca
with timer('PCA Transform'):
    train_title_pca, test_title_pca = pca_trans(train_title, test_title, title_component)
    train_abs_pca, test_abs_pca = pca_trans(train_abstract, test_abstract, abs_component)
    
    print(train_title_pca.shape, test_title_pca.shape)
    print(train_abs_pca.shape, test_abs_pca.shape)
<< PCA Transform >> Start
(15117, 377) (59084, 377)
(15117, 539) (59084, 539)
<< PCA Transform >> 5.3GB(+0.3GB):70.2sec

NN Model

DataSet

class TrainDataset:
    
    def __init__(self, features_1, features_2, target_doi, target_cites):
        
        self.features_1 = features_1
        self.features_2 = features_2
        self.target_doi = target_doi
        self.target_cites = target_cites
        
    def __len__(self):
        return self.features_1.shape[0]
    
    def __getitem__(self, idx):
        
        dct = {
            'x_1' : torch.tensor(self.features_1[idx, :],
                                 dtype=torch.float),
            'x_2' : torch.tensor(self.features_2[idx, :],
                                 dtype=torch.float),
            'doi' : torch.tensor(self.target_doi[idx, :],
                                 dtype=torch.float),
            'cites' : torch.tensor(self.target_cites[idx, :],
                                   dtype=torch.float),
        }
        
        return dct
class TestDataset:
    
    def __init__(self, features_1, features_2, target_doi):
        
        self.features_1 = features_1
        self.features_2 = features_2
        self.target_doi = target_doi
        
    def __len__(self):
        return self.features_1.shape[0]
    
    def __getitem__(self, idx):
        
        dct = {
            'x_1' : torch.tensor(self.features_1[idx, :],
                                 dtype=torch.float),
            'x_2' : torch.tensor(self.features_2[idx, :],
                                 dtype=torch.float),
            'doi' : torch.tensor(self.target_doi[idx, :],
                                 dtype=torch.float),
        }
        
        return dct
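A quick way to sanity-check the dataset classes (a sketch with tiny random arrays, not part of the original notebook): indexing should return a dict of 1-D float tensors.

import numpy as np

ds = TrainDataset(np.random.rand(4, 377), np.random.rand(4, 539),
                  np.random.rand(4, 1), np.random.rand(4, 1))
sample = ds[0]
print(sample['x_1'].shape, sample['x_2'].shape, sample['doi'].shape, sample['cites'].shape)
# torch.Size([377]) torch.Size([539]) torch.Size([1]) torch.Size([1])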

Train & Valid & Test Functions

def train_func(model, optimizer, scheduler, loss_fn, dataloader, device, epoch):
    
    model.train()
    final_loss = 0
    
    total_steps = EPOCHS * len(dataloader)
    
    for step, data in enumerate(dataloader):
        
        optimizer.zero_grad()
        
        title = data['x_1'].to(device)
        abstract = data['x_2'].to(device)
        doi = data['doi'].to(device)
        targets = data['cites'].to(device)

        output_cites = model(title, abstract, doi)
        
        loss = loss_fn(output_cites, targets)
        
        loss.backward()
        optimizer.step()
        scheduler.step()
        
        final_loss += loss.item()
        
    final_loss /= len(dataloader)
    
    return final_loss
def valid_func(model, loss_fn, dataloader, device):
    
    model.eval()
    final_loss = 0
    valid_cites_preds = []
    
    for step, data in enumerate(dataloader):
        
        title = data['x_1'].to(device)
        abstract = data['x_2'].to(device)
        doi = data['doi'].to(device)
        targets = data['cites'].to(device)

        output_cites = model(title, abstract, doi)
        
        loss = loss_fn(output_cites, targets)
        final_loss += loss.item()
        
        valid_cites_preds.append(output_cites.detach().cpu().numpy())
        
    final_loss /= len(dataloader)
    valid_cites_preds = np.concatenate(valid_cites_preds)
    
    return final_loss, valid_cites_preds
def test_func(model, dataloader, device):
    
    model.eval()
    preds = []
    
    
    for data in dataloader:
        
        title = data['x_1'].to(device)
        abstract = data['x_2'].to(device)
        doi = data['doi'].to(device)
        
        with torch.no_grad():
            outputs = model(title, abstract, doi)
        
        preds.append(outputs.detach().cpu().numpy())
                
    preds = np.concatenate(preds)
    
    return preds

Model

class TwoHeadModel(nn.Module):
    
    def __init__(self, num_features_1, num_features_2, num_doi, num_cites, hidden_size):
        
        
        super(TwoHeadModel, self).__init__()
        
        self.batch_norm_1 = nn.BatchNorm1d(num_features_1)
        self.dense_1 = nn.utils.weight_norm(nn.Linear(num_features_1, 
                                                      hidden_size))
        self.activation_1 = torch.nn.PReLU(num_parameters=hidden_size, 
                                           init=1.0)
        
        self.batch_norm_2 = nn.BatchNorm1d(num_features_2)
        self.dense_2 = nn.utils.weight_norm(nn.Linear(num_features_2, 
                                                      hidden_size))
        self.activation_2 = torch.nn.PReLU(num_parameters=hidden_size, init=1.0)
        
        self.batch_norm_3 = nn.BatchNorm1d(hidden_size*2 + num_doi)
        self.dropout_3 = nn.Dropout(0.3)
        self.dense_3 = nn.utils.weight_norm(nn.Linear(hidden_size*2 + num_doi,
                                                      num_cites))
        
        
    def forward(self, x1, x2, x3):
        
        x1 = self.batch_norm_1(x1)
        x1 = self.activation_1(self.dense_1(x1))
        
        x2 = self.batch_norm_2(x2)
        x2 = self.activation_2(self.dense_2(x2))
        
        x = torch.cat([x1, x2, x3], dim=1)
        
        x = self.batch_norm_3(x)
        x = self.dropout_3(x)
        x = self.dense_3(x)
    
        return x
## Train with RMSE on the log1p targets (i.e. RMSLE on the raw counts)
def RMSELoss(yhat,y):
    return torch.sqrt(torch.mean((yhat-y)**2))
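Since cites and doi_cites were log1p-transformed in the Feature Engineering step, minimizing this RMSE in log space is the same as minimizing RMSLE on the raw citation counts. A quick sanity check with made-up numbers (a sketch, not part of the original notebook):

import numpy as np
import torch

y_raw    = np.array([3.0, 10.0, 250.0])   # hypothetical raw citation counts
yhat_raw = np.array([5.0,  8.0, 300.0])   # hypothetical raw predictions

# RMSLE computed on the raw scale
rmsle = np.sqrt(np.mean((np.log1p(yhat_raw) - np.log1p(y_raw)) ** 2))

# RMSELoss applied to log1p-transformed tensors, as in the training loop below
rmse_log = RMSELoss(torch.log1p(torch.tensor(yhat_raw)),
                    torch.log1p(torch.tensor(y_raw))).item()

assert np.isclose(rmsle, rmse_log)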

Training

## Cross validation: StratifiedKFold, stratified on the integer part of log1p(cites)

DOI_INT = pd.Series(train_df['cites'].astype('float')).astype(int)
plt.hist(DOI_INT, bins=DOI_INT.nunique())
plt.show()
DEVICE = ('cuda' if torch.cuda.is_available() else 'cpu')

EPOCHS = 20
BATCH_SIZE = 64
LEARNING_RATE = 1e-3
WEIGHT_DECAY = 1e-5
NFOLDS = 7


num_features_1 = train_title_pca.shape[1]
num_features_2 = train_abs_pca.shape[1]
num_doi = train_doi.shape[1]
num_cites = target.shape[1]

print(f'num_features_1: {num_features_1} | num_features_2: {num_features_2}')
print(f'num_doi: {num_doi} | num_cites: {num_cites}')

hidden_size = 128

CV = StratifiedKFold(n_splits=CONFIG.NFOLDS, 
                     shuffle=True,
                     random_state=CONFIG.SEED)
num_features_1: 377 | num_features_2: 539
num_doi: 1 | num_cites: 1

oof = np.zeros((train_df.shape[0]))
preds = np.zeros((test_df.shape[0]))

for fold, (tr, te) in enumerate(CV.split(train_df, DOI_INT)):
    
    print('★'*40)
    print(f'Fold: {fold+1}')
    
    X_title_tr = train_title_pca[tr]
    X_title_te = train_title_pca[te]
    
    X_abstract_tr = train_abs_pca[tr]
    X_abstract_te = train_abs_pca[te]
    
    X_doi_tr = train_doi[tr]
    X_doi_te = train_doi[te]
    
    y_tr = target[tr]
    y_te = target[te]

    
    train_dataset = TrainDataset(X_title_tr, X_abstract_tr, X_doi_tr, y_tr)
    valid_dataset = TrainDataset(X_title_te, X_abstract_te, X_doi_te, y_te)
    
    trainloader = torch.utils.data.DataLoader(train_dataset, 
                                              batch_size=BATCH_SIZE, 
                                              shuffle=True)
    
    validloader = torch.utils.data.DataLoader(valid_dataset, 
                                              batch_size=BATCH_SIZE, 
                                              shuffle=False)
    
    model = TwoHeadModel(
        num_features_1 = num_features_1,
        num_features_2 = num_features_2,
        num_doi = num_doi,
        num_cites = num_cites,
        hidden_size = hidden_size,
    )
    
    model.to(DEVICE)
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=LEARNING_RATE,
                                 weight_decay=WEIGHT_DECAY)
    
    scheduler = optim.lr_scheduler.OneCycleLR(optimizer=optimizer,
                                              pct_start=0.1, 
                                              div_factor=1e3, 
                                              max_lr=1e-2, 
                                              epochs=EPOCHS, 
                                              steps_per_epoch=len(trainloader))
    
    loss_fn = RMSELoss
    best_loss = np.inf
    
    
    for epoch in range(EPOCHS):
        
        start_time = time.time()
        train_loss = train_func(model,
                                optimizer,
                                scheduler, 
                                loss_fn, 
                                trainloader,
                                DEVICE, 
                                epoch)
        
        
        valid_loss, valid_cites_preds = valid_func(model,
                                                   loss_fn,
                                                   validloader,
                                                   DEVICE)
        
            
        end_time = time.time()
        print(f"FOLD: {fold+1} | EPOCH:{epoch+1:02d} | train_loss:{train_loss:.6f} | valid_loss:{valid_loss:.6f} | time:{end_time-start_time:.1f}s ")

        if valid_loss < best_loss:
            best_loss = valid_loss
            oof[te] = valid_cites_preds[:, 0]
            torch.save(model.state_dict(),
                       f"{MODEL_DIR}SimpleMLP_{fold+1}.pth")
            
        else:
            continue
            
    
    testdataset = TestDataset(test_title_pca, test_abs_pca, test_doi)
    testloader = torch.utils.data.DataLoader(testdataset,
                                             batch_size=BATCH_SIZE,
                                             shuffle=False)
    
    model = TwoHeadModel(
        num_features_1 = num_features_1,
        num_features_2 = num_features_2,
        num_doi = num_doi,
        num_cites = num_cites,
        hidden_size = hidden_size,
    )
    
    model.load_state_dict(torch.load(f"{MODEL_DIR}SimpleMLP_{fold+1}.pth"))
    model.to(DEVICE)
    
    preds += test_func(model, testloader, DEVICE)[:, 0]/NFOLDS
★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★
Fold: 1
FOLD: 1 | EPOCH:01 | train_loss:2.346136 | valid_loss:1.039456 | time:3.7s 
FOLD: 1 | EPOCH:02 | train_loss:0.917533 | valid_loss:0.683612 | time:3.6s 
FOLD: 1 | EPOCH:03 | train_loss:0.863786 | valid_loss:0.680757 | time:3.7s 
FOLD: 1 | EPOCH:04 | train_loss:0.835902 | valid_loss:0.662268 | time:3.3s 
FOLD: 1 | EPOCH:05 | train_loss:0.830839 | valid_loss:0.646799 | time:3.6s 
FOLD: 1 | EPOCH:06 | train_loss:0.819551 | valid_loss:0.679971 | time:3.7s 
FOLD: 1 | EPOCH:07 | train_loss:0.824818 | valid_loss:0.669431 | time:3.4s 
FOLD: 1 | EPOCH:08 | train_loss:0.823782 | valid_loss:0.695271 | time:3.6s 
FOLD: 1 | EPOCH:09 | train_loss:0.826026 | valid_loss:0.667164 | time:3.8s 
FOLD: 1 | EPOCH:10 | train_loss:0.810591 | valid_loss:0.679541 | time:3.5s 
FOLD: 1 | EPOCH:11 | train_loss:0.819718 | valid_loss:0.681857 | time:2.8s 
FOLD: 1 | EPOCH:12 | train_loss:0.800127 | valid_loss:0.672954 | time:2.3s 
FOLD: 1 | EPOCH:13 | train_loss:0.787671 | valid_loss:0.687458 | time:2.5s 
FOLD: 1 | EPOCH:14 | train_loss:0.780256 | valid_loss:0.701777 | time:4.1s 
FOLD: 1 | EPOCH:15 | train_loss:0.752611 | valid_loss:0.711099 | time:3.8s 
FOLD: 1 | EPOCH:16 | train_loss:0.725169 | valid_loss:0.733798 | time:3.6s 
FOLD: 1 | EPOCH:17 | train_loss:0.701010 | valid_loss:0.738370 | time:3.6s 
FOLD: 1 | EPOCH:18 | train_loss:0.690245 | valid_loss:0.758462 | time:3.4s 
FOLD: 1 | EPOCH:19 | train_loss:0.676085 | valid_loss:0.758223 | time:3.8s 
FOLD: 1 | EPOCH:20 | train_loss:0.682744 | valid_loss:0.757722 | time:3.4s 
★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★
Fold: 2
FOLD: 2 | EPOCH:01 | train_loss:2.268256 | valid_loss:0.951040 | time:3.5s 
FOLD: 2 | EPOCH:02 | train_loss:0.923570 | valid_loss:0.699678 | time:3.4s 
FOLD: 2 | EPOCH:03 | train_loss:0.849738 | valid_loss:0.681367 | time:3.6s 
FOLD: 2 | EPOCH:04 | train_loss:0.840315 | valid_loss:0.675227 | time:3.1s 
FOLD: 2 | EPOCH:05 | train_loss:0.832623 | valid_loss:0.672421 | time:2.3s 
FOLD: 2 | EPOCH:06 | train_loss:0.822668 | valid_loss:0.678278 | time:2.5s 
FOLD: 2 | EPOCH:07 | train_loss:0.822654 | valid_loss:0.685254 | time:4.0s 
FOLD: 2 | EPOCH:08 | train_loss:0.812432 | valid_loss:0.658895 | time:3.5s 
FOLD: 2 | EPOCH:09 | train_loss:0.807763 | valid_loss:0.682751 | time:3.3s 
FOLD: 2 | EPOCH:10 | train_loss:0.804930 | valid_loss:0.687398 | time:3.3s 
FOLD: 2 | EPOCH:11 | train_loss:0.800834 | valid_loss:0.687234 | time:3.4s 
FOLD: 2 | EPOCH:12 | train_loss:0.786160 | valid_loss:0.708299 | time:3.2s 
FOLD: 2 | EPOCH:13 | train_loss:0.767990 | valid_loss:0.704966 | time:3.7s 
FOLD: 2 | EPOCH:14 | train_loss:0.757462 | valid_loss:0.721444 | time:3.8s 
FOLD: 2 | EPOCH:15 | train_loss:0.738536 | valid_loss:0.744819 | time:3.2s 
FOLD: 2 | EPOCH:16 | train_loss:0.719331 | valid_loss:0.748718 | time:3.5s 
FOLD: 2 | EPOCH:17 | train_loss:0.691645 | valid_loss:0.771285 | time:3.4s 
FOLD: 2 | EPOCH:18 | train_loss:0.684977 | valid_loss:0.778157 | time:2.8s 
FOLD: 2 | EPOCH:19 | train_loss:0.680941 | valid_loss:0.774300 | time:2.3s 
FOLD: 2 | EPOCH:20 | train_loss:0.665683 | valid_loss:0.774228 | time:2.7s 
★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★
Fold: 3
FOLD: 3 | EPOCH:01 | train_loss:2.292497 | valid_loss:0.962321 | time:4.3s 
FOLD: 3 | EPOCH:02 | train_loss:0.924057 | valid_loss:0.701687 | time:3.9s 
FOLD: 3 | EPOCH:03 | train_loss:0.854219 | valid_loss:0.699506 | time:3.9s 
FOLD: 3 | EPOCH:04 | train_loss:0.837900 | valid_loss:0.684892 | time:3.5s 
FOLD: 3 | EPOCH:05 | train_loss:0.835563 | valid_loss:0.665736 | time:3.7s 
FOLD: 3 | EPOCH:06 | train_loss:0.827823 | valid_loss:0.665580 | time:3.5s 
FOLD: 3 | EPOCH:07 | train_loss:0.810266 | valid_loss:0.675681 | time:3.9s 
FOLD: 3 | EPOCH:08 | train_loss:0.813502 | valid_loss:0.681831 | time:4.0s 
FOLD: 3 | EPOCH:09 | train_loss:0.811910 | valid_loss:0.671052 | time:3.9s 
FOLD: 3 | EPOCH:10 | train_loss:0.804153 | valid_loss:0.670744 | time:3.6s 
FOLD: 3 | EPOCH:11 | train_loss:0.813063 | valid_loss:0.687331 | time:3.4s 
FOLD: 3 | EPOCH:12 | train_loss:0.797916 | valid_loss:0.707993 | time:4.3s 
FOLD: 3 | EPOCH:13 | train_loss:0.794149 | valid_loss:0.716200 | time:4.6s 
FOLD: 3 | EPOCH:14 | train_loss:0.763399 | valid_loss:0.712297 | time:4.5s 
FOLD: 3 | EPOCH:15 | train_loss:0.750802 | valid_loss:0.730693 | time:4.1s 
FOLD: 3 | EPOCH:16 | train_loss:0.717734 | valid_loss:0.743486 | time:3.6s 
FOLD: 3 | EPOCH:17 | train_loss:0.695795 | valid_loss:0.761950 | time:3.6s 
FOLD: 3 | EPOCH:18 | train_loss:0.681331 | valid_loss:0.768960 | time:3.5s 
FOLD: 3 | EPOCH:19 | train_loss:0.670938 | valid_loss:0.774660 | time:3.5s 
FOLD: 3 | EPOCH:20 | train_loss:0.674210 | valid_loss:0.769835 | time:3.6s 
★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★
Fold: 4
FOLD: 4 | EPOCH:01 | train_loss:2.253462 | valid_loss:0.955478 | time:3.6s 
FOLD: 4 | EPOCH:02 | train_loss:0.922848 | valid_loss:0.720584 | time:3.5s 
FOLD: 4 | EPOCH:03 | train_loss:0.850577 | valid_loss:0.666364 | time:3.6s 
FOLD: 4 | EPOCH:04 | train_loss:0.829753 | valid_loss:0.694528 | time:3.6s 
FOLD: 4 | EPOCH:05 | train_loss:0.829714 | valid_loss:0.688441 | time:3.6s 
FOLD: 4 | EPOCH:06 | train_loss:0.811148 | valid_loss:0.667915 | time:3.6s 
FOLD: 4 | EPOCH:07 | train_loss:0.816485 | valid_loss:0.677920 | time:3.5s 
FOLD: 4 | EPOCH:08 | train_loss:0.808134 | valid_loss:0.680463 | time:3.5s 
FOLD: 4 | EPOCH:09 | train_loss:0.811411 | valid_loss:0.679146 | time:3.6s 
FOLD: 4 | EPOCH:10 | train_loss:0.818291 | valid_loss:0.691657 | time:3.6s 
FOLD: 4 | EPOCH:11 | train_loss:0.807316 | valid_loss:0.692099 | time:3.7s 
FOLD: 4 | EPOCH:12 | train_loss:0.798671 | valid_loss:0.691802 | time:3.6s 
FOLD: 4 | EPOCH:13 | train_loss:0.778866 | valid_loss:0.697141 | time:3.6s 
FOLD: 4 | EPOCH:14 | train_loss:0.765021 | valid_loss:0.715104 | time:3.5s 
FOLD: 4 | EPOCH:15 | train_loss:0.746003 | valid_loss:0.736405 | time:3.7s 
FOLD: 4 | EPOCH:16 | train_loss:0.724310 | valid_loss:0.744852 | time:3.6s 
FOLD: 4 | EPOCH:17 | train_loss:0.695327 | valid_loss:0.749416 | time:3.5s 
FOLD: 4 | EPOCH:18 | train_loss:0.695899 | valid_loss:0.771669 | time:3.5s 
FOLD: 4 | EPOCH:19 | train_loss:0.685250 | valid_loss:0.772929 | time:3.5s 
FOLD: 4 | EPOCH:20 | train_loss:0.670131 | valid_loss:0.772308 | time:3.5s 
★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★
Fold: 5
FOLD: 5 | EPOCH:01 | train_loss:2.285175 | valid_loss:0.945952 | time:3.5s 
FOLD: 5 | EPOCH:02 | train_loss:0.927173 | valid_loss:0.718184 | time:3.5s 
FOLD: 5 | EPOCH:03 | train_loss:0.847746 | valid_loss:0.673975 | time:3.5s 
FOLD: 5 | EPOCH:04 | train_loss:0.837297 | valid_loss:0.669784 | time:3.7s 
FOLD: 5 | EPOCH:05 | train_loss:0.821963 | valid_loss:0.665123 | time:3.6s 
FOLD: 5 | EPOCH:06 | train_loss:0.819732 | valid_loss:0.669295 | time:3.4s 
FOLD: 5 | EPOCH:07 | train_loss:0.811684 | valid_loss:0.677448 | time:3.5s 
FOLD: 5 | EPOCH:08 | train_loss:0.820014 | valid_loss:0.672548 | time:3.5s 
FOLD: 5 | EPOCH:09 | train_loss:0.810965 | valid_loss:0.678159 | time:3.5s 
FOLD: 5 | EPOCH:10 | train_loss:0.799571 | valid_loss:0.690159 | time:3.5s 
FOLD: 5 | EPOCH:11 | train_loss:0.807466 | valid_loss:0.679499 | time:3.8s 
FOLD: 5 | EPOCH:12 | train_loss:0.787137 | valid_loss:0.693354 | time:3.7s 
FOLD: 5 | EPOCH:13 | train_loss:0.784178 | valid_loss:0.713389 | time:4.0s 
FOLD: 5 | EPOCH:14 | train_loss:0.766159 | valid_loss:0.709375 | time:2.4s 
FOLD: 5 | EPOCH:15 | train_loss:0.747560 | valid_loss:0.723371 | time:3.0s 
FOLD: 5 | EPOCH:16 | train_loss:0.721054 | valid_loss:0.735085 | time:2.9s 
FOLD: 5 | EPOCH:17 | train_loss:0.709306 | valid_loss:0.748228 | time:2.7s 
FOLD: 5 | EPOCH:18 | train_loss:0.700590 | valid_loss:0.756795 | time:2.6s 
FOLD: 5 | EPOCH:19 | train_loss:0.691556 | valid_loss:0.750856 | time:2.3s 
FOLD: 5 | EPOCH:20 | train_loss:0.694626 | valid_loss:0.746628 | time:2.5s 
★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★
Fold: 6
FOLD: 6 | EPOCH:01 | train_loss:2.284253 | valid_loss:0.937912 | time:2.3s 
FOLD: 6 | EPOCH:02 | train_loss:0.919130 | valid_loss:0.699395 | time:2.3s 
FOLD: 6 | EPOCH:03 | train_loss:0.848985 | valid_loss:0.688490 | time:2.3s 
FOLD: 6 | EPOCH:04 | train_loss:0.838046 | valid_loss:0.695203 | time:2.2s 
FOLD: 6 | EPOCH:05 | train_loss:0.828637 | valid_loss:0.691719 | time:2.4s 
FOLD: 6 | EPOCH:06 | train_loss:0.823461 | valid_loss:0.684958 | time:2.4s 
FOLD: 6 | EPOCH:07 | train_loss:0.807477 | valid_loss:0.686002 | time:2.4s 
FOLD: 6 | EPOCH:08 | train_loss:0.814183 | valid_loss:0.696132 | time:2.5s 
FOLD: 6 | EPOCH:09 | train_loss:0.801512 | valid_loss:0.703734 | time:2.4s 
FOLD: 6 | EPOCH:10 | train_loss:0.813860 | valid_loss:0.703319 | time:2.3s 
FOLD: 6 | EPOCH:11 | train_loss:0.798274 | valid_loss:0.686019 | time:2.4s 
FOLD: 6 | EPOCH:12 | train_loss:0.798365 | valid_loss:0.698568 | time:2.3s 
FOLD: 6 | EPOCH:13 | train_loss:0.777323 | valid_loss:0.708754 | time:2.2s 
FOLD: 6 | EPOCH:14 | train_loss:0.765338 | valid_loss:0.711898 | time:2.2s 
FOLD: 6 | EPOCH:15 | train_loss:0.747511 | valid_loss:0.738144 | time:2.2s 
FOLD: 6 | EPOCH:16 | train_loss:0.726277 | valid_loss:0.749122 | time:2.2s 
FOLD: 6 | EPOCH:17 | train_loss:0.704968 | valid_loss:0.751241 | time:2.2s 
FOLD: 6 | EPOCH:18 | train_loss:0.690608 | valid_loss:0.755787 | time:2.2s 
FOLD: 6 | EPOCH:19 | train_loss:0.670668 | valid_loss:0.765387 | time:2.2s 
FOLD: 6 | EPOCH:20 | train_loss:0.673743 | valid_loss:0.786040 | time:2.2s 
★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★
Fold: 7
FOLD: 7 | EPOCH:01 | train_loss:2.296701 | valid_loss:0.924307 | time:2.2s 
FOLD: 7 | EPOCH:02 | train_loss:0.933435 | valid_loss:0.689047 | time:2.2s 
FOLD: 7 | EPOCH:03 | train_loss:0.850991 | valid_loss:0.659872 | time:2.2s 
FOLD: 7 | EPOCH:04 | train_loss:0.841571 | valid_loss:0.671413 | time:2.2s 
FOLD: 7 | EPOCH:05 | train_loss:0.818740 | valid_loss:0.685985 | time:2.2s 
FOLD: 7 | EPOCH:06 | train_loss:0.826990 | valid_loss:0.676457 | time:2.2s 
FOLD: 7 | EPOCH:07 | train_loss:0.809806 | valid_loss:0.684370 | time:2.1s 
FOLD: 7 | EPOCH:08 | train_loss:0.818394 | valid_loss:0.666673 | time:2.2s 
FOLD: 7 | EPOCH:09 | train_loss:0.811473 | valid_loss:0.664205 | time:2.2s 
FOLD: 7 | EPOCH:10 | train_loss:0.818371 | valid_loss:0.674103 | time:2.2s 
FOLD: 7 | EPOCH:11 | train_loss:0.807668 | valid_loss:0.651558 | time:2.2s 
FOLD: 7 | EPOCH:12 | train_loss:0.799247 | valid_loss:0.674729 | time:2.2s 
FOLD: 7 | EPOCH:13 | train_loss:0.785723 | valid_loss:0.677723 | time:2.1s 
FOLD: 7 | EPOCH:14 | train_loss:0.765225 | valid_loss:0.722652 | time:2.2s 
FOLD: 7 | EPOCH:15 | train_loss:0.748908 | valid_loss:0.725328 | time:2.1s 
FOLD: 7 | EPOCH:16 | train_loss:0.724189 | valid_loss:0.737869 | time:2.1s 
FOLD: 7 | EPOCH:17 | train_loss:0.702457 | valid_loss:0.759617 | time:2.2s 
FOLD: 7 | EPOCH:18 | train_loss:0.690418 | valid_loss:0.767010 | time:2.2s 
FOLD: 7 | EPOCH:19 | train_loss:0.680520 | valid_loss:0.763514 | time:2.2s 
FOLD: 7 | EPOCH:20 | train_loss:0.668650 | valid_loss:0.771165 | time:2.2s 
rmse = np.sqrt(mean_squared_error(target, oof))
print(f'CV : {rmse:.6f} ')
CV : 0.668040 
fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(target[:, 0], label='Ground truth', kde=True, color='orange',
             stat="density", common_norm=False, alpha=0.3)
sns.histplot(oof, label='OOF', kde=True,
             stat="density", common_norm=False, alpha=0.3)
ax.legend()
ax.grid()
plt.show()
  • The predictions cluster around the mean
### Check summary statistics of the predictions
pd.DataFrame(np.expm1(preds)).describe().T
count mean std min 25% 50% 75% max
0 59084.0 16.914278 17.470612 0.941772 6.85601 12.137255 21.098681 690.938788

Save OOF and submission

def make_oof_sub(oof:np.array, pred:np.array, name:str):
    
    oof_res = np.expm1(oof)
    oof_res = pd.DataFrame(oof_res, columns=['OOF'])
    oof_res.to_csv(f'{OUTPUT_DIR}{name}_oof.csv', index=False)
    print(f'{OUTPUT_DIR}{name}_oof.csv is saved.')
    
    preds_res = np.expm1(pred)
    test_df = pd.read_json(os.path.join(INPUT_DIR, 'test_data.json'), lines=True)
    preds_res = pd.concat([pd.DataFrame({ 'id': test_df['id'] }), 
                           pd.DataFrame({ 'cites': preds_res })], 
                          axis=1)
    preds_res.to_csv(f'{OUTPUT_DIR}{name}_sub.csv', index=False)
    print(f'{OUTPUT_DIR}{name}_sub.csv is saved.')
with timer('make oof sub'):
    make_oof_sub(oof, preds, 'Simple_NN')
<< make oof sub >> Start
../data/interim/2021_03_04/Simple_NN_oof.csv is saved.
../data/interim/2021_03_04/Simple_NN_sub.csv is saved.
<< make oof sub >> 5.4GB(+0.0GB):3.7sec

Attachments

  • NN_Baseline.ipynb