論文の被引用数予測 (Predicting Paper Citation Counts)

What wording maximizes the value of research?

Prize: 100,000 | Participants: 182 | Ended about 3 years ago

NN Baseline (CV: 0.668040 | LB: 0.656322)

・ NN on title and abstract embeddings plus the doi_cites value
・ Embeddings are extracted with huggingface/transformers
・ CV: 0.668040 | LB: 0.656322

Setting

import warnings
warnings.filterwarnings('ignore')

import os
import re
import gc
import sys
import time
import glob
import math
import json
import pickle
import joblib
import random
import string
import psutil

from pathlib import Path
from datetime import datetime
from tqdm.autonotebook import tqdm
from contextlib import contextmanager

import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None)
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold
from texthero import preprocessing as hero_prep
import texthero as hero

import matplotlib.pyplot as plt
plt.style.use('ggplot')
import japanize_matplotlib
import seaborn as sns

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from transformers import BertTokenizer, BertModel

Fix random seeds

def seed_everything(seed: int = 42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True

Directory

INPUT_DIR = '/mnt/work/data/ProbSpace'
OUTPUT_DIR = f'../data/interim/{datetime.now():%Y_%m_%d}/'
MODEL_DIR = f'../models/{datetime.now():%Y_%m_%d}/'
# DATA_DIR = '/mnt/work/data/ProbSpace/shimizu/'

os.makedirs(OUTPUT_DIR, exist_ok=True)
os.makedirs(MODEL_DIR, exist_ok=True)

print(f'Output dir: {OUTPUT_DIR}')
print(f'Model dir: {MODEL_DIR}')
Output dir: ../data/interim/2021_03_04/
Model dir: ../models/2021_03_04/

timer

@contextmanager
def timer(name:str, slack:bool=True):
    t0 = time.time()
    p = psutil.Process(os.getpid())
    m0 = p.memory_info()[0] / 2. ** 30
    print(f'<< {name} >> Start')
    yield
    
    m1 = p.memory_info()[0] / 2. ** 30
    delta = m1 - m0
    sign = '+' if delta >= 0 else '-'
    delta = math.fabs(delta)
    
    print(f"<< {name} >> {m1:.1f}GB({sign}{delta:.1f}GB):{time.time() - t0:.1f}sec", file=sys.stderr)

Params

class CONFIG:
    SEED = 42
    NFOLDS = 7
    DEVICE = ('cuda' if torch.cuda.is_available() else 'cpu')

Data Load

def Load_rowjson(file_path:str, cols:list, train:bool=True) -> pd.DataFrame:
    
    df = pd.DataFrame(columns = cols)
    
    with open(file_path, 'r') as f:
        if train==True:
            for i, json_line in tqdm(enumerate(f)):
                data = json.loads(json_line)
                if 'cites' in data:
                    for col in cols:
                        df.at[i, col] = data[col]

                else:
                    continue

        else:
            cols.remove('cites')
            for i, json_line in tqdm(enumerate(f)):
                data = json.loads(json_line)
                for col in cols:
                    df.at[i, col] = data[col]

    df = df.reset_index(drop=True)
    
    return df
with timer('Load Data'):
    
    
    df_cols = ["title", "abstract", "doi_cites", "cites"]
    
    train_df = Load_rowjson(os.path.join(INPUT_DIR, 'train_data.json'), df_cols, train=True)
    test_df = pd.read_json(os.path.join(INPUT_DIR, 'test_data.json'), lines=True)
    test_df = test_df[["title", "abstract", "doi_cites"]]
    
    print('train_df', train_df.shape)
    print('test_df', test_df.shape)
<< Load Data >> Start
train_df (15117, 4)
test_df (59084, 3)
<< Load Data >> 1.6GB(+0.5GB):61.1sec

Feature Engineering

Log transform

# log1p-transform the citation counts (cites and doi_cites)
train_df['cites'] = np.log1p(train_df['cites'].astype('float'))
train_df['doi_cites'] = np.log1p(train_df['doi_cites'].astype('float'))
test_df['doi_cites'] = np.log1p(test_df['doi_cites'].astype('float'))

train, test -> Merge

with timer('Data Merge'):
    
    # add a flag so train and test can be split apart again later
    train_df['flag'] = 0
    test_df['flag'] = 1

    whole_df = pd.concat([train_df,
                          test_df],axis=0).reset_index(drop=True)
<< Data Merge >> Start
<< Data Merge >> 1.6GB(+0.0GB):0.0sec

Title Embedding

def title_embedding(df:pd.DataFrame, pipeline:list, col:str='title') -> np.array:

    title_transformer_mean = []
    title_transformer_max = []
    

    ## text cleaning with the texthero pipeline
    tmp = pd.DataFrame(hero.clean(df[col], pipeline))

    # BERT Model
    ## much faster on a GPU
    DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f'device: {DEVICE}')
    model = BertModel.from_pretrained('bert-base-uncased')
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model.to(DEVICE)
    
    for idx, rows in tqdm(tmp.iterrows(), total=len(tmp)):
        
        title = rows[col]
        
        # Transformer
        input_ids = torch.tensor(tokenizer.encode(title, add_special_tokens=True)).unsqueeze(0)
        outputs = model(input_ids.to(DEVICE))
        
        emb_transformer = outputs[0].cpu().detach().numpy()
        emb_transformer_mean = np.mean(emb_transformer, axis=1)[0, :]
        emb_transformer_max = np.max(emb_transformer, axis=1)[0, :]
        
        title_transformer_mean.append(emb_transformer_mean)
        title_transformer_max.append(emb_transformer_max)
        
        
    title_transformer_mean = np.array(title_transformer_mean)
    title_transformer_max = np.array(title_transformer_max)
    
    print(title_transformer_mean.shape, title_transformer_max.shape)
    
    title_feat = np.concatenate([title_transformer_mean,
                                 title_transformer_max],
                                axis=1)

    del title_transformer_mean, title_transformer_max
    _ = gc.collect()
    
    return title_feat
title_pipeline = [
    hero_prep.remove_diacritics,
    hero_prep.lowercase,
    hero_prep.remove_stopwords,
    hero_prep.tokenize
]


with timer('Title Embedding'):
    print('---- Train ----')
    train_title = title_embedding(train_df, 
                                  title_pipeline,
                                  'title')
    print('---- Test ----')
    test_title = title_embedding(test_df,
                                 title_pipeline,
                                 'title')
<< Title Embedding >> Start
---- Train ----
device: cuda
(15117, 768) (15117, 768)
---- Test ----
device: cuda
(59084, 768) (59084, 768)
<< Title Embedding >> 4.1GB(+0.6GB):2102.7sec

abstract embedding

def abstract_embedding(df:pd.DataFrame, pipe_list:list, col:str='abstract') -> np.array:

    abstract_transformer_mean = []
    abstract_transformer_max = []
    
    ## text cleaning with the texthero pipeline
    tmp = pd.DataFrame(hero.clean(df[col], pipe_list))
    
    # BERT Model
    DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f'device: {DEVICE}')
    model = BertModel.from_pretrained('bert-base-uncased')
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model.to(DEVICE)
    
    # feature extraction
    for idx, rows in tqdm(tmp.iterrows(), total=len(tmp)):
        
        abstract = rows[col]

        # Transformer
        input_ids = torch.tensor(tokenizer.encode(abstract, add_special_tokens=True)).unsqueeze(0)
        outputs = model(input_ids.to(DEVICE))
        emb_transformer = outputs[0].cpu().detach().numpy()
        emb_transformer_mean = np.mean(emb_transformer, axis=1)[0, :]
        emb_transformer_max = np.max(emb_transformer, axis=1)[0, :]
        
        abstract_transformer_mean.append(emb_transformer_mean)
        abstract_transformer_max.append(emb_transformer_max)
        
        
    abstract_transformer_mean = np.array(abstract_transformer_mean)
    abstract_transformer_max = np.array(abstract_transformer_max)
    
    print(abstract_transformer_mean.shape, 
          abstract_transformer_max.shape)
    
    abstract_feat = np.concatenate([abstract_transformer_mean,
                                    abstract_transformer_max],
                                   axis=1)

    del abstract_transformer_mean, abstract_transformer_max
    _ = gc.collect()
    
    return abstract_feat
abstract_pipeline = [
    hero_prep.remove_diacritics,
    hero_prep.lowercase,
    hero_prep.remove_stopwords,
    hero_prep.tokenize
]


with timer('abstract Embedding'):
    print('---- Train ----')
    train_abstract = abstract_embedding(train_df, 
                                        abstract_pipeline,
                                        'abstract')
    
    print('---- Test ----')
    test_abstract = abstract_embedding(test_df,
                                       abstract_pipeline,
                                       'abstract')
    
    print(train_abstract.shape, test_abstract.shape)
<< abstract Embedding >> Start
---- Train ----
device: cuda
(15117, 768) (15117, 768)
---- Test ----
device: cuda
(59084, 768) (59084, 768)
(15117, 1536) (59084, 1536)
<< abstract Embedding >> 4.5GB(+0.4GB):2486.3sec
train_doi = train_df['doi_cites'].astype('float').values
test_doi = test_df['doi_cites'].astype('float').values
target = train_df['cites'].astype('float').values

train_doi = np.reshape(train_doi, (-1, 1))
test_doi = np.reshape(test_doi, (-1, 1))
target = np.reshape(target, (-1, 1))

print('----- Train -----')
print(f'title:{train_title.shape} | abstract:{train_abstract.shape} | doi:{train_doi.shape} | target:{target.shape}')
print('----- Test -----')
print(f'title:{test_title.shape} | abstract:{test_abstract.shape} | doi:{test_doi.shape}')
----- Train -----
title:(15117, 1536) | abstract:(15117, 1536) | doi:(15117, 1) | target:(15117, 1)
----- Test -----
title:(59084, 1536) | abstract:(59084, 1536) | doi:(59084, 1)

Dimensionality reduction of the embedding features with PCA

Searching for the number of components

  • Find the number of components at which the cumulative explained variance ratio exceeds 0.9
from sklearn.decomposition import PCA

def detect_component(train_array:np.array, test_array:np.array, th:float=0.9) -> int:
    
    whole_array = np.concatenate([train_array, test_array], axis=0)
    
    pca = PCA(n_components=whole_array.shape[1])
    pca.fit(whole_array)
    
    plt.figure(figsize=(20, 5))
    plt.bar([n for n in range(1, len(pca.explained_variance_ratio_)+1)], pca.explained_variance_ratio_)
    plt.show()
    
    tmp = pd.DataFrame(pca.explained_variance_ratio_, columns=['ratio'])
    tmp = tmp.cumsum()
    tmp = tmp[tmp['ratio'] > th]
    
    print(f'Number of components needed to exceed a cumulative explained variance of {th*100:.1f}%: {tmp.index.tolist()[0]}')

    return tmp.index.tolist()[0]
title_component = detect_component(train_title, test_title, 0.9)
abs_component = detect_component(train_abstract, test_abstract, 0.9)
Number of components needed to exceed a cumulative explained variance of 90.0%: 377
Number of components needed to exceed a cumulative explained variance of 90.0%: 539
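
For reference, the same cut-off can be read directly from the cumulative explained-variance curve without building a DataFrame. A minimal sketch (detect_component_np is a hypothetical helper, reusing a PCA fitted as above):

def detect_component_np(explained_variance_ratio: np.ndarray, th: float = 0.9) -> int:
    # index of the first component at which the cumulative ratio exceeds the threshold
    cumulative = np.cumsum(explained_variance_ratio)
    return int(np.argmax(cumulative > th))

# e.g. detect_component_np(pca.explained_variance_ratio_, 0.9)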

Re-run PCA with the selected number of components

def pca_trans(train:np.array, test:np.array, comp:int) -> (np.array, np.array):
    
    pca = PCA(n_components=comp, random_state=CONFIG.SEED)
    pca.fit(np.concatenate([train, test], axis=0))
    
    train_pca = pca.transform(train)
    test_pca = pca.transform(test)
    
    return train_pca, test_pca
with timer('PCA Transform'):
    train_title_pca, test_title_pca = pca_trans(train_title, test_title, title_component)
    train_abs_pca, test_abs_pca = pca_trans(train_abstract, test_abstract, abs_component)
    
    print(train_title_pca.shape, test_title_pca.shape)
    print(train_abs_pca.shape, test_abs_pca.shape)
<< PCA Transform >> Start
(15117, 377) (59084, 377)
(15117, 539) (59084, 539)
<< PCA Transform >> 5.3GB(+0.3GB):70.2sec

NN Model

DataSet

class TrainDataset:
    
    def __init__(self, features_1, features_2, target_doi, target_cites):
        
        self.features_1 = features_1
        self.features_2 = features_2
        self.target_doi = target_doi
        self.target_cites = target_cites
        
    def __len__(self):
        return self.features_1.shape[0]
    
    def __getitem__(self, idx):
        
        dct = {
            'x_1' : torch.tensor(self.features_1[idx, :],
                                 dtype=torch.float),
            'x_2' : torch.tensor(self.features_2[idx, :],
                                 dtype=torch.float),
            'doi' : torch.tensor(self.target_doi[idx, :],
                                 dtype=torch.float),
            'cites' : torch.tensor(self.target_cites[idx, :],
                                   dtype=torch.float),
        }
        
        return dct
class TestDataset:
    
    def __init__(self, features_1, features_2, target_doi):
        
        self.features_1 = features_1
        self.features_2 = features_2
        self.target_doi = target_doi
        
    def __len__(self):
        return self.features_1.shape[0]
    
    def __getitem__(self, idx):
        
        dct = {
            'x_1' : torch.tensor(self.features_1[idx, :],
                                 dtype=torch.float),
            'x_2' : torch.tensor(self.features_2[idx, :],
                                 dtype=torch.float),
            'doi' : torch.tensor(self.target_doi[idx, :],
                                 dtype=torch.float),
        }
        
        return dct

Train & Valid & Test Function

def train_func(model, optimizer, scheduler, loss_fn, dataloader, device, epoch):
    
    model.train()
    final_loss = 0
    
    total_steps = EPOCHS * len(dataloader)
    
    for step, data in enumerate(dataloader):
        
        optimizer.zero_grad()
        
        title = data['x_1'].to(device)
        abstract = data['x_2'].to(device)
        doi = data['doi'].to(device)
        targets = data['cites'].to(device)

        output_cites = model(title, abstract, doi)
        
        loss = loss_fn(output_cites, targets)
        
        loss.backward()
        optimizer.step()
        scheduler.step()
        
        final_loss += loss.item()
        
    final_loss /= len(dataloader)
    
    return final_loss
def valid_func(model, loss_fn, dataloader, device):
    
    model.eval()
    final_loss = 0
    valid_cites_preds = []
    
    for step, data in enumerate(dataloader):
        
        title = data['x_1'].to(device)
        abstract = data['x_2'].to(device)
        doi = data['doi'].to(device)
        targets = data['cites'].to(device)

        output_cites = model(title, abstract, doi)
        
        loss = loss_fn(output_cites, targets)
        final_loss += loss.item()
        
        valid_cites_preds.append(output_cites.detach().cpu().numpy())
        
    final_loss /= len(dataloader)
    valid_cites_preds = np.concatenate(valid_cites_preds)
    
    return final_loss, valid_cites_preds
def test_func(model, dataloader, device):
    
    model.eval()
    preds = []
    
    
    for data in dataloader:
        
        title = data['x_1'].to(device)
        abstract = data['x_2'].to(device)
        doi = data['doi'].to(device)
        
        with torch.no_grad():
            outputs = model(title, abstract, doi)
        
        preds.append(outputs.detach().cpu().numpy())
                
    preds = np.concatenate(preds)
    
    return preds

Model

class TwoHeadModel(nn.Module):
    
    def __init__(self, num_features_1, num_features_2, num_doi, num_cites, hidden_size):
        
        
        super(TwoHeadModel, self).__init__()
        
        self.batch_norm_1 = nn.BatchNorm1d(num_features_1)
        self.dense_1 = nn.utils.weight_norm(nn.Linear(num_features_1, 
                                                      hidden_size))
        self.activation_1 = torch.nn.PReLU(num_parameters=hidden_size, 
                                           init=1.0)
        
        self.batch_norm_2 = nn.BatchNorm1d(num_features_2)
        self.dense_2 = nn.utils.weight_norm(nn.Linear(num_features_2, 
                                                      hidden_size))
        self.activation_2 = torch.nn.PReLU(num_parameters=hidden_size, init=1.0)
        
        self.batch_norm_3 = nn.BatchNorm1d(hidden_size*2 + num_doi)
        self.dropout_3 = nn.Dropout(0.3)
        self.dense_3 = nn.utils.weight_norm(nn.Linear(hidden_size*2 + num_doi,
                                                      num_cites))
        
        
    def forward(self, x1, x2, x3):
        
        x1 = self.batch_norm_1(x1)
        x1 = self.activation_1(self.dense_1(x1))
        
        x2 = self.batch_norm_2(x2)
        x2 = self.activation_2(self.dense_2(x2))
        
        x = torch.cat([x1, x2, x3], dim=1)
        
        x = self.batch_norm_3(x)
        x = self.dropout_3(x)
        x = self.dense_3(x)
    
        return x
## Train with RMSLE: RMSE on the log1p-transformed target equals RMSLE on the raw counts
def RMSELoss(yhat,y):
    return torch.sqrt(torch.mean((yhat-y)**2))
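
A quick check with made-up numbers (a sketch, reusing the RMSELoss defined above and the numpy/torch imports from the setup cell): RMSE computed in log1p space matches RMSLE computed on the raw citation counts.

raw_true = np.array([3.0, 10.0, 120.0])
raw_pred = np.array([5.0, 8.0, 100.0])

# RMSLE computed directly on the raw counts
rmsle = np.sqrt(np.mean((np.log1p(raw_pred) - np.log1p(raw_true)) ** 2))

# RMSELoss applied to log1p-transformed values, as is done during training
loss = RMSELoss(torch.tensor(np.log1p(raw_pred)), torch.tensor(np.log1p(raw_true)))

print(rmsle, loss.item())  # identical up to floating-point precision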

Training

## Cross validation: StratifiedKFold on the integer part of the log1p-transformed cites target

DOI_INT = pd.Series(train_df['cites'].astype('float')).astype(int)
plt.hist(DOI_INT, bins=DOI_INT.nunique())
plt.show()
DEVICE = ('cuda' if torch.cuda.is_available() else 'cpu')

EPOCHS = 20
BATCH_SIZE = 64
LEARNING_RATE = 1e-3
WEIGHT_DECAY = 1e-5
NFOLDS = 7


num_features_1 = train_title_pca.shape[1]
num_features_2 = train_abs_pca.shape[1]
num_doi = train_doi.shape[1]
num_cites = target.shape[1]

print(f'num_features_1: {num_features_1} | num_features_2: {num_features_2}')
print(f'num_doi: {num_doi} | num_cites: {num_cites}')

hidden_size = 128

CV = StratifiedKFold(n_splits=CONFIG.NFOLDS, 
                     shuffle=True,
                     random_state=CONFIG.SEED)
num_features_1: 377 | num_features_2: 539
num_doi: 1 | num_cites: 1

oof = np.zeros((train_df.shape[0]))
preds = np.zeros((test_df.shape[0]))

for fold, (tr, te) in enumerate(CV.split(train_df, DOI_INT)):
    
    print('★'*40)
    print(f'Fold: {fold+1}')
    
    X_title_tr = train_title_pca[tr]
    X_title_te = train_title_pca[te]
    
    X_abstract_tr = train_abs_pca[tr]
    X_abstract_te = train_abs_pca[te]
    
    X_doi_tr = train_doi[tr]
    X_doi_te = train_doi[te]
    
    y_tr = target[tr]
    y_te = target[te]

    
    train_dataset = TrainDataset(X_title_tr, X_abstract_tr, X_doi_tr, y_tr)
    valid_dataset = TrainDataset(X_title_te, X_abstract_te, X_doi_te, y_te)
    
    trainloader = torch.utils.data.DataLoader(train_dataset, 
                                              batch_size=BATCH_SIZE, 
                                              shuffle=True)
    
    validloader = torch.utils.data.DataLoader(valid_dataset, 
                                              batch_size=BATCH_SIZE, 
                                              shuffle=False)
    
    model = TwoHeadModel(
        num_features_1 = num_features_1,
        num_features_2 = num_features_2,
        num_doi = num_doi,
        num_cites = num_cites,
        hidden_size = hidden_size,
    )
    
    model.to(DEVICE)
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=LEARNING_RATE,
                                 weight_decay=WEIGHT_DECAY)
    
    scheduler = optim.lr_scheduler.OneCycleLR(optimizer=optimizer,
                                              pct_start=0.1, 
                                              div_factor=1e3, 
                                              max_lr=1e-2, 
                                              epochs=EPOCHS, 
                                              steps_per_epoch=len(trainloader))
    
    loss_fn = RMSELoss
    best_loss = np.inf
    
    
    for epoch in range(EPOCHS):
        
        start_time = time.time()
        train_loss = train_func(model,
                                optimizer,
                                scheduler, 
                                loss_fn, 
                                trainloader,
                                DEVICE, 
                                epoch)
        
        
        valid_loss, valid_cites_preds = valid_func(model,
                                                   loss_fn,
                                                   validloader,
                                                   DEVICE)
        
            
        end_time = time.time()
        print(f"FOLD: {fold+1} | EPOCH:{epoch+1:02d} | train_loss:{train_loss:.6f} | valid_loss:{valid_loss:.6f} | time:{end_time-start_time:.1f}s ")

        if valid_loss < best_loss:
            best_loss = valid_loss
            oof[te] = valid_cites_preds[:, 0]
            torch.save(model.state_dict(),
                       f"{MODEL_DIR}SimpleMLP_{fold+1}.pth")
            
        else:
            continue
            
    
    testdataset = TestDataset(test_title_pca, test_abs_pca, test_doi)
    testloader = torch.utils.data.DataLoader(testdataset,
                                             batch_size=BATCH_SIZE,
                                             shuffle=False)
    
    model = TwoHeadModel(
        num_features_1 = num_features_1,
        num_features_2 = num_features_2,
        num_doi = num_doi,
        num_cites = num_cites,
        hidden_size = hidden_size,
    )
    
    model.load_state_dict(torch.load(f"{MODEL_DIR}SimpleMLP_{fold+1}.pth"))
    model.to(DEVICE)
    
    preds += test_func(model, testloader, DEVICE)[:, 0]/NFOLDS
★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★
Fold: 1
FOLD: 1 | EPOCH:01 | train_loss:2.346136 | valid_loss:1.039456 | time:3.7s 
FOLD: 1 | EPOCH:02 | train_loss:0.917533 | valid_loss:0.683612 | time:3.6s 
FOLD: 1 | EPOCH:03 | train_loss:0.863786 | valid_loss:0.680757 | time:3.7s 
FOLD: 1 | EPOCH:04 | train_loss:0.835902 | valid_loss:0.662268 | time:3.3s 
FOLD: 1 | EPOCH:05 | train_loss:0.830839 | valid_loss:0.646799 | time:3.6s 
FOLD: 1 | EPOCH:06 | train_loss:0.819551 | valid_loss:0.679971 | time:3.7s 
FOLD: 1 | EPOCH:07 | train_loss:0.824818 | valid_loss:0.669431 | time:3.4s 
FOLD: 1 | EPOCH:08 | train_loss:0.823782 | valid_loss:0.695271 | time:3.6s 
FOLD: 1 | EPOCH:09 | train_loss:0.826026 | valid_loss:0.667164 | time:3.8s 
FOLD: 1 | EPOCH:10 | train_loss:0.810591 | valid_loss:0.679541 | time:3.5s 
FOLD: 1 | EPOCH:11 | train_loss:0.819718 | valid_loss:0.681857 | time:2.8s 
FOLD: 1 | EPOCH:12 | train_loss:0.800127 | valid_loss:0.672954 | time:2.3s 
FOLD: 1 | EPOCH:13 | train_loss:0.787671 | valid_loss:0.687458 | time:2.5s 
FOLD: 1 | EPOCH:14 | train_loss:0.780256 | valid_loss:0.701777 | time:4.1s 
FOLD: 1 | EPOCH:15 | train_loss:0.752611 | valid_loss:0.711099 | time:3.8s 
FOLD: 1 | EPOCH:16 | train_loss:0.725169 | valid_loss:0.733798 | time:3.6s 
FOLD: 1 | EPOCH:17 | train_loss:0.701010 | valid_loss:0.738370 | time:3.6s 
FOLD: 1 | EPOCH:18 | train_loss:0.690245 | valid_loss:0.758462 | time:3.4s 
FOLD: 1 | EPOCH:19 | train_loss:0.676085 | valid_loss:0.758223 | time:3.8s 
FOLD: 1 | EPOCH:20 | train_loss:0.682744 | valid_loss:0.757722 | time:3.4s 
★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★
Fold: 2
FOLD: 2 | EPOCH:01 | train_loss:2.268256 | valid_loss:0.951040 | time:3.5s 
FOLD: 2 | EPOCH:02 | train_loss:0.923570 | valid_loss:0.699678 | time:3.4s 
FOLD: 2 | EPOCH:03 | train_loss:0.849738 | valid_loss:0.681367 | time:3.6s 
FOLD: 2 | EPOCH:04 | train_loss:0.840315 | valid_loss:0.675227 | time:3.1s 
FOLD: 2 | EPOCH:05 | train_loss:0.832623 | valid_loss:0.672421 | time:2.3s 
FOLD: 2 | EPOCH:06 | train_loss:0.822668 | valid_loss:0.678278 | time:2.5s 
FOLD: 2 | EPOCH:07 | train_loss:0.822654 | valid_loss:0.685254 | time:4.0s 
FOLD: 2 | EPOCH:08 | train_loss:0.812432 | valid_loss:0.658895 | time:3.5s 
FOLD: 2 | EPOCH:09 | train_loss:0.807763 | valid_loss:0.682751 | time:3.3s 
FOLD: 2 | EPOCH:10 | train_loss:0.804930 | valid_loss:0.687398 | time:3.3s 
FOLD: 2 | EPOCH:11 | train_loss:0.800834 | valid_loss:0.687234 | time:3.4s 
FOLD: 2 | EPOCH:12 | train_loss:0.786160 | valid_loss:0.708299 | time:3.2s 
FOLD: 2 | EPOCH:13 | train_loss:0.767990 | valid_loss:0.704966 | time:3.7s 
FOLD: 2 | EPOCH:14 | train_loss:0.757462 | valid_loss:0.721444 | time:3.8s 
FOLD: 2 | EPOCH:15 | train_loss:0.738536 | valid_loss:0.744819 | time:3.2s 
FOLD: 2 | EPOCH:16 | train_loss:0.719331 | valid_loss:0.748718 | time:3.5s 
FOLD: 2 | EPOCH:17 | train_loss:0.691645 | valid_loss:0.771285 | time:3.4s 
FOLD: 2 | EPOCH:18 | train_loss:0.684977 | valid_loss:0.778157 | time:2.8s 
FOLD: 2 | EPOCH:19 | train_loss:0.680941 | valid_loss:0.774300 | time:2.3s 
FOLD: 2 | EPOCH:20 | train_loss:0.665683 | valid_loss:0.774228 | time:2.7s 
★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★
Fold: 3
FOLD: 3 | EPOCH:01 | train_loss:2.292497 | valid_loss:0.962321 | time:4.3s 
FOLD: 3 | EPOCH:02 | train_loss:0.924057 | valid_loss:0.701687 | time:3.9s 
FOLD: 3 | EPOCH:03 | train_loss:0.854219 | valid_loss:0.699506 | time:3.9s 
FOLD: 3 | EPOCH:04 | train_loss:0.837900 | valid_loss:0.684892 | time:3.5s 
FOLD: 3 | EPOCH:05 | train_loss:0.835563 | valid_loss:0.665736 | time:3.7s 
FOLD: 3 | EPOCH:06 | train_loss:0.827823 | valid_loss:0.665580 | time:3.5s 
FOLD: 3 | EPOCH:07 | train_loss:0.810266 | valid_loss:0.675681 | time:3.9s 
FOLD: 3 | EPOCH:08 | train_loss:0.813502 | valid_loss:0.681831 | time:4.0s 
FOLD: 3 | EPOCH:09 | train_loss:0.811910 | valid_loss:0.671052 | time:3.9s 
FOLD: 3 | EPOCH:10 | train_loss:0.804153 | valid_loss:0.670744 | time:3.6s 
FOLD: 3 | EPOCH:11 | train_loss:0.813063 | valid_loss:0.687331 | time:3.4s 
FOLD: 3 | EPOCH:12 | train_loss:0.797916 | valid_loss:0.707993 | time:4.3s 
FOLD: 3 | EPOCH:13 | train_loss:0.794149 | valid_loss:0.716200 | time:4.6s 
FOLD: 3 | EPOCH:14 | train_loss:0.763399 | valid_loss:0.712297 | time:4.5s 
FOLD: 3 | EPOCH:15 | train_loss:0.750802 | valid_loss:0.730693 | time:4.1s 
FOLD: 3 | EPOCH:16 | train_loss:0.717734 | valid_loss:0.743486 | time:3.6s 
FOLD: 3 | EPOCH:17 | train_loss:0.695795 | valid_loss:0.761950 | time:3.6s 
FOLD: 3 | EPOCH:18 | train_loss:0.681331 | valid_loss:0.768960 | time:3.5s 
FOLD: 3 | EPOCH:19 | train_loss:0.670938 | valid_loss:0.774660 | time:3.5s 
FOLD: 3 | EPOCH:20 | train_loss:0.674210 | valid_loss:0.769835 | time:3.6s 
★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★
Fold: 4
FOLD: 4 | EPOCH:01 | train_loss:2.253462 | valid_loss:0.955478 | time:3.6s 
FOLD: 4 | EPOCH:02 | train_loss:0.922848 | valid_loss:0.720584 | time:3.5s 
FOLD: 4 | EPOCH:03 | train_loss:0.850577 | valid_loss:0.666364 | time:3.6s 
FOLD: 4 | EPOCH:04 | train_loss:0.829753 | valid_loss:0.694528 | time:3.6s 
FOLD: 4 | EPOCH:05 | train_loss:0.829714 | valid_loss:0.688441 | time:3.6s 
FOLD: 4 | EPOCH:06 | train_loss:0.811148 | valid_loss:0.667915 | time:3.6s 
FOLD: 4 | EPOCH:07 | train_loss:0.816485 | valid_loss:0.677920 | time:3.5s 
FOLD: 4 | EPOCH:08 | train_loss:0.808134 | valid_loss:0.680463 | time:3.5s 
FOLD: 4 | EPOCH:09 | train_loss:0.811411 | valid_loss:0.679146 | time:3.6s 
FOLD: 4 | EPOCH:10 | train_loss:0.818291 | valid_loss:0.691657 | time:3.6s 
FOLD: 4 | EPOCH:11 | train_loss:0.807316 | valid_loss:0.692099 | time:3.7s 
FOLD: 4 | EPOCH:12 | train_loss:0.798671 | valid_loss:0.691802 | time:3.6s 
FOLD: 4 | EPOCH:13 | train_loss:0.778866 | valid_loss:0.697141 | time:3.6s 
FOLD: 4 | EPOCH:14 | train_loss:0.765021 | valid_loss:0.715104 | time:3.5s 
FOLD: 4 | EPOCH:15 | train_loss:0.746003 | valid_loss:0.736405 | time:3.7s 
FOLD: 4 | EPOCH:16 | train_loss:0.724310 | valid_loss:0.744852 | time:3.6s 
FOLD: 4 | EPOCH:17 | train_loss:0.695327 | valid_loss:0.749416 | time:3.5s 
FOLD: 4 | EPOCH:18 | train_loss:0.695899 | valid_loss:0.771669 | time:3.5s 
FOLD: 4 | EPOCH:19 | train_loss:0.685250 | valid_loss:0.772929 | time:3.5s 
FOLD: 4 | EPOCH:20 | train_loss:0.670131 | valid_loss:0.772308 | time:3.5s 
★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★
Fold: 5
FOLD: 5 | EPOCH:01 | train_loss:2.285175 | valid_loss:0.945952 | time:3.5s 
FOLD: 5 | EPOCH:02 | train_loss:0.927173 | valid_loss:0.718184 | time:3.5s 
FOLD: 5 | EPOCH:03 | train_loss:0.847746 | valid_loss:0.673975 | time:3.5s 
FOLD: 5 | EPOCH:04 | train_loss:0.837297 | valid_loss:0.669784 | time:3.7s 
FOLD: 5 | EPOCH:05 | train_loss:0.821963 | valid_loss:0.665123 | time:3.6s 
FOLD: 5 | EPOCH:06 | train_loss:0.819732 | valid_loss:0.669295 | time:3.4s 
FOLD: 5 | EPOCH:07 | train_loss:0.811684 | valid_loss:0.677448 | time:3.5s 
FOLD: 5 | EPOCH:08 | train_loss:0.820014 | valid_loss:0.672548 | time:3.5s 
FOLD: 5 | EPOCH:09 | train_loss:0.810965 | valid_loss:0.678159 | time:3.5s 
FOLD: 5 | EPOCH:10 | train_loss:0.799571 | valid_loss:0.690159 | time:3.5s 
FOLD: 5 | EPOCH:11 | train_loss:0.807466 | valid_loss:0.679499 | time:3.8s 
FOLD: 5 | EPOCH:12 | train_loss:0.787137 | valid_loss:0.693354 | time:3.7s 
FOLD: 5 | EPOCH:13 | train_loss:0.784178 | valid_loss:0.713389 | time:4.0s 
FOLD: 5 | EPOCH:14 | train_loss:0.766159 | valid_loss:0.709375 | time:2.4s 
FOLD: 5 | EPOCH:15 | train_loss:0.747560 | valid_loss:0.723371 | time:3.0s 
FOLD: 5 | EPOCH:16 | train_loss:0.721054 | valid_loss:0.735085 | time:2.9s 
FOLD: 5 | EPOCH:17 | train_loss:0.709306 | valid_loss:0.748228 | time:2.7s 
FOLD: 5 | EPOCH:18 | train_loss:0.700590 | valid_loss:0.756795 | time:2.6s 
FOLD: 5 | EPOCH:19 | train_loss:0.691556 | valid_loss:0.750856 | time:2.3s 
FOLD: 5 | EPOCH:20 | train_loss:0.694626 | valid_loss:0.746628 | time:2.5s 
★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★
Fold: 6
FOLD: 6 | EPOCH:01 | train_loss:2.284253 | valid_loss:0.937912 | time:2.3s 
FOLD: 6 | EPOCH:02 | train_loss:0.919130 | valid_loss:0.699395 | time:2.3s 
FOLD: 6 | EPOCH:03 | train_loss:0.848985 | valid_loss:0.688490 | time:2.3s 
FOLD: 6 | EPOCH:04 | train_loss:0.838046 | valid_loss:0.695203 | time:2.2s 
FOLD: 6 | EPOCH:05 | train_loss:0.828637 | valid_loss:0.691719 | time:2.4s 
FOLD: 6 | EPOCH:06 | train_loss:0.823461 | valid_loss:0.684958 | time:2.4s 
FOLD: 6 | EPOCH:07 | train_loss:0.807477 | valid_loss:0.686002 | time:2.4s 
FOLD: 6 | EPOCH:08 | train_loss:0.814183 | valid_loss:0.696132 | time:2.5s 
FOLD: 6 | EPOCH:09 | train_loss:0.801512 | valid_loss:0.703734 | time:2.4s 
FOLD: 6 | EPOCH:10 | train_loss:0.813860 | valid_loss:0.703319 | time:2.3s 
FOLD: 6 | EPOCH:11 | train_loss:0.798274 | valid_loss:0.686019 | time:2.4s 
FOLD: 6 | EPOCH:12 | train_loss:0.798365 | valid_loss:0.698568 | time:2.3s 
FOLD: 6 | EPOCH:13 | train_loss:0.777323 | valid_loss:0.708754 | time:2.2s 
FOLD: 6 | EPOCH:14 | train_loss:0.765338 | valid_loss:0.711898 | time:2.2s 
FOLD: 6 | EPOCH:15 | train_loss:0.747511 | valid_loss:0.738144 | time:2.2s 
FOLD: 6 | EPOCH:16 | train_loss:0.726277 | valid_loss:0.749122 | time:2.2s 
FOLD: 6 | EPOCH:17 | train_loss:0.704968 | valid_loss:0.751241 | time:2.2s 
FOLD: 6 | EPOCH:18 | train_loss:0.690608 | valid_loss:0.755787 | time:2.2s 
FOLD: 6 | EPOCH:19 | train_loss:0.670668 | valid_loss:0.765387 | time:2.2s 
FOLD: 6 | EPOCH:20 | train_loss:0.673743 | valid_loss:0.786040 | time:2.2s 
★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★★
Fold: 7
FOLD: 7 | EPOCH:01 | train_loss:2.296701 | valid_loss:0.924307 | time:2.2s 
FOLD: 7 | EPOCH:02 | train_loss:0.933435 | valid_loss:0.689047 | time:2.2s 
FOLD: 7 | EPOCH:03 | train_loss:0.850991 | valid_loss:0.659872 | time:2.2s 
FOLD: 7 | EPOCH:04 | train_loss:0.841571 | valid_loss:0.671413 | time:2.2s 
FOLD: 7 | EPOCH:05 | train_loss:0.818740 | valid_loss:0.685985 | time:2.2s 
FOLD: 7 | EPOCH:06 | train_loss:0.826990 | valid_loss:0.676457 | time:2.2s 
FOLD: 7 | EPOCH:07 | train_loss:0.809806 | valid_loss:0.684370 | time:2.1s 
FOLD: 7 | EPOCH:08 | train_loss:0.818394 | valid_loss:0.666673 | time:2.2s 
FOLD: 7 | EPOCH:09 | train_loss:0.811473 | valid_loss:0.664205 | time:2.2s 
FOLD: 7 | EPOCH:10 | train_loss:0.818371 | valid_loss:0.674103 | time:2.2s 
FOLD: 7 | EPOCH:11 | train_loss:0.807668 | valid_loss:0.651558 | time:2.2s 
FOLD: 7 | EPOCH:12 | train_loss:0.799247 | valid_loss:0.674729 | time:2.2s 
FOLD: 7 | EPOCH:13 | train_loss:0.785723 | valid_loss:0.677723 | time:2.1s 
FOLD: 7 | EPOCH:14 | train_loss:0.765225 | valid_loss:0.722652 | time:2.2s 
FOLD: 7 | EPOCH:15 | train_loss:0.748908 | valid_loss:0.725328 | time:2.1s 
FOLD: 7 | EPOCH:16 | train_loss:0.724189 | valid_loss:0.737869 | time:2.1s 
FOLD: 7 | EPOCH:17 | train_loss:0.702457 | valid_loss:0.759617 | time:2.2s 
FOLD: 7 | EPOCH:18 | train_loss:0.690418 | valid_loss:0.767010 | time:2.2s 
FOLD: 7 | EPOCH:19 | train_loss:0.680520 | valid_loss:0.763514 | time:2.2s 
FOLD: 7 | EPOCH:20 | train_loss:0.668650 | valid_loss:0.771165 | time:2.2s 
rmse = np.sqrt(mean_squared_error(target, oof))
print(f'CV : {rmse:.6f} ')
CV : 0.668040 
fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(target[:, 0], label='Ground truth', kde=True, color='orange',
             stat="density", common_norm=False, alpha=0.3)
sns.histplot(oof, label='OOF', kde=True,
             stat="density", common_norm=False, alpha=0.3)
ax.legend()
ax.grid()
plt.show()
  • The predictions cluster around the mean
### Check the statistics of the predicted values
pd.DataFrame(np.expm1(preds)).describe().T
   count      mean       std        min       25%      50%        75%        max
0  59084.0    16.914278  17.470612  0.941772  6.85601  12.137255  21.098681  690.938788

Save the OOF and submission files

def make_oof_sub(oof:np.array, pred:np.array, name:str):
    
    oof_res = np.expm1(oof)
    oof_res = pd.DataFrame(oof_res, columns=['OOF'])
    oof_res.to_csv(f'{OUTPUT_DIR}{name}_oof.csv', index=False)
    print(f'{OUTPUT_DIR}{name}_oof.csv is saved.')
    
    preds_res = np.expm1(pred)
    test_df = pd.read_json(os.path.join(INPUT_DIR, 'test_data.json'), lines=True)
    preds_res = pd.concat([pd.DataFrame({ 'id': test_df['id'] }), 
                           pd.DataFrame({ 'cites': preds_res })], 
                          axis=1)
    preds_res.to_csv(f'{OUTPUT_DIR}{name}_sub.csv', index=False)
    print(f'{OUTPUT_DIR}{name}_sub.csv is saved.')
with timer('make oof sub'):
    make_oof_sub(oof, preds, 'Simple_NN')
<< make oof sub >> Start
../data/interim/2021_03_04/Simple_NN_oof.csv is saved.
../data/interim/2021_03_04/Simple_NN_sub.csv is saved.
<< make oof sub >> 5.4GB(+0.0GB):3.7sec

Attachments

  • NN_Baseline.ipynb