Model (LightGBM Base)

準備

以下のように構成します

MyDrive
├<pollen_counts>
   ├<notebook>
   │ └run.ipynb
   ├<input>
   │ ├train.csv
   │ ├submission.csv
   │ └test.csv
   └<output>
# Library
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from tqdm.auto import tqdm

import warnings
warnings.simplefilter('ignore')

# Mount Google Drive (Colab only) so the input/output folders can be read and written.
from google.colab import drive
# Skip the mount when /content/drive already exists, e.g. when this cell is re-run.
if not os.path.isdir('/content/drive'):
    drive.mount('/content/drive')
Mounted at /content/drive
# Config
# Root folder of the project on the mounted Drive.
# NOTE(review): the directory sketch in the preparation notes shows pollen_counts
# directly under MyDrive, while this path nests it under ML/PROBSPACE - adjust
# to your own Drive layout.
DRIVE_PATH = "/content/drive/MyDrive/ML/PROBSPACE/pollen_counts"
INPUT = os.path.join(DRIVE_PATH, "input")
OUTPUT = os.path.join(DRIVE_PATH, "output")

# Input files, all read from the input folder.
TRAIN_FILE = os.path.join(INPUT, "train.csv")
TEST_FILE = os.path.join(INPUT, "test.csv")
SUB_FILE = os.path.join(INPUT, "submission.csv")

# Experiment label and the random seed shared by the imputer and the models.
exp_name = 'exp000'
seed = 42

# plot style
# Show up to 200 columns when a DataFrame is displayed.
pd.set_option('display.max_columns', 200)
# Axes background colour given as a hex string.
plt.rcParams['axes.facecolor'] = 'EEFFFE'
# Data
# train/test hold the features; sub is the submission template filled in later.
train = pd.read_csv(TRAIN_FILE)
test = pd.read_csv(TEST_FILE)
sub = pd.read_csv(SUB_FILE)
# object(欠測) -> float
import lightgbm as lgb
from lightgbm import LGBMRegressor
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# Turn the '欠測' (missing measurement) marker into NaN so that the affected
# object-typed columns can be imputed numerically.
train_df = train.replace('欠測', np.nan)

# Columns that get post-processed after imputation.
direction_cols = ['winddirection_chiba', 'winddirection_tokyo']
one_decimal_cols = ['precipitation_tokyo', 'temperature_chiba', 'temperature_tokyo',
                    'windspeed_chiba', 'windspeed_tokyo']

# Model-based imputation: each column with gaps is regressed on the others.
# `n_estimators` is the documented scikit-learn-API name for the number of
# boosting rounds (the original passed the `num_boost_round` alias).
lgb_imp = IterativeImputer(
    estimator=LGBMRegressor(n_estimators=1000, random_state=seed),
    max_iter=10,
    initial_strategy='mean',
    imputation_order='ascending',
    verbose=1,
    random_state=seed,
)

# NOTE(review): in the recorded run the imputer stops after its first round
# ("scaled tolerance: 2020033.124"), presumably because the tolerance is scaled
# by the largest absolute value in the matrix and the raw `datetime` integer
# dominates it - confirm whether a single round is intended.
train_df = pd.DataFrame(lgb_imp.fit_transform(train_df), columns=train_df.columns)

# Round imputed wind directions to integer codes and the remaining imputed
# measurements to one decimal place.
train_df[direction_cols] = train_df[direction_cols].round().astype(int)
train_df[one_decimal_cols] = train_df[one_decimal_cols].round(1)

# Write the imputed values back only into the columns that were object-typed
# (i.e. contained the missing marker) in the raw training data.
imputed_cols = train.select_dtypes(object).columns
train[imputed_cols] = train_df[imputed_cols]
train.head(3)
[IterativeImputer] Completing matrix with shape (12168, 16)
[IterativeImputer] Change: 8.828120105598833, scaled tolerance: 2020033.124 
[IterativeImputer] Early stopping criterion reached.
datetime precipitation_utsunomiya precipitation_chiba precipitation_tokyo temperature_utsunomiya temperature_chiba temperature_tokyo winddirection_utsunomiya winddirection_chiba winddirection_tokyo windspeed_utsunomiya windspeed_chiba windspeed_tokyo pollen_utsunomiya pollen_chiba pollen_tokyo
0 2017020101 0.0 0.0 0.0 -1.0 4.1 2.9 16 1 2 2.7 2.5 1.3 0.0 8.0 0.0
1 2017020102 0.0 0.0 0.0 -1.1 4.2 2.6 1 1 1 3.3 1.5 0.9 0.0 24.0 4.0
2 2017020103 0.0 0.0 0.0 -0.7 4.2 2.4 1 15 16 4.0 1.7 0.6 4.0 32.0 12.0

特徴量

# Basic time features
def add_time_feat(df):
    """Add calendar features derived from the integer ``datetime`` column.

    ``datetime`` is read as ``YYYYMMDDHH``: the first eight digits give the
    date and the last two the hour. The frame is modified in place and also
    returned.

    Added columns: ``time`` (date as Timestamp), ``year``, ``month``, ``day``,
    ``hour``, ``weekday``, ``day_of_year``, ``day_sin`` and ``day_cos``.
    """
    stamp = df['datetime'].astype(str)
    df['time'] = pd.to_datetime(stamp.str[:-2], format='%Y%m%d')
    when = df['time'].dt
    df['year'] = when.year
    df['month'] = when.month
    df['day'] = when.day
    df['hour'] = stamp.str[-2:].astype(int)
    df['weekday'] = when.weekday

    # Put leap years on the same 365-day scale as other years: days after
    # Feb 29 (day-of-year > 60) are shifted back by one. This is applied per
    # leap year; the previous fixed "after 2020-02-29" rule also shifted every
    # date in the following non-leap years.
    day_of_year = when.dayofyear
    past_leap_day = when.is_leap_year & (day_of_year > 60)
    df['day_of_year'] = day_of_year - past_leap_day.astype(day_of_year.dtype)

    # Cyclic encoding of the position within the year.
    angle = df['day_of_year'] * (2 * np.pi / 365)
    df['day_sin'] = np.sin(angle)
    df['day_cos'] = np.cos(angle)
    return df

# Lag features
def add_lag_feat(df, feat:list, group:str):
    """Append per-group lag, difference and rolling-mean features.

    Args:
        df: input frame; its index must be unique so rolling results can be
            aligned back to the rows they belong to.
        feat: names of the columns to derive features from.
        group: column whose values define independent series (lags never
            cross a group boundary).

    Returns:
        A new frame with the original columns followed by
        ``shift{1..5}_<col>``, ``diff{1..5}_<col>`` and
        ``rolling{3,24}_mean_<col>`` columns, in the row order of ``df``.
    """
    outputs = [df]
    grouped = df.groupby(group)[feat]
    for lag in (1, 2, 3, 4, 5):
        # Value `lag` rows earlier within the group, and the change since then.
        outputs.append(grouped.shift(lag).add_prefix(f'shift{lag}_'))
        outputs.append(grouped.diff(lag).add_prefix(f'diff{lag}_'))
    for window in (3, 24):
        rolled = grouped.rolling(window, min_periods=1).mean()
        # The rolling result is indexed by (group key, original index) and is
        # ordered group by group. Drop the group level and realign on the
        # original index instead of renumbering positionally, which would
        # attach values to the wrong rows when groups are interleaved.
        rolled = rolled.droplevel(0).reindex(df.index)
        outputs.append(rolled.add_prefix(f'rolling{window}_mean_'))
    return pd.concat(outputs, axis=1)

# Aggregate features
def additional_encoding(train, test, cat_col:list, num_col:list): 
    """Add count-encoding and group-aggregate features to train and test.

    All statistics are computed from ``train`` only and then mapped onto both
    frames; the inputs themselves are not modified.

    Args:
        train: training frame; must contain ``month`` and ``day`` columns.
        test: test frame with the same categorical columns.
        cat_col: categorical columns to encode / group by.
        num_col: numeric columns to aggregate within each category.

    Returns:
        ``(train_with_features, test_with_features)``.
    """
    trdf = train.copy()
    tedf = test.copy()

    # Count encoding: frequency of each category among the training rows that
    # fall in April 1-14. Categories not seen in that window map to NaN.
    in_early_april = (trdf['month'] == 4) & (trdf['day'] < 15)
    for ccol in cat_col:
        encoder = trdf.loc[in_early_april, ccol].value_counts()
        trdf[f'ce_{ccol}'] = trdf[ccol].map(encoder)
        tedf[f'ce_{ccol}'] = tedf[ccol].map(encoder)

    # Per-category statistics of every numeric column.
    agg_cols = ['mean', 'std', 'min', 'max']
    for ccol in cat_col:
        for ncol in num_col:
            agg_df = trdf.groupby(ccol)[ncol].agg(agg_cols)
            agg_df['abs_mean'] = np.abs(agg_df['mean'])
            agg_df['min_max'] = agg_df['min']*agg_df['max']
            # Include the numeric column in the name. Without it every numeric
            # column produced the same labels for a given category, so the
            # merges collided (duplicate `_x`/`_y` columns, or a MergeError on
            # recent pandas).
            agg_df.columns = [f'{ccol}_{ncol}_{c}' for c in agg_df.columns]
            trdf = trdf.merge(agg_df, on=ccol, how='left')
            tedf = tedf.merge(agg_df, on=ccol, how='left')

    return trdf, tedf
def run_add_feat(train, test):
    """Build the full feature set for train and test.

    Train and test are stacked so that time and lag features are computed over
    one continuous frame, then split back by position. Training rows that
    contain any NaN (for example the first rows of each year, which have no
    lagged values) are dropped; test rows are kept as they are.

    Returns:
        ``(train_df, test_df)`` with time, wind, lag and aggregate features.
    """
    # Stack train and test and process them together.
    n_train = len(train)
    df = pd.concat([train, test]).reset_index(drop=True)

    # Basic time features.
    df = add_time_feat(df)

    # Hand-made wind features: sine of the direction code times the speed,
    # one per site. The divisor 17 is kept from the original notebook.
    windd_col = ['winddirection_utsunomiya', 'winddirection_chiba', 'winddirection_tokyo']
    winds_col = ['windspeed_utsunomiya', 'windspeed_chiba', 'windspeed_tokyo']
    for d, s in zip(windd_col, winds_col):
        df[f'{d}_{s}'] = np.sin(df[d] * (2 * np.pi / 17)) * df[s]

    # Lag features, computed separately within each year.
    feat = [
        'precipitation_utsunomiya', 'precipitation_chiba', 'precipitation_tokyo',
        'temperature_utsunomiya', 'temperature_chiba', 'temperature_tokyo',
        'windspeed_utsunomiya', 'windspeed_chiba', 'windspeed_tokyo',
        'winddirection_utsunomiya', 'winddirection_chiba', 'winddirection_tokyo',
    ]
    df = add_lag_feat(df, feat, 'year')

    # Split back into train/test by position and drop incomplete training rows.
    train_df = df.iloc[:n_train].dropna().reset_index(drop=True)
    test_df = df.iloc[n_train:]

    # Aggregate features (statistics are taken from the training part only).
    cat_columns = ['year', 'month', 'day', 'hour',
                   'winddirection_utsunomiya', 'winddirection_chiba', 'winddirection_tokyo']
    num_columns = ['precipitation_utsunomiya', 'precipitation_chiba', 'precipitation_tokyo',
                   'temperature_utsunomiya', 'temperature_chiba', 'temperature_tokyo',
                   'windspeed_utsunomiya', 'windspeed_chiba', 'windspeed_tokyo']
    train_df, test_df = additional_encoding(train_df, test_df, cat_columns, num_columns)

    return train_df, test_df

# Build the feature frames, then show the shape and first three rows of each.
train_df, test_df = run_add_feat(train, test)
for frame in (train_df, test_df):
    print(frame.shape)
    display(frame.head(3))
(12148, 557)
datetime precipitation_utsunomiya precipitation_chiba precipitation_tokyo temperature_utsunomiya temperature_chiba temperature_tokyo winddirection_utsunomiya winddirection_chiba winddirection_tokyo windspeed_utsunomiya windspeed_chiba windspeed_tokyo pollen_utsunomiya pollen_chiba pollen_tokyo time year month day hour weekday day_of_year day_sin day_cos winddirection_utsunomiya_windspeed_utsunomiya winddirection_chiba_windspeed_chiba winddirection_tokyo_windspeed_tokyo shift1_precipitation_utsunomiya shift1_precipitation_chiba shift1_precipitation_tokyo shift1_temperature_utsunomiya shift1_temperature_chiba shift1_temperature_tokyo shift1_windspeed_utsunomiya shift1_windspeed_chiba shift1_windspeed_tokyo shift1_winddirection_utsunomiya shift1_winddirection_chiba shift1_winddirection_tokyo diff1_precipitation_utsunomiya diff1_precipitation_chiba diff1_precipitation_tokyo diff1_temperature_utsunomiya diff1_temperature_chiba diff1_temperature_tokyo diff1_windspeed_utsunomiya diff1_windspeed_chiba diff1_windspeed_tokyo diff1_winddirection_utsunomiya diff1_winddirection_chiba diff1_winddirection_tokyo shift2_precipitation_utsunomiya shift2_precipitation_chiba shift2_precipitation_tokyo shift2_temperature_utsunomiya shift2_temperature_chiba shift2_temperature_tokyo shift2_windspeed_utsunomiya shift2_windspeed_chiba shift2_windspeed_tokyo shift2_winddirection_utsunomiya shift2_winddirection_chiba shift2_winddirection_tokyo diff2_precipitation_utsunomiya diff2_precipitation_chiba diff2_precipitation_tokyo diff2_temperature_utsunomiya diff2_temperature_chiba diff2_temperature_tokyo diff2_windspeed_utsunomiya diff2_windspeed_chiba diff2_windspeed_tokyo diff2_winddirection_utsunomiya diff2_winddirection_chiba diff2_winddirection_tokyo shift3_precipitation_utsunomiya shift3_precipitation_chiba shift3_precipitation_tokyo shift3_temperature_utsunomiya shift3_temperature_chiba shift3_temperature_tokyo shift3_windspeed_utsunomiya shift3_windspeed_chiba shift3_windspeed_tokyo 
shift3_winddirection_utsunomiya shift3_winddirection_chiba shift3_winddirection_tokyo diff3_precipitation_utsunomiya diff3_precipitation_chiba diff3_precipitation_tokyo diff3_temperature_utsunomiya diff3_temperature_chiba diff3_temperature_tokyo diff3_windspeed_utsunomiya diff3_windspeed_chiba diff3_windspeed_tokyo diff3_winddirection_utsunomiya diff3_winddirection_chiba diff3_winddirection_tokyo ... winddirection_chiba_min_y winddirection_chiba_max_y winddirection_chiba_abs_mean_y winddirection_chiba_min_max_y winddirection_chiba_mean_x winddirection_chiba_std_x winddirection_chiba_min_x winddirection_chiba_max_x winddirection_chiba_abs_mean_x winddirection_chiba_min_max_x winddirection_chiba_mean_y winddirection_chiba_std_y winddirection_chiba_min_y winddirection_chiba_max_y winddirection_chiba_abs_mean_y winddirection_chiba_min_max_y winddirection_chiba_mean_x winddirection_chiba_std_x winddirection_chiba_min_x winddirection_chiba_max_x winddirection_chiba_abs_mean_x winddirection_chiba_min_max_x winddirection_chiba_mean_y winddirection_chiba_std_y winddirection_chiba_min_y winddirection_chiba_max_y winddirection_chiba_abs_mean_y winddirection_chiba_min_max_y winddirection_chiba_mean_x winddirection_chiba_std_x winddirection_chiba_min_x winddirection_chiba_max_x winddirection_chiba_abs_mean_x winddirection_chiba_min_max_x winddirection_chiba_mean_y winddirection_chiba_std_y winddirection_chiba_min_y winddirection_chiba_max_y winddirection_chiba_abs_mean_y winddirection_chiba_min_max_y winddirection_chiba_mean winddirection_chiba_std winddirection_chiba_min winddirection_chiba_max winddirection_chiba_abs_mean winddirection_chiba_min_max winddirection_tokyo_mean_x winddirection_tokyo_std_x winddirection_tokyo_min_x winddirection_tokyo_max_x winddirection_tokyo_abs_mean_x winddirection_tokyo_min_max_x winddirection_tokyo_mean_y winddirection_tokyo_std_y winddirection_tokyo_min_y winddirection_tokyo_max_y winddirection_tokyo_abs_mean_y winddirection_tokyo_min_max_y 
winddirection_tokyo_mean_x winddirection_tokyo_std_x winddirection_tokyo_min_x winddirection_tokyo_max_x winddirection_tokyo_abs_mean_x winddirection_tokyo_min_max_x winddirection_tokyo_mean_y winddirection_tokyo_std_y winddirection_tokyo_min_y winddirection_tokyo_max_y winddirection_tokyo_abs_mean_y winddirection_tokyo_min_max_y winddirection_tokyo_mean_x winddirection_tokyo_std_x winddirection_tokyo_min_x winddirection_tokyo_max_x winddirection_tokyo_abs_mean_x winddirection_tokyo_min_max_x winddirection_tokyo_mean_y winddirection_tokyo_std_y winddirection_tokyo_min_y winddirection_tokyo_max_y winddirection_tokyo_abs_mean_y winddirection_tokyo_min_max_y winddirection_tokyo_mean_x winddirection_tokyo_std_x winddirection_tokyo_min_x winddirection_tokyo_max_x winddirection_tokyo_abs_mean_x winddirection_tokyo_min_max_x winddirection_tokyo_mean_y winddirection_tokyo_std_y winddirection_tokyo_min_y winddirection_tokyo_max_y winddirection_tokyo_abs_mean_y winddirection_tokyo_min_max_y winddirection_tokyo_mean winddirection_tokyo_std winddirection_tokyo_min winddirection_tokyo_max winddirection_tokyo_abs_mean winddirection_tokyo_min_max
0 2017020106 0.0 0.0 0.0 -1.8 4.0 1.1 1 15 15 2.6 2.3 1.0 4.0 4.0 0.0 2017-02-01 2017 2 1 6 2 32 0.523416 0.852078 0.939228 -1.549500 -0.673696 0.0 0.0 0.0 -1.2 4.1 1.5 3.7 3.4 0.9 2.0 14.0 14.0 0.0 0.0 0.0 -0.6 -0.1 -0.4 -1.1 -1.1 0.1 -1.0 1.0 1.0 0.0 0.0 0.0 -1.1 4.4 1.8 4.1 3.1 1.4 1.0 15.0 1.0 0.0 0.0 0.0 -0.7 -0.4 -0.7 -1.5 -0.8 -0.4 0.0 0.0 14.0 0.0 0.0 0.0 -0.7 4.2 2.4 4.0 1.7 0.6 1.0 15.0 16.0 0.0 0.0 0.0 -1.1 -0.2 -1.3 -1.4 0.6 0.4 0.0 0.0 -1.0 ... 0.0 15.0 0.205139 0.0 0.152906 0.636507 0.0 8.5 0.152906 0.0 6.382730 6.217927 -6.5 28.3 6.382730 -183.95 9.277759 5.565696 -0.9 28.6 9.277759 -25.74 7.740775 5.979493 -4.0 29.6 7.740775 -118.40 3.165965 2.144589 0.1 12.9 3.165965 1.29 3.415754 1.668560 0.3 11.4 3.415754 3.42 1.376327 0.747432 0.0 4.1 1.376327 0.0 0.234670 0.812608 0.0 9.0 0.234670 0.0 0.369693 1.175232 0.0 13.5 0.369693 0.0 0.375590 1.554642 0.0 21.5 0.375590 0.0 8.883962 6.598716 -3.8 27.1 8.883962 -102.98 11.517571 6.004159 -0.1 28.2 11.517571 -2.82 10.191627 6.179070 -2.3 29.0 10.191627 -66.70 3.314151 2.200207 0.1 11.8 3.314151 1.18 3.909434 2.321321 0.2 15.3 3.909434 3.06 1.384552 0.718821 0.3 4.5 1.384552 1.35
1 2017020107 0.0 0.0 0.0 -2.1 3.7 0.7 16 15 14 2.9 1.8 1.3 0.0 12.0 0.0 2017-02-01 2017 2 1 7 2 32 0.523416 0.852078 -1.047601 -1.212652 -1.163712 0.0 0.0 0.0 -1.8 4.0 1.1 2.6 2.3 1.0 1.0 15.0 15.0 0.0 0.0 0.0 -0.3 -0.3 -0.4 0.3 -0.5 0.3 15.0 0.0 -1.0 0.0 0.0 0.0 -1.2 4.1 1.5 3.7 3.4 0.9 2.0 14.0 14.0 0.0 0.0 0.0 -0.9 -0.4 -0.8 -0.8 -1.6 0.4 14.0 1.0 0.0 0.0 0.0 0.0 -1.1 4.4 1.8 4.1 3.1 1.4 1.0 15.0 1.0 0.0 0.0 0.0 -1.0 -0.7 -1.1 -1.2 -1.3 -0.1 15.0 0.0 13.0 ... 0.0 15.0 0.205139 0.0 0.152906 0.636507 0.0 8.5 0.152906 0.0 6.382730 6.217927 -6.5 28.3 6.382730 -183.95 9.277759 5.565696 -0.9 28.6 9.277759 -25.74 7.740775 5.979493 -4.0 29.6 7.740775 -118.40 3.165965 2.144589 0.1 12.9 3.165965 1.29 3.415754 1.668560 0.3 11.4 3.415754 3.42 1.376327 0.747432 0.0 4.1 1.376327 0.0 0.221142 0.814820 0.0 11.5 0.221142 0.0 0.346902 1.430341 0.0 21.5 0.346902 0.0 0.351762 1.199612 0.0 14.0 0.351762 0.0 8.993439 6.411261 -6.5 27.5 8.993439 -178.75 11.705225 5.986661 -1.4 28.8 11.705225 -40.32 10.035966 6.177933 -2.9 29.7 10.035966 -86.13 3.147388 2.141114 0.1 13.9 3.147388 1.39 3.515796 2.327821 0.2 14.4 3.515796 2.88 1.365249 0.805476 0.3 5.1 1.365249 1.53
2 2017020108 0.0 0.0 0.0 -0.2 3.9 2.0 16 14 14 2.2 2.3 1.0 8.0 16.0 8.0 2017-02-01 2017 2 1 8 2 32 0.523416 0.852078 -0.794732 -2.058876 -0.895163 0.0 0.0 0.0 -2.1 3.7 0.7 2.9 1.8 1.3 16.0 15.0 14.0 0.0 0.0 0.0 1.9 0.2 1.3 -0.7 0.5 -0.3 0.0 -1.0 0.0 0.0 0.0 0.0 -1.8 4.0 1.1 2.6 2.3 1.0 1.0 15.0 15.0 0.0 0.0 0.0 1.6 -0.1 0.9 -0.4 0.0 0.0 15.0 -1.0 -1.0 0.0 0.0 0.0 -1.2 4.1 1.5 3.7 3.4 0.9 2.0 14.0 14.0 0.0 0.0 0.0 1.0 -0.2 0.5 -1.5 -1.1 0.1 14.0 0.0 0.0 ... 0.0 7.5 0.050968 0.0 0.034839 0.251762 0.0 4.0 0.034839 0.0 7.310581 5.953827 -4.6 27.8 7.310581 -127.88 10.326323 5.332706 -0.1 29.2 10.326323 -2.92 8.772774 5.742079 -4.3 30.1 8.772774 -129.43 3.823226 2.716188 0.2 15.9 3.823226 3.18 4.774323 2.622473 0.3 13.2 4.774323 3.96 1.637032 0.915166 0.0 5.1 1.637032 0.0 0.221142 0.814820 0.0 11.5 0.221142 0.0 0.346902 1.430341 0.0 21.5 0.346902 0.0 0.351762 1.199612 0.0 14.0 0.351762 0.0 8.993439 6.411261 -6.5 27.5 8.993439 -178.75 11.705225 5.986661 -1.4 28.8 11.705225 -40.32 10.035966 6.177933 -2.9 29.7 10.035966 -86.13 3.147388 2.141114 0.1 13.9 3.147388 1.39 3.515796 2.327821 0.2 14.4 3.515796 2.88 1.365249 0.805476 0.3 5.1 1.365249 1.53

3 rows × 557 columns

(336, 557)
datetime precipitation_utsunomiya precipitation_chiba precipitation_tokyo temperature_utsunomiya temperature_chiba temperature_tokyo winddirection_utsunomiya winddirection_chiba winddirection_tokyo windspeed_utsunomiya windspeed_chiba windspeed_tokyo pollen_utsunomiya pollen_chiba pollen_tokyo time year month day hour weekday day_of_year day_sin day_cos winddirection_utsunomiya_windspeed_utsunomiya winddirection_chiba_windspeed_chiba winddirection_tokyo_windspeed_tokyo shift1_precipitation_utsunomiya shift1_precipitation_chiba shift1_precipitation_tokyo shift1_temperature_utsunomiya shift1_temperature_chiba shift1_temperature_tokyo shift1_windspeed_utsunomiya shift1_windspeed_chiba shift1_windspeed_tokyo shift1_winddirection_utsunomiya shift1_winddirection_chiba shift1_winddirection_tokyo diff1_precipitation_utsunomiya diff1_precipitation_chiba diff1_precipitation_tokyo diff1_temperature_utsunomiya diff1_temperature_chiba diff1_temperature_tokyo diff1_windspeed_utsunomiya diff1_windspeed_chiba diff1_windspeed_tokyo diff1_winddirection_utsunomiya diff1_winddirection_chiba diff1_winddirection_tokyo shift2_precipitation_utsunomiya shift2_precipitation_chiba shift2_precipitation_tokyo shift2_temperature_utsunomiya shift2_temperature_chiba shift2_temperature_tokyo shift2_windspeed_utsunomiya shift2_windspeed_chiba shift2_windspeed_tokyo shift2_winddirection_utsunomiya shift2_winddirection_chiba shift2_winddirection_tokyo diff2_precipitation_utsunomiya diff2_precipitation_chiba diff2_precipitation_tokyo diff2_temperature_utsunomiya diff2_temperature_chiba diff2_temperature_tokyo diff2_windspeed_utsunomiya diff2_windspeed_chiba diff2_windspeed_tokyo diff2_winddirection_utsunomiya diff2_winddirection_chiba diff2_winddirection_tokyo shift3_precipitation_utsunomiya shift3_precipitation_chiba shift3_precipitation_tokyo shift3_temperature_utsunomiya shift3_temperature_chiba shift3_temperature_tokyo shift3_windspeed_utsunomiya shift3_windspeed_chiba shift3_windspeed_tokyo 
shift3_winddirection_utsunomiya shift3_winddirection_chiba shift3_winddirection_tokyo diff3_precipitation_utsunomiya diff3_precipitation_chiba diff3_precipitation_tokyo diff3_temperature_utsunomiya diff3_temperature_chiba diff3_temperature_tokyo diff3_windspeed_utsunomiya diff3_windspeed_chiba diff3_windspeed_tokyo diff3_winddirection_utsunomiya diff3_winddirection_chiba diff3_winddirection_tokyo ... winddirection_chiba_min_y winddirection_chiba_max_y winddirection_chiba_abs_mean_y winddirection_chiba_min_max_y winddirection_chiba_mean_x winddirection_chiba_std_x winddirection_chiba_min_x winddirection_chiba_max_x winddirection_chiba_abs_mean_x winddirection_chiba_min_max_x winddirection_chiba_mean_y winddirection_chiba_std_y winddirection_chiba_min_y winddirection_chiba_max_y winddirection_chiba_abs_mean_y winddirection_chiba_min_max_y winddirection_chiba_mean_x winddirection_chiba_std_x winddirection_chiba_min_x winddirection_chiba_max_x winddirection_chiba_abs_mean_x winddirection_chiba_min_max_x winddirection_chiba_mean_y winddirection_chiba_std_y winddirection_chiba_min_y winddirection_chiba_max_y winddirection_chiba_abs_mean_y winddirection_chiba_min_max_y winddirection_chiba_mean_x winddirection_chiba_std_x winddirection_chiba_min_x winddirection_chiba_max_x winddirection_chiba_abs_mean_x winddirection_chiba_min_max_x winddirection_chiba_mean_y winddirection_chiba_std_y winddirection_chiba_min_y winddirection_chiba_max_y winddirection_chiba_abs_mean_y winddirection_chiba_min_max_y winddirection_chiba_mean winddirection_chiba_std winddirection_chiba_min winddirection_chiba_max winddirection_chiba_abs_mean winddirection_chiba_min_max winddirection_tokyo_mean_x winddirection_tokyo_std_x winddirection_tokyo_min_x winddirection_tokyo_max_x winddirection_tokyo_abs_mean_x winddirection_tokyo_min_max_x winddirection_tokyo_mean_y winddirection_tokyo_std_y winddirection_tokyo_min_y winddirection_tokyo_max_y winddirection_tokyo_abs_mean_y winddirection_tokyo_min_max_y 
winddirection_tokyo_mean_x winddirection_tokyo_std_x winddirection_tokyo_min_x winddirection_tokyo_max_x winddirection_tokyo_abs_mean_x winddirection_tokyo_min_max_x winddirection_tokyo_mean_y winddirection_tokyo_std_y winddirection_tokyo_min_y winddirection_tokyo_max_y winddirection_tokyo_abs_mean_y winddirection_tokyo_min_max_y winddirection_tokyo_mean_x winddirection_tokyo_std_x winddirection_tokyo_min_x winddirection_tokyo_max_x winddirection_tokyo_abs_mean_x winddirection_tokyo_min_max_x winddirection_tokyo_mean_y winddirection_tokyo_std_y winddirection_tokyo_min_y winddirection_tokyo_max_y winddirection_tokyo_abs_mean_y winddirection_tokyo_min_max_y winddirection_tokyo_mean_x winddirection_tokyo_std_x winddirection_tokyo_min_x winddirection_tokyo_max_x winddirection_tokyo_abs_mean_x winddirection_tokyo_min_max_x winddirection_tokyo_mean_y winddirection_tokyo_std_y winddirection_tokyo_min_y winddirection_tokyo_max_y winddirection_tokyo_abs_mean_y winddirection_tokyo_min_max_y winddirection_tokyo_mean winddirection_tokyo_std winddirection_tokyo_min winddirection_tokyo_max winddirection_tokyo_abs_mean winddirection_tokyo_min_max
0 2020040101 0.0 0.0 0.0 9.5 10.5 9.0 14 2 14 2.1 2.3 1.2 0.0 0.0 0.0 2020-04-01 2020 4 1 1 2 91 0.999991 0.004304 -1.879843 1.549500 -1.074196 0.0 0.0 0.0 9.7 10.7 8.9 1.0 2.7 0.4 16.0 1.0 16.0 0.0 0.0 0.0 -0.2 -0.2 0.1 1.1 -0.4 0.8 -2.0 1.0 -2.0 0.5 0.0 0.0 9.7 10.9 8.9 0.5 2.9 0.6 16.0 16.0 1.0 -0.5 0.0 0.0 -0.2 -0.4 0.1 1.6 -0.6 0.6 -2.0 -14.0 13.0 0.0 0.0 0.0 9.8 11.3 8.8 1.2 2.7 0.9 3.0 15.0 15.0 0.0 0.0 0.0 -0.3 -0.8 0.2 0.9 -0.4 0.3 11.0 -13.0 -1.0 ... 0.0 11.5 0.209054 0.0 0.213123 0.830675 0.0 10.0 0.213123 0.0 11.105697 6.529016 -6.1 27.5 11.105697 -167.75 12.908850 5.756066 -1.4 27.5 12.908850 -38.5 12.245575 6.104280 -3.0 28.4 12.245575 -85.2 2.683316 1.556318 0.0 11.2 2.683316 0.00 3.287080 1.379972 0.3 8.2 3.287080 2.46 1.478332 0.817158 0.0 4.7 1.478332 0.0 0.221142 0.81482 0.0 11.5 0.221142 0.0 0.346902 1.430341 0.0 21.5 0.346902 0.0 0.351762 1.199612 0.0 14.0 0.351762 0.0 8.993439 6.411261 -6.5 27.5 8.993439 -178.75 11.705225 5.986661 -1.4 28.8 11.705225 -40.32 10.035966 6.177933 -2.9 29.7 10.035966 -86.13 3.147388 2.141114 0.1 13.9 3.147388 1.39 3.515796 2.327821 0.2 14.4 3.515796 2.88 1.365249 0.805476 0.3 5.1 1.365249 1.53
1 2020040102 0.0 0.0 0.0 9.2 10.3 9.0 2 16 14 1.4 2.7 0.8 0.0 0.0 0.0 2020-04-01 2020 4 1 2 2 91 0.999991 0.004304 0.943174 -0.975352 -0.716131 0.0 0.0 0.0 9.5 10.5 9.0 2.1 2.3 1.2 14.0 2.0 14.0 0.0 0.0 0.0 -0.3 -0.2 0.0 -0.7 0.4 -0.4 -12.0 14.0 0.0 0.0 0.0 0.0 9.7 10.7 8.9 1.0 2.7 0.4 16.0 1.0 16.0 0.0 0.0 0.0 -0.5 -0.4 0.1 0.4 0.0 0.4 -14.0 15.0 -2.0 0.5 0.0 0.0 9.7 10.9 8.9 0.5 2.9 0.6 16.0 16.0 1.0 -0.5 0.0 0.0 -0.5 -0.6 0.1 0.9 -0.2 0.2 -14.0 0.0 13.0 ... 0.0 9.5 0.292402 0.0 0.248982 0.832885 0.0 6.5 0.248982 0.0 7.495387 6.633393 -4.7 27.5 7.495387 -129.25 9.779376 5.937938 -0.2 29.5 9.779376 -5.9 8.547490 6.377572 -4.0 30.6 8.547490 -122.4 3.129715 1.955545 0.2 11.6 3.129715 2.32 2.229851 1.017832 0.3 7.1 2.229851 2.13 1.297015 0.680167 0.0 4.2 1.297015 0.0 0.221142 0.81482 0.0 11.5 0.221142 0.0 0.346902 1.430341 0.0 21.5 0.346902 0.0 0.351762 1.199612 0.0 14.0 0.351762 0.0 8.993439 6.411261 -6.5 27.5 8.993439 -178.75 11.705225 5.986661 -1.4 28.8 11.705225 -40.32 10.035966 6.177933 -2.9 29.7 10.035966 -86.13 3.147388 2.141114 0.1 13.9 3.147388 1.39 3.515796 2.327821 0.2 14.4 3.515796 2.88 1.365249 0.805476 0.3 5.1 1.365249 1.53
2 2020040103 0.0 0.0 0.0 9.2 10.2 9.1 16 16 12 3.3 2.5 0.5 0.0 0.0 0.0 2020-04-01 2020 4 1 3 2 91 0.999991 0.004304 -1.192097 -0.903104 -0.480913 0.0 0.0 0.0 9.2 10.3 9.0 1.4 2.7 0.8 2.0 16.0 14.0 0.0 0.0 0.0 0.0 -0.1 0.1 1.9 -0.2 -0.3 14.0 0.0 -2.0 0.0 0.0 0.0 9.5 10.5 9.0 2.1 2.3 1.2 14.0 2.0 14.0 0.0 0.0 0.0 -0.3 -0.3 0.1 1.2 0.2 -0.7 2.0 14.0 -2.0 0.0 0.0 0.0 9.7 10.7 8.9 1.0 2.7 0.4 16.0 1.0 16.0 0.0 0.0 0.0 -0.5 -0.5 0.2 2.3 -0.2 0.1 0.0 15.0 -4.0 ... 0.0 9.5 0.292402 0.0 0.248982 0.832885 0.0 6.5 0.248982 0.0 7.495387 6.633393 -4.7 27.5 7.495387 -129.25 9.779376 5.937938 -0.2 29.5 9.779376 -5.9 8.547490 6.377572 -4.0 30.6 8.547490 -122.4 3.129715 1.955545 0.2 11.6 3.129715 2.32 2.229851 1.017832 0.3 7.1 2.229851 2.13 1.297015 0.680167 0.0 4.2 1.297015 0.0 0.031306 0.41737 0.0 9.0 0.031306 0.0 0.039356 0.504699 0.0 11.0 0.039356 0.0 0.075134 1.035953 0.0 23.5 0.075134 0.0 8.798748 7.142278 -5.0 28.8 8.798748 -144.00 12.214311 6.118142 0.0 27.1 12.214311 0.00 9.931664 6.817144 -4.0 31.2 9.931664 -124.80 2.359034 1.559301 0.1 11.8 2.359034 1.18 2.782826 2.078830 0.2 14.4 2.782826 2.88 1.005546 0.538471 0.3 4.8 1.005546 1.44

3 rows × 557 columns

モデルの設定

from sklearn.metrics import mean_absolute_error as mae

import lightgbm as lgb

import os
import random
import tensorflow as tf
from tqdm.notebook import tqdm

import warnings
warnings.filterwarnings('ignore')

# Run-wide parameters: random seed and whether the model cells draw plots.
seed = 42
plot_mode = False


def set_seed(seed):
    """Seed Python's hash seed variable, `random`, NumPy and TensorFlow."""
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed, tf.random.set_seed):
        seeder(seed)
# LightGBM
class ModelLgb:
    """Small wrapper around ``lgb.train`` for MAE-evaluated regression.

    With a validation set the booster is trained for up to 10000 rounds with
    early stopping (patience 50); without one it is trained for a fixed 100
    rounds. The random seed is read from the module-level ``seed``.
    """

    def __init__(self, plot: bool):
        # Trained booster; stays None until fit() has been called.
        self.model = None
        # When True, fit() draws a bar chart of the 30 most important features.
        self.plot = plot

    @staticmethod
    def _quiet_train_kwargs(callbacks):
        """Return ``lgb.train`` keyword arguments that silence per-round logs.

        ``verbose_eval`` was removed from ``lgb.train`` in LightGBM 4.0, so the
        logging callback is used whenever the installed version provides it;
        older versions fall back to the legacy keyword.
        """
        if hasattr(lgb, 'log_evaluation'):
            return {'callbacks': [*callbacks, lgb.log_evaluation(period=0)]}
        return {'callbacks': list(callbacks), 'verbose_eval': 0}

    def fit(self, tr_x, tr_y, va_x=None, va_y=None):
        """Train the booster.

        Args:
            tr_x: training features (a DataFrame; its columns label the
                importance plot).
            tr_y: training target.
            va_x: optional validation features; enables early stopping.
            va_y: validation target, required when ``va_x`` is given.
        """
        params = {
            'objective': 'regression',
            'boosting': 'gbdt',
            'metric': 'mae',
            'seed': seed,
            'verbosity': -1,
            'learning_rate': 0.1,
        }

        num_round = 10000
        early_stopping_rounds = 50
        rounds_without_validation = 100

        lgb_train = lgb.Dataset(tr_x, tr_y)
        if va_x is not None:
            # Validation available: stop once the validation MAE has not
            # improved for `early_stopping_rounds` rounds.
            lgb_eval = lgb.Dataset(va_x, va_y)
            stopper = lgb.early_stopping(stopping_rounds=early_stopping_rounds, verbose=False)
            self.model = lgb.train(
                params,
                lgb_train,
                valid_sets=[lgb_eval],
                num_boost_round=num_round,
                **self._quiet_train_kwargs([stopper]),
            )
        else:
            # No validation: fixed number of rounds.
            self.model = lgb.train(
                params,
                lgb_train,
                num_boost_round=rounds_without_validation,
                **self._quiet_train_kwargs([]),
            )

        # Plot the 30 most important features.
        if self.plot:
            f_importance = np.array(self.model.feature_importance())
            df_importance = pd.DataFrame({'feat': tr_x.columns, 'importance': f_importance})
            df_importance = df_importance.sort_values('importance', ascending=True)
            plt.figure(figsize=(8, 12))
            plt.barh('feat', 'importance', data=df_importance.iloc[-30:])
            plt.show()

    def predict(self, x):
        """Predict with the trained booster, passing its best iteration."""
        pred = self.model.predict(x, num_iteration=self.model.best_iteration)
        return pred

学習と予測の実行

予測するTargetについて

# Histogram of each target over the 0-50 range, one panel per site.
plot_col = ['pollen_utsunomiya', 'pollen_chiba', 'pollen_tokyo']
color = ['red','green','blue']
ncols = len(plot_col)
fig, axes = plt.subplots(1, ncols, sharey=True, sharex=True, figsize=(30, 5))
for ax, col, shade in zip(axes, plot_col, color):
    # Plot the column of the current panel (previously every panel showed
    # pollen_chiba regardless of its label).
    train_df[col].hist(range=(0,50), bins=50, alpha=1, color=shade, label=col, ax=ax)
    ax.legend()
plt.show()

Targetの値は(特に小さい値の範囲では)ほとんどが4の倍数になっているため、予測後の後処理で4の倍数に揃えるとよさそう

提出後のLBを比較する

単純な予測と、定数値(0・4・8)のみを提出した場合のスコアを比較

target_columns = ['pollen_utsunomiya', 'pollen_chiba', 'pollen_tokyo']

# Sub1 (simple prediction)
# For each target, train one model per year on everything before April 1 of
# that year, validate on April 1-14 of that year, and average the three test
# predictions. Targets are modelled in units of 4 and scaled back afterwards.
results_sub1 = dict()
set_seed(seed)

# The frames and the feature list do not change between iterations.
train_tmp = train_df.copy()
test_tmp = test_df.copy()
feature_columns = [c for c in train_tmp.columns if c not in target_columns if c not in ['datetime', 'time', 'year', 'month', 'weekday']]

for tcol in tqdm(target_columns):
    test_preds = []
    for year in [2017, 2018, 2019]:
        # train / validation split for this year
        tr_df = train_tmp[train_tmp['datetime']<year*1e6+40101]
        va_df = train_tmp[(train_tmp['year']==year)&(train_tmp['month']==4)&(train_tmp['day']<15)]
        # training
        model = ModelLgb(plot=plot_mode)
        model.fit(tr_df[feature_columns], tr_df[tcol]/4, va_df[feature_columns], va_df[tcol]/4)
        # predict; negative predictions are clipped to zero
        test_pred = model.predict(test_tmp[feature_columns]).reshape(-1)
        test_preds.append(np.clip(test_pred, 0, None))
    # average over the three yearly models
    results_sub1[tcol] = np.mean(test_preds, axis=0)

sub[target_columns] = pd.DataFrame(results_sub1).round()*4
display(sub.head(3))
sub.to_csv(os.path.join(OUTPUT, "sub1.csv"), index=False)

# Sub2 / Sub3 / Sub4: constant submissions (0 only, 4 only, 8 only)
for file_name, constant in [("sub2.csv", 0), ("sub3.csv", 4), ("sub4.csv", 8)]:
    sub[target_columns] = constant
    display(sub.head(3))
    sub.to_csv(os.path.join(OUTPUT, file_name), index=False)
  0%|          | 0/3 [00:00<?, ?it/s]
datetime pollen_utsunomiya pollen_chiba pollen_tokyo
0 2020040101 84.0 32.0 32.0
1 2020040102 80.0 36.0 32.0
2 2020040103 80.0 36.0 32.0
datetime pollen_utsunomiya pollen_chiba pollen_tokyo
0 2020040101 0 0 0
1 2020040102 0 0 0
2 2020040103 0 0 0
datetime pollen_utsunomiya pollen_chiba pollen_tokyo
0 2020040101 4 4 4
1 2020040102 4 4 4
2 2020040103 4 4 4
datetime pollen_utsunomiya pollen_chiba pollen_tokyo
0 2020040101 8 8 8
1 2020040102 8 8 8
2 2020040103 8 8 8

各ファイルのPublic LBスコアは以下のようになった

  • Sub1(簡易の予測) LB:49.44279
  • Sub2(0のみ) LB:14.75622
  • Sub3(4のみ) LB:13.86070
  • Sub4(8のみ) LB:14.47761
    この結果からも分かるように2020年の結果は過去の分布と異なり、小さい値の範囲に分布している可能性が高い
    決定木で予測する場合は過去の分布をそのまま参考に学習する方法ではうまくいかないため、別の方法を模索してみる

補正値を設定

過去のデータからも学習できるように、目的変数が閾値(補正値)を超える行を除外して学習する方法を試してみる
2017〜2019年の各4月前半を検証データとし、補正値を4刻み(4〜36)で変えながら学習する -> trainに含まれる2020年のデータに対する予測誤差(MAE)から閾値を推定

# Summary statistics of the targets over the training rows:
# Feb-Mar (2020 excluded) versus the first half of April.
# Masks are built from train_df itself rather than from a frame left over
# from an earlier cell's loop.
is_feb_mar = (train_df['year'] != 2020) & train_df['month'].isin([2, 3])
is_early_april = (train_df['month'] == 4) & (train_df['day'] < 15)
print('2-3月\n', train_df.loc[is_feb_mar, target_columns].describe())
print('4月前半\n', train_df.loc[is_early_april, target_columns].describe())
2-3月
        pollen_utsunomiya  pollen_chiba  pollen_tokyo
count        4185.000000   4185.000000   4185.000000
mean          170.580167     53.825806     44.210275
std           542.361827    157.080851    110.309489
min             0.000000      0.000000      0.000000
25%             8.000000      4.000000      4.000000
50%            28.000000     12.000000     12.000000
75%           122.000000     45.000000     36.000000
max         12193.000000   4141.000000   2209.000000
4月前半
        pollen_utsunomiya  pollen_chiba  pollen_tokyo
count        1008.000000   1008.000000   1008.000000
mean          152.183532     46.733135     46.874008
std           341.681206    104.334555     88.650295
min             0.000000      0.000000      0.000000
25%            20.000000      4.000000      4.000000
50%            57.000000     16.000000     16.000000
75%           151.000000     45.000000     49.000000
max          5629.000000   2119.000000    746.000000
# run
# Search for the cut-off value of each target: for every candidate q, train on
# rows whose target is <= q and measure the MAE on the 2020 rows of the
# training data.
target_columns = ['pollen_utsunomiya', 'pollen_chiba', 'pollen_tokyo']

# Candidate cut-offs: 4, 8, ..., 36.
q_range = np.arange(4, 40, 4).round()
# Mean MAE per target, one entry per candidate in q_range order.
scores = {'pollen_utsunomiya':[], 'pollen_chiba':[], 'pollen_tokyo':[]}
for q in tqdm(q_range):
    # NOTE(review): `score` is never read in this cell.
    score = []
    for tcol in target_columns:
        # Re-seed for every (q, target) pair.
        set_seed(seed)
        train_tmp = train_df.copy()

        # Everything except the targets and the listed time columns is a feature.
        feature_columns = [c for c in train_tmp.columns if c not in target_columns if c not in ['datetime', 'time', 'year', 'month', 'weekday']]

        test_preds = []
        losses = []
        for i, year in enumerate([2017, 2018, 2019]):
            # Train on everything before April 1 of `year`, validate on
            # April 1-14 of `year`, evaluate on the 2020 rows.
            tr_df = train_tmp[train_tmp['datetime']<year*1e6+40101]
            va_df = train_tmp[(train_tmp['year']==year)&(train_tmp['month']==4)&(train_tmp['day']<15)]
            te_df = train_tmp[(train_tmp['year']==2020)]

            # Correction: keep only training/validation rows whose target does
            # not exceed the candidate cut-off (the 2020 rows are not filtered).
            qth = q   
            tr_df = tr_df[tr_df[tcol] <= qth].reset_index(drop=True)
            va_df = va_df[va_df[tcol] <= qth].reset_index(drop=True)

            # train / validation / test
            tr_x = tr_df[feature_columns]
            tr_y = tr_df[tcol]
            va_x = va_df[feature_columns]
            va_y = va_df[tcol]
            test_x = te_df[feature_columns]
            test_y = te_df[tcol]

            # training
            model = ModelLgb(plot=plot_mode)
            model.fit(tr_x, tr_y, va_x, va_y)
            # predict
            test_pred = model.predict(test_x)
            test_pred = np.where(test_pred < 0, 0, test_pred) # post-processing: clip negatives to 0
            test_preds.append(test_pred)
            # loss
            test_loss = mae(test_y, test_pred)
            losses.append(test_loss)
        # mean loss over the three yearly models
        mean_loss = np.mean(losses)
        scores[tcol].append(mean_loss)

# plot
# Table of mean MAE per cut-off (minimum of each column highlighted), then one
# line plot per target.
df_qth = pd.DataFrame(scores).set_index(q_range)
display(df_qth.style.highlight_min())
plot_col = ['pollen_utsunomiya', 'pollen_chiba', 'pollen_tokyo']
color = ['red','green','blue']
ncols = len(plot_col)
plt.subplots(1, ncols, sharey=True, sharex=True, figsize=(30, 5))
for i, col in enumerate(plot_col):
    plt.subplot(1, ncols, i+1)
    plt.plot(df_qth.index, df_qth[col], alpha=1, color=color[i], label=col)
    plt.xlabel(col)
    plt.legend()
    plt.grid()
plt.show()
  0%|          | 0/9 [00:00<?, ?it/s]
  pollen_utsunomiya pollen_chiba pollen_tokyo
4 25.270670 16.169495 12.431046
8 24.584037 15.180790 11.975406
12 23.421559 14.710188 11.983310
16 23.221837 14.546360 12.066856
20 22.754685 14.516001 12.376283
24 22.371299 14.574958 12.675389
28 22.328795 14.691928 13.030755
32 22.505439 14.739105 13.535812
36 22.616658 14.950690 13.818950
# Per-target cut-off used by the training cell that reads `vq` (rows whose
# target exceeds it are dropped); set with the leaderboard (LB) score also
# taken into account.
vq = {'pollen_utsunomiya':20, 'pollen_chiba':20, 'pollen_tokyo':8}

モデルの学習

testと同じ期間を時系列順に検証データとする

# run
# Final training: for each target, train one model per year with the cut-off
# from `vq` applied, report the validation MAE and average the test predictions.
target_columns = ['pollen_utsunomiya', 'pollen_chiba', 'pollen_tokyo']
plot_mode = True
# Averaged test prediction per target (in units of target/4).
results = dict()
# Mean validation loss per target.
score = []

set_seed(seed)

for tcol in tqdm(target_columns):
    print('='*10+tcol+'='*10)

    train_tmp = train_df.copy()
    test_tmp = test_df.copy()

    va_preds = []
    test_preds = []
    losses = []
    for i, year in enumerate([2017, 2018, 2019]):
        print(f'<year : {year}>')    
        # Train on everything before April 1 of `year`; validate on April 1-14 of `year`.
        tr_df = train_tmp[train_tmp['datetime']<year*1e6+40101]
        va_df = train_tmp[(train_tmp['year']==year)&(train_tmp['month']==4)&(train_tmp['day']<15)]

        # Everything except the targets and the listed time columns is a feature.
        feature_columns = [c for c in tr_df.columns if c not in target_columns if c not in ['datetime', 'time', 'year', 'month', 'weekday']]

        # Correction: keep only rows whose target does not exceed this target's cut-off.
        qth = vq[tcol]
        tr_df = tr_df[tr_df[tcol] <= qth].reset_index(drop=True)
        va_df = va_df[va_df[tcol] <= qth].reset_index(drop=True)

        # train / validation / test
        # Targets are divided by 4 before training.
        tr_x = tr_df[feature_columns]
        tr_y = tr_df[tcol]/4
        va_x = va_df[feature_columns]
        va_y = va_df[tcol]/4
        test_x = test_tmp[feature_columns]

        # training
        model = ModelLgb(plot=plot_mode)
        model.fit(tr_x, tr_y, va_x, va_y)

        # valid / test predict
        va_pred = model.predict(va_x).reshape(-1)
        va_pred = np.where(va_pred < 0, 0, va_pred) # post-processing: clip negatives to 0
        va_preds.append(va_pred)
        test_pred = model.predict(test_x).reshape(-1)
        test_pred = np.where(test_pred < 0, 0, test_pred) # post-processing: clip negatives to 0
        test_preds.append(test_pred)

        # valid loss (MAE on the filtered validation rows, in units of target/4)
        va_loss = mae(va_y.values, va_pred)
        print(f'LOSS : {va_loss}')
        losses.append(va_loss)

        # plot valid / pred
        if plot_mode:
            plt.figure(figsize=(10,5))
            plt.plot(va_y.values, label='original', linestyle='-')
            plt.plot(va_pred, label='pred', linestyle='-')
            plt.title(f'{tcol} : {va_loss}')
            plt.legend()
            plt.show()
    
    # preds: average of the three yearly models
    preds = np.mean(test_preds, axis=0)
    
    # mean loss
    mean_loss = np.mean(losses)
    print(f'Mean LOSS : {mean_loss}\n')

    # save per target
    results[tcol] = preds
    score.append(mean_loss)

# score: mean of the per-target mean validation losses
print(f'Score : {np.array(score).mean()}') 

# Score : 1.11005112558556
  0%|          | 0/3 [00:00<?, ?it/s]
==========pollen_utsunomiya==========
<year : 2017>
LOSS : 1.1938153630467006
<year : 2018>
LOSS : 1.4568659271147195
<year : 2019>
LOSS : 1.5079497032585705
Mean LOSS : 1.3862103311399967

==========pollen_chiba==========
<year : 2017>
LOSS : 1.0321580800023542
<year : 2018>
LOSS : 1.3894891307069823
<year : 2019>
LOSS : 1.3890817102532345
Mean LOSS : 1.2702429736541903

==========pollen_tokyo==========
<year : 2017>