kotrying
以下のように構成します
MyDrive
├<pollen_counts>
├<notebook>
│ └run.ipynb
├<input>
│ ├train.csv
│ ├submission.csv
│ └test.csv
└<output>
# Library
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from tqdm.auto import tqdm
import warnings
warnings.simplefilter('ignore')
# mount
from google.colab import drive

# Only call mount when the mount point is not present yet (e.g. the cell is
# executed again in a session where Drive is already attached).
drive_root = '/content/drive'
if not os.path.isdir(drive_root):
    drive.mount(drive_root)
Mounted at /content/drive
# Config
# Project folder on Google Drive; the input and output folders sit directly under it.
DRIVE_PATH = "/content/drive/MyDrive/ML/PROBSPACE/pollen_counts"
INPUT, OUTPUT = (os.path.join(DRIVE_PATH, sub_dir) for sub_dir in ("input", "output"))
TRAIN_FILE, TEST_FILE, SUB_FILE = (
    os.path.join(INPUT, csv_name)
    for csv_name in ("train.csv", "test.csv", "submission.csv")
)
# Experiment label and global random seed.
exp_name, seed = 'exp000', 42
# plot style
# Let pandas display up to 200 columns of a wide frame.
pd.options.display.max_columns = 200
# Background colour of every axes, given as hex RGB.
plt.rcParams.update({'axes.facecolor': 'EEFFFE'})
# Data
# Read the training table, the test table and the submission file.
train, test, sub = (
    pd.read_csv(csv_path) for csv_path in (TRAIN_FILE, TEST_FILE, SUB_FILE)
)
以下はEDAでの処理をベースとする
https://comp.probspace.com/competitions/pollen_counts/discussions/kotrying-Post50e89902d5f42593d900
# object columns holding the '欠測' (missing) marker -> float
import lightgbm as lgb
from lightgbm import LGBMRegressor
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# Columns that are rounded after imputation: whole numbers / one decimal place.
whole_number_cols = ['winddirection_chiba', 'winddirection_tokyo']
one_decimal_cols = [
    'precipitation_tokyo',
    'temperature_chiba', 'temperature_tokyo',
    'windspeed_chiba', 'windspeed_tokyo',
]

# Replace the missing marker with NaN so the imputer treats it as a gap.
train_df = train.replace('欠測', np.nan)

# IterativeImputer with a LightGBM regressor as its estimator.
imputer_estimator = LGBMRegressor(num_boost_round=1000, random_state=seed)
lgb_imp = IterativeImputer(
    estimator=imputer_estimator,
    max_iter=10,
    initial_strategy='mean',
    imputation_order='ascending',
    verbose=1,
    random_state=seed,
)
imputed_values = lgb_imp.fit_transform(train_df)
train_df = pd.DataFrame(imputed_values, columns=train_df.columns)

train_df[whole_number_cols] = train_df[whole_number_cols].round().astype(int)
train_df[one_decimal_cols] = train_df[one_decimal_cols].round(1)

# Copy the imputed values back into the columns that were loaded with object dtype.
object_cols = train.select_dtypes(object).columns
train[object_cols] = train_df[object_cols]
train.head(3)
[IterativeImputer] Completing matrix with shape (12168, 16) [IterativeImputer] Change: 8.828120105598833, scaled tolerance: 2020033.124 [IterativeImputer] Early stopping criterion reached.
datetime | precipitation_utsunomiya | precipitation_chiba | precipitation_tokyo | temperature_utsunomiya | temperature_chiba | temperature_tokyo | winddirection_utsunomiya | winddirection_chiba | winddirection_tokyo | windspeed_utsunomiya | windspeed_chiba | windspeed_tokyo | pollen_utsunomiya | pollen_chiba | pollen_tokyo | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2017020101 | 0.0 | 0.0 | 0.0 | -1.0 | 4.1 | 2.9 | 16 | 1 | 2 | 2.7 | 2.5 | 1.3 | 0.0 | 8.0 | 0.0 |
1 | 2017020102 | 0.0 | 0.0 | 0.0 | -1.1 | 4.2 | 2.6 | 1 | 1 | 1 | 3.3 | 1.5 | 0.9 | 0.0 | 24.0 | 4.0 |
2 | 2017020103 | 0.0 | 0.0 | 0.0 | -0.7 | 4.2 | 2.4 | 1 | 15 | 16 | 4.0 | 1.7 | 0.6 | 4.0 | 32.0 | 12.0 |
# 基本時間特徴
def add_time_feat(df):
    """Add calendar features derived from the ``datetime`` column.

    The last two characters of ``datetime`` (as a string) are read as the
    hour; everything before them is parsed as a date with ``pd.to_datetime``.

    Columns written to ``df`` (the frame is modified in place and returned):
    ``time``, ``year``, ``month``, ``day``, ``hour``, ``weekday``,
    ``day_of_year``, ``day_sin``, ``day_cos``.

    ``day_of_year`` is reduced by one for dates after February of a leap
    year, so that the same calendar date from March onwards maps to the same
    value in every year (February 29 shares the value of March 1).
    """
    stamp = df['datetime'].astype(str)
    df['time'] = pd.to_datetime(stamp.str[:-2])
    when = df['time'].dt

    df['year'] = when.year
    df['month'] = when.month
    df['day'] = when.day
    # The hour is taken from the raw string, not from `time`, which holds the date only.
    df['hour'] = stamp.str[-2:].astype(int)
    df['weekday'] = when.weekday

    # Leap-day correction for any leap year (vectorised; the previous version
    # only handled 2020 and shifted every date after 2020-02-29, including
    # dates in later non-leap years).
    after_leap_day = when.is_leap_year & (when.month > 2)
    df['day_of_year'] = when.dayofyear - after_leap_day.astype(int)

    # Cyclic encoding of the position within a 365-day year.
    angle = df['day_of_year'] * (2 * np.pi / 365)
    df['day_sin'] = np.sin(angle)
    df['day_cos'] = np.cos(angle)
    return df
# ラグ特徴
def add_lag_feat(df, feat:list, group:str):
    """Append per-group lag, difference and rolling-mean features.

    Args:
        df: input frame; its index is expected to hold unique labels.
        feat: names of the columns to derive features from.
        group: name of the column to group by; shifts, differences and
            rolling windows never cross a group boundary.

    Returns:
        A new frame made of ``df`` followed by, for every column in ``feat``:
        ``shift{lag}_<col>`` and ``diff{lag}_<col>`` for lag 1..5, and
        ``rolling{window}_mean_<col>`` for windows 3 and 24
        (``min_periods=1``, so the first rows of a group use a shorter window).
    """
    outputs = [df]
    grouped = df.groupby(group)[feat]

    for lag in (1, 2, 3, 4, 5):
        # shift
        outputs.append(grouped.shift(lag).add_prefix(f'shift{lag}_'))
        # diff
        outputs.append(grouped.diff(lag).add_prefix(f'diff{lag}_'))

    # rolling
    for window in (3, 24):
        rolled = grouped.rolling(window, min_periods=1).mean()
        # The grouped rolling result is indexed by (group key, original label)
        # and ordered by group. Keep only the original label and realign to
        # df's row order; renumbering the rows 0..n-1 instead would attach the
        # values to the wrong rows whenever df is not already sorted by group.
        rolled.index = rolled.index.get_level_values(-1)
        rolled = rolled.reindex(df.index)
        outputs.append(rolled.add_prefix(f'rolling{window}_mean_'))

    return pd.concat(outputs, axis=1)
# 集計特徴
def additional_encoding(train, test, cat_col:list, num_col:list):
    """Add count-encoding and group-aggregate features to train and test.

    All statistics are computed from ``train`` only and then mapped/merged
    onto both frames; the inputs themselves are not modified.

    Args:
        train: training frame; must contain ``month`` and ``day`` plus every
            column named in ``cat_col`` and ``num_col``.
        test: test frame containing the columns named in ``cat_col``.
        cat_col: categorical columns used as encoding / grouping keys.
        num_col: numeric columns aggregated within each category.

    Returns:
        ``(train_out, test_out)`` with these columns added:
        ``ce_<cat>`` - how often the category value occurs among the train
        rows with ``month == 4`` and ``day < 15`` (NaN when it never does);
        ``<cat>_<num>_<stat>`` - ``mean``, ``std``, ``min``, ``max``,
        ``abs_mean`` (absolute value of the mean) and ``min_max``
        (min multiplied by max) of ``<num>`` within each ``<cat>`` value.
    """
    trdf = train.copy()
    tedf = test.copy()

    # Count Encoding (counts come from the April 1-14 rows of train only)
    early_april = trdf[(trdf['month'] == 4) & (trdf['day'] < 15)]
    for ccol in cat_col:
        encoder = early_april[ccol].value_counts()
        trdf[f'ce_{ccol}'] = trdf[ccol].map(encoder)
        tedf[f'ce_{ccol}'] = tedf[ccol].map(encoder)

    # Add Aggregate Features
    agg_cols = ['mean', 'std', 'min', 'max']
    for ccol in cat_col:
        grouped = trdf.groupby(ccol)
        stats = []
        for ncol in num_col:
            agg_df = grouped[ncol].agg(agg_cols)
            agg_df['abs_mean'] = agg_df['mean'].abs()
            agg_df['min_max'] = agg_df['min'] * agg_df['max']
            # The numeric column is part of the name: without it every numeric
            # column produced the same labels, and the merges either collided
            # into repeated *_x / *_y columns or failed outright.
            agg_df.columns = [f'{ccol}_{ncol}_{c}' for c in agg_df.columns]
            stats.append(agg_df)
        if not stats:
            continue
        # One merge per categorical column instead of one per (cat, num) pair.
        stats_df = pd.concat(stats, axis=1).reset_index()
        trdf = trdf.merge(stats_df, on=ccol, how='left')
        tedf = tedf.merge(stats_df, on=ccol, how='left')
    return trdf, tedf
def run_add_feat(train, test):
    """Build the full feature tables for train and test.

    Train and test are stacked so that time, wind and lag features are
    computed over the combined rows, then split again by position. Train rows
    containing any NaN are dropped before the aggregate features, which are
    derived from the remaining train rows, are added to both tables.

    Returns:
        ``(train_df, test_df)`` with all engineered columns.
    """
    cities = ('utsunomiya', 'chiba', 'tokyo')
    precipitation_cols = [f'precipitation_{city}' for city in cities]
    temperature_cols = [f'temperature_{city}' for city in cities]
    windspeed_cols = [f'windspeed_{city}' for city in cities]
    winddirection_cols = [f'winddirection_{city}' for city in cities]
    n_train = len(train)

    # Stack train and test so every step below sees all rows.
    df = pd.concat([train, test], ignore_index=True)

    # Basic time features.
    df = add_time_feat(df)

    # Hand-made features: sine of the wind-direction code (scaled by 2*pi/17)
    # multiplied by the wind speed of the same city.
    for direction, speed in zip(winddirection_cols, windspeed_cols):
        df[f'{direction}_{speed}'] = np.sin(df[direction] * (2 * np.pi / 17)) * df[speed]

    # Lag features, computed within each year.
    lag_source_cols = precipitation_cols + temperature_cols + windspeed_cols + winddirection_cols
    df = add_lag_feat(df, lag_source_cols, 'year')

    # Split back into train / test by position; drop train rows with missing values.
    train_df = df.iloc[:n_train].dropna().reset_index(drop=True)
    test_df = df.iloc[n_train:]

    # Aggregate features.
    cat_columns = ['year', 'month', 'day', 'hour'] + winddirection_cols
    num_columns = precipitation_cols + temperature_cols + windspeed_cols
    return additional_encoding(train_df, test_df, cat_columns, num_columns)
# Build both feature tables, then print the shape and show the first rows of each.
train_df, test_df = run_add_feat(train, test)
for feat_table in (train_df, test_df):
    print(feat_table.shape)
    display(feat_table.head(3))
(12148, 557)
datetime | precipitation_utsunomiya | precipitation_chiba | precipitation_tokyo | temperature_utsunomiya | temperature_chiba | temperature_tokyo | winddirection_utsunomiya | winddirection_chiba | winddirection_tokyo | windspeed_utsunomiya | windspeed_chiba | windspeed_tokyo | pollen_utsunomiya | pollen_chiba | pollen_tokyo | time | year | month | day | hour | weekday | day_of_year | day_sin | day_cos | winddirection_utsunomiya_windspeed_utsunomiya | winddirection_chiba_windspeed_chiba | winddirection_tokyo_windspeed_tokyo | shift1_precipitation_utsunomiya | shift1_precipitation_chiba | shift1_precipitation_tokyo | shift1_temperature_utsunomiya | shift1_temperature_chiba | shift1_temperature_tokyo | shift1_windspeed_utsunomiya | shift1_windspeed_chiba | shift1_windspeed_tokyo | shift1_winddirection_utsunomiya | shift1_winddirection_chiba | shift1_winddirection_tokyo | diff1_precipitation_utsunomiya | diff1_precipitation_chiba | diff1_precipitation_tokyo | diff1_temperature_utsunomiya | diff1_temperature_chiba | diff1_temperature_tokyo | diff1_windspeed_utsunomiya | diff1_windspeed_chiba | diff1_windspeed_tokyo | diff1_winddirection_utsunomiya | diff1_winddirection_chiba | diff1_winddirection_tokyo | shift2_precipitation_utsunomiya | shift2_precipitation_chiba | shift2_precipitation_tokyo | shift2_temperature_utsunomiya | shift2_temperature_chiba | shift2_temperature_tokyo | shift2_windspeed_utsunomiya | shift2_windspeed_chiba | shift2_windspeed_tokyo | shift2_winddirection_utsunomiya | shift2_winddirection_chiba | shift2_winddirection_tokyo | diff2_precipitation_utsunomiya | diff2_precipitation_chiba | diff2_precipitation_tokyo | diff2_temperature_utsunomiya | diff2_temperature_chiba | diff2_temperature_tokyo | diff2_windspeed_utsunomiya | diff2_windspeed_chiba | diff2_windspeed_tokyo | diff2_winddirection_utsunomiya | diff2_winddirection_chiba | diff2_winddirection_tokyo | shift3_precipitation_utsunomiya | shift3_precipitation_chiba | shift3_precipitation_tokyo | 
shift3_temperature_utsunomiya | shift3_temperature_chiba | shift3_temperature_tokyo | shift3_windspeed_utsunomiya | shift3_windspeed_chiba | shift3_windspeed_tokyo | shift3_winddirection_utsunomiya | shift3_winddirection_chiba | shift3_winddirection_tokyo | diff3_precipitation_utsunomiya | diff3_precipitation_chiba | diff3_precipitation_tokyo | diff3_temperature_utsunomiya | diff3_temperature_chiba | diff3_temperature_tokyo | diff3_windspeed_utsunomiya | diff3_windspeed_chiba | diff3_windspeed_tokyo | diff3_winddirection_utsunomiya | diff3_winddirection_chiba | diff3_winddirection_tokyo | ... | winddirection_chiba_min_y | winddirection_chiba_max_y | winddirection_chiba_abs_mean_y | winddirection_chiba_min_max_y | winddirection_chiba_mean_x | winddirection_chiba_std_x | winddirection_chiba_min_x | winddirection_chiba_max_x | winddirection_chiba_abs_mean_x | winddirection_chiba_min_max_x | winddirection_chiba_mean_y | winddirection_chiba_std_y | winddirection_chiba_min_y | winddirection_chiba_max_y | winddirection_chiba_abs_mean_y | winddirection_chiba_min_max_y | winddirection_chiba_mean_x | winddirection_chiba_std_x | winddirection_chiba_min_x | winddirection_chiba_max_x | winddirection_chiba_abs_mean_x | winddirection_chiba_min_max_x | winddirection_chiba_mean_y | winddirection_chiba_std_y | winddirection_chiba_min_y | winddirection_chiba_max_y | winddirection_chiba_abs_mean_y | winddirection_chiba_min_max_y | winddirection_chiba_mean_x | winddirection_chiba_std_x | winddirection_chiba_min_x | winddirection_chiba_max_x | winddirection_chiba_abs_mean_x | winddirection_chiba_min_max_x | winddirection_chiba_mean_y | winddirection_chiba_std_y | winddirection_chiba_min_y | winddirection_chiba_max_y | winddirection_chiba_abs_mean_y | winddirection_chiba_min_max_y | winddirection_chiba_mean | winddirection_chiba_std | winddirection_chiba_min | winddirection_chiba_max | winddirection_chiba_abs_mean | winddirection_chiba_min_max | winddirection_tokyo_mean_x | 
winddirection_tokyo_std_x | winddirection_tokyo_min_x | winddirection_tokyo_max_x | winddirection_tokyo_abs_mean_x | winddirection_tokyo_min_max_x | winddirection_tokyo_mean_y | winddirection_tokyo_std_y | winddirection_tokyo_min_y | winddirection_tokyo_max_y | winddirection_tokyo_abs_mean_y | winddirection_tokyo_min_max_y | winddirection_tokyo_mean_x | winddirection_tokyo_std_x | winddirection_tokyo_min_x | winddirection_tokyo_max_x | winddirection_tokyo_abs_mean_x | winddirection_tokyo_min_max_x | winddirection_tokyo_mean_y | winddirection_tokyo_std_y | winddirection_tokyo_min_y | winddirection_tokyo_max_y | winddirection_tokyo_abs_mean_y | winddirection_tokyo_min_max_y | winddirection_tokyo_mean_x | winddirection_tokyo_std_x | winddirection_tokyo_min_x | winddirection_tokyo_max_x | winddirection_tokyo_abs_mean_x | winddirection_tokyo_min_max_x | winddirection_tokyo_mean_y | winddirection_tokyo_std_y | winddirection_tokyo_min_y | winddirection_tokyo_max_y | winddirection_tokyo_abs_mean_y | winddirection_tokyo_min_max_y | winddirection_tokyo_mean_x | winddirection_tokyo_std_x | winddirection_tokyo_min_x | winddirection_tokyo_max_x | winddirection_tokyo_abs_mean_x | winddirection_tokyo_min_max_x | winddirection_tokyo_mean_y | winddirection_tokyo_std_y | winddirection_tokyo_min_y | winddirection_tokyo_max_y | winddirection_tokyo_abs_mean_y | winddirection_tokyo_min_max_y | winddirection_tokyo_mean | winddirection_tokyo_std | winddirection_tokyo_min | winddirection_tokyo_max | winddirection_tokyo_abs_mean | winddirection_tokyo_min_max | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2017020106 | 0.0 | 0.0 | 0.0 | -1.8 | 4.0 | 1.1 | 1 | 15 | 15 | 2.6 | 2.3 | 1.0 | 4.0 | 4.0 | 0.0 | 2017-02-01 | 2017 | 2 | 1 | 6 | 2 | 32 | 0.523416 | 0.852078 | 0.939228 | -1.549500 | -0.673696 | 0.0 | 0.0 | 0.0 | -1.2 | 4.1 | 1.5 | 3.7 | 3.4 | 0.9 | 2.0 | 14.0 | 14.0 | 0.0 | 0.0 | 0.0 | -0.6 | -0.1 | -0.4 | -1.1 | -1.1 | 0.1 | -1.0 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 | -1.1 | 4.4 | 1.8 | 4.1 | 3.1 | 1.4 | 1.0 | 15.0 | 1.0 | 0.0 | 0.0 | 0.0 | -0.7 | -0.4 | -0.7 | -1.5 | -0.8 | -0.4 | 0.0 | 0.0 | 14.0 | 0.0 | 0.0 | 0.0 | -0.7 | 4.2 | 2.4 | 4.0 | 1.7 | 0.6 | 1.0 | 15.0 | 16.0 | 0.0 | 0.0 | 0.0 | -1.1 | -0.2 | -1.3 | -1.4 | 0.6 | 0.4 | 0.0 | 0.0 | -1.0 | ... | 0.0 | 15.0 | 0.205139 | 0.0 | 0.152906 | 0.636507 | 0.0 | 8.5 | 0.152906 | 0.0 | 6.382730 | 6.217927 | -6.5 | 28.3 | 6.382730 | -183.95 | 9.277759 | 5.565696 | -0.9 | 28.6 | 9.277759 | -25.74 | 7.740775 | 5.979493 | -4.0 | 29.6 | 7.740775 | -118.40 | 3.165965 | 2.144589 | 0.1 | 12.9 | 3.165965 | 1.29 | 3.415754 | 1.668560 | 0.3 | 11.4 | 3.415754 | 3.42 | 1.376327 | 0.747432 | 0.0 | 4.1 | 1.376327 | 0.0 | 0.234670 | 0.812608 | 0.0 | 9.0 | 0.234670 | 0.0 | 0.369693 | 1.175232 | 0.0 | 13.5 | 0.369693 | 0.0 | 0.375590 | 1.554642 | 0.0 | 21.5 | 0.375590 | 0.0 | 8.883962 | 6.598716 | -3.8 | 27.1 | 8.883962 | -102.98 | 11.517571 | 6.004159 | -0.1 | 28.2 | 11.517571 | -2.82 | 10.191627 | 6.179070 | -2.3 | 29.0 | 10.191627 | -66.70 | 3.314151 | 2.200207 | 0.1 | 11.8 | 3.314151 | 1.18 | 3.909434 | 2.321321 | 0.2 | 15.3 | 3.909434 | 3.06 | 1.384552 | 0.718821 | 0.3 | 4.5 | 1.384552 | 1.35 |
1 | 2017020107 | 0.0 | 0.0 | 0.0 | -2.1 | 3.7 | 0.7 | 16 | 15 | 14 | 2.9 | 1.8 | 1.3 | 0.0 | 12.0 | 0.0 | 2017-02-01 | 2017 | 2 | 1 | 7 | 2 | 32 | 0.523416 | 0.852078 | -1.047601 | -1.212652 | -1.163712 | 0.0 | 0.0 | 0.0 | -1.8 | 4.0 | 1.1 | 2.6 | 2.3 | 1.0 | 1.0 | 15.0 | 15.0 | 0.0 | 0.0 | 0.0 | -0.3 | -0.3 | -0.4 | 0.3 | -0.5 | 0.3 | 15.0 | 0.0 | -1.0 | 0.0 | 0.0 | 0.0 | -1.2 | 4.1 | 1.5 | 3.7 | 3.4 | 0.9 | 2.0 | 14.0 | 14.0 | 0.0 | 0.0 | 0.0 | -0.9 | -0.4 | -0.8 | -0.8 | -1.6 | 0.4 | 14.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | -1.1 | 4.4 | 1.8 | 4.1 | 3.1 | 1.4 | 1.0 | 15.0 | 1.0 | 0.0 | 0.0 | 0.0 | -1.0 | -0.7 | -1.1 | -1.2 | -1.3 | -0.1 | 15.0 | 0.0 | 13.0 | ... | 0.0 | 15.0 | 0.205139 | 0.0 | 0.152906 | 0.636507 | 0.0 | 8.5 | 0.152906 | 0.0 | 6.382730 | 6.217927 | -6.5 | 28.3 | 6.382730 | -183.95 | 9.277759 | 5.565696 | -0.9 | 28.6 | 9.277759 | -25.74 | 7.740775 | 5.979493 | -4.0 | 29.6 | 7.740775 | -118.40 | 3.165965 | 2.144589 | 0.1 | 12.9 | 3.165965 | 1.29 | 3.415754 | 1.668560 | 0.3 | 11.4 | 3.415754 | 3.42 | 1.376327 | 0.747432 | 0.0 | 4.1 | 1.376327 | 0.0 | 0.221142 | 0.814820 | 0.0 | 11.5 | 0.221142 | 0.0 | 0.346902 | 1.430341 | 0.0 | 21.5 | 0.346902 | 0.0 | 0.351762 | 1.199612 | 0.0 | 14.0 | 0.351762 | 0.0 | 8.993439 | 6.411261 | -6.5 | 27.5 | 8.993439 | -178.75 | 11.705225 | 5.986661 | -1.4 | 28.8 | 11.705225 | -40.32 | 10.035966 | 6.177933 | -2.9 | 29.7 | 10.035966 | -86.13 | 3.147388 | 2.141114 | 0.1 | 13.9 | 3.147388 | 1.39 | 3.515796 | 2.327821 | 0.2 | 14.4 | 3.515796 | 2.88 | 1.365249 | 0.805476 | 0.3 | 5.1 | 1.365249 | 1.53 |
2 | 2017020108 | 0.0 | 0.0 | 0.0 | -0.2 | 3.9 | 2.0 | 16 | 14 | 14 | 2.2 | 2.3 | 1.0 | 8.0 | 16.0 | 8.0 | 2017-02-01 | 2017 | 2 | 1 | 8 | 2 | 32 | 0.523416 | 0.852078 | -0.794732 | -2.058876 | -0.895163 | 0.0 | 0.0 | 0.0 | -2.1 | 3.7 | 0.7 | 2.9 | 1.8 | 1.3 | 16.0 | 15.0 | 14.0 | 0.0 | 0.0 | 0.0 | 1.9 | 0.2 | 1.3 | -0.7 | 0.5 | -0.3 | 0.0 | -1.0 | 0.0 | 0.0 | 0.0 | 0.0 | -1.8 | 4.0 | 1.1 | 2.6 | 2.3 | 1.0 | 1.0 | 15.0 | 15.0 | 0.0 | 0.0 | 0.0 | 1.6 | -0.1 | 0.9 | -0.4 | 0.0 | 0.0 | 15.0 | -1.0 | -1.0 | 0.0 | 0.0 | 0.0 | -1.2 | 4.1 | 1.5 | 3.7 | 3.4 | 0.9 | 2.0 | 14.0 | 14.0 | 0.0 | 0.0 | 0.0 | 1.0 | -0.2 | 0.5 | -1.5 | -1.1 | 0.1 | 14.0 | 0.0 | 0.0 | ... | 0.0 | 7.5 | 0.050968 | 0.0 | 0.034839 | 0.251762 | 0.0 | 4.0 | 0.034839 | 0.0 | 7.310581 | 5.953827 | -4.6 | 27.8 | 7.310581 | -127.88 | 10.326323 | 5.332706 | -0.1 | 29.2 | 10.326323 | -2.92 | 8.772774 | 5.742079 | -4.3 | 30.1 | 8.772774 | -129.43 | 3.823226 | 2.716188 | 0.2 | 15.9 | 3.823226 | 3.18 | 4.774323 | 2.622473 | 0.3 | 13.2 | 4.774323 | 3.96 | 1.637032 | 0.915166 | 0.0 | 5.1 | 1.637032 | 0.0 | 0.221142 | 0.814820 | 0.0 | 11.5 | 0.221142 | 0.0 | 0.346902 | 1.430341 | 0.0 | 21.5 | 0.346902 | 0.0 | 0.351762 | 1.199612 | 0.0 | 14.0 | 0.351762 | 0.0 | 8.993439 | 6.411261 | -6.5 | 27.5 | 8.993439 | -178.75 | 11.705225 | 5.986661 | -1.4 | 28.8 | 11.705225 | -40.32 | 10.035966 | 6.177933 | -2.9 | 29.7 | 10.035966 | -86.13 | 3.147388 | 2.141114 | 0.1 | 13.9 | 3.147388 | 1.39 | 3.515796 | 2.327821 | 0.2 | 14.4 | 3.515796 | 2.88 | 1.365249 | 0.805476 | 0.3 | 5.1 | 1.365249 | 1.53 |
3 rows × 557 columns
(336, 557)
datetime | precipitation_utsunomiya | precipitation_chiba | precipitation_tokyo | temperature_utsunomiya | temperature_chiba | temperature_tokyo | winddirection_utsunomiya | winddirection_chiba | winddirection_tokyo | windspeed_utsunomiya | windspeed_chiba | windspeed_tokyo | pollen_utsunomiya | pollen_chiba | pollen_tokyo | time | year | month | day | hour | weekday | day_of_year | day_sin | day_cos | winddirection_utsunomiya_windspeed_utsunomiya | winddirection_chiba_windspeed_chiba | winddirection_tokyo_windspeed_tokyo | shift1_precipitation_utsunomiya | shift1_precipitation_chiba | shift1_precipitation_tokyo | shift1_temperature_utsunomiya | shift1_temperature_chiba | shift1_temperature_tokyo | shift1_windspeed_utsunomiya | shift1_windspeed_chiba | shift1_windspeed_tokyo | shift1_winddirection_utsunomiya | shift1_winddirection_chiba | shift1_winddirection_tokyo | diff1_precipitation_utsunomiya | diff1_precipitation_chiba | diff1_precipitation_tokyo | diff1_temperature_utsunomiya | diff1_temperature_chiba | diff1_temperature_tokyo | diff1_windspeed_utsunomiya | diff1_windspeed_chiba | diff1_windspeed_tokyo | diff1_winddirection_utsunomiya | diff1_winddirection_chiba | diff1_winddirection_tokyo | shift2_precipitation_utsunomiya | shift2_precipitation_chiba | shift2_precipitation_tokyo | shift2_temperature_utsunomiya | shift2_temperature_chiba | shift2_temperature_tokyo | shift2_windspeed_utsunomiya | shift2_windspeed_chiba | shift2_windspeed_tokyo | shift2_winddirection_utsunomiya | shift2_winddirection_chiba | shift2_winddirection_tokyo | diff2_precipitation_utsunomiya | diff2_precipitation_chiba | diff2_precipitation_tokyo | diff2_temperature_utsunomiya | diff2_temperature_chiba | diff2_temperature_tokyo | diff2_windspeed_utsunomiya | diff2_windspeed_chiba | diff2_windspeed_tokyo | diff2_winddirection_utsunomiya | diff2_winddirection_chiba | diff2_winddirection_tokyo | shift3_precipitation_utsunomiya | shift3_precipitation_chiba | shift3_precipitation_tokyo | 
shift3_temperature_utsunomiya | shift3_temperature_chiba | shift3_temperature_tokyo | shift3_windspeed_utsunomiya | shift3_windspeed_chiba | shift3_windspeed_tokyo | shift3_winddirection_utsunomiya | shift3_winddirection_chiba | shift3_winddirection_tokyo | diff3_precipitation_utsunomiya | diff3_precipitation_chiba | diff3_precipitation_tokyo | diff3_temperature_utsunomiya | diff3_temperature_chiba | diff3_temperature_tokyo | diff3_windspeed_utsunomiya | diff3_windspeed_chiba | diff3_windspeed_tokyo | diff3_winddirection_utsunomiya | diff3_winddirection_chiba | diff3_winddirection_tokyo | ... | winddirection_chiba_min_y | winddirection_chiba_max_y | winddirection_chiba_abs_mean_y | winddirection_chiba_min_max_y | winddirection_chiba_mean_x | winddirection_chiba_std_x | winddirection_chiba_min_x | winddirection_chiba_max_x | winddirection_chiba_abs_mean_x | winddirection_chiba_min_max_x | winddirection_chiba_mean_y | winddirection_chiba_std_y | winddirection_chiba_min_y | winddirection_chiba_max_y | winddirection_chiba_abs_mean_y | winddirection_chiba_min_max_y | winddirection_chiba_mean_x | winddirection_chiba_std_x | winddirection_chiba_min_x | winddirection_chiba_max_x | winddirection_chiba_abs_mean_x | winddirection_chiba_min_max_x | winddirection_chiba_mean_y | winddirection_chiba_std_y | winddirection_chiba_min_y | winddirection_chiba_max_y | winddirection_chiba_abs_mean_y | winddirection_chiba_min_max_y | winddirection_chiba_mean_x | winddirection_chiba_std_x | winddirection_chiba_min_x | winddirection_chiba_max_x | winddirection_chiba_abs_mean_x | winddirection_chiba_min_max_x | winddirection_chiba_mean_y | winddirection_chiba_std_y | winddirection_chiba_min_y | winddirection_chiba_max_y | winddirection_chiba_abs_mean_y | winddirection_chiba_min_max_y | winddirection_chiba_mean | winddirection_chiba_std | winddirection_chiba_min | winddirection_chiba_max | winddirection_chiba_abs_mean | winddirection_chiba_min_max | winddirection_tokyo_mean_x | 
winddirection_tokyo_std_x | winddirection_tokyo_min_x | winddirection_tokyo_max_x | winddirection_tokyo_abs_mean_x | winddirection_tokyo_min_max_x | winddirection_tokyo_mean_y | winddirection_tokyo_std_y | winddirection_tokyo_min_y | winddirection_tokyo_max_y | winddirection_tokyo_abs_mean_y | winddirection_tokyo_min_max_y | winddirection_tokyo_mean_x | winddirection_tokyo_std_x | winddirection_tokyo_min_x | winddirection_tokyo_max_x | winddirection_tokyo_abs_mean_x | winddirection_tokyo_min_max_x | winddirection_tokyo_mean_y | winddirection_tokyo_std_y | winddirection_tokyo_min_y | winddirection_tokyo_max_y | winddirection_tokyo_abs_mean_y | winddirection_tokyo_min_max_y | winddirection_tokyo_mean_x | winddirection_tokyo_std_x | winddirection_tokyo_min_x | winddirection_tokyo_max_x | winddirection_tokyo_abs_mean_x | winddirection_tokyo_min_max_x | winddirection_tokyo_mean_y | winddirection_tokyo_std_y | winddirection_tokyo_min_y | winddirection_tokyo_max_y | winddirection_tokyo_abs_mean_y | winddirection_tokyo_min_max_y | winddirection_tokyo_mean_x | winddirection_tokyo_std_x | winddirection_tokyo_min_x | winddirection_tokyo_max_x | winddirection_tokyo_abs_mean_x | winddirection_tokyo_min_max_x | winddirection_tokyo_mean_y | winddirection_tokyo_std_y | winddirection_tokyo_min_y | winddirection_tokyo_max_y | winddirection_tokyo_abs_mean_y | winddirection_tokyo_min_max_y | winddirection_tokyo_mean | winddirection_tokyo_std | winddirection_tokyo_min | winddirection_tokyo_max | winddirection_tokyo_abs_mean | winddirection_tokyo_min_max | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2020040101 | 0.0 | 0.0 | 0.0 | 9.5 | 10.5 | 9.0 | 14 | 2 | 14 | 2.1 | 2.3 | 1.2 | 0.0 | 0.0 | 0.0 | 2020-04-01 | 2020 | 4 | 1 | 1 | 2 | 91 | 0.999991 | 0.004304 | -1.879843 | 1.549500 | -1.074196 | 0.0 | 0.0 | 0.0 | 9.7 | 10.7 | 8.9 | 1.0 | 2.7 | 0.4 | 16.0 | 1.0 | 16.0 | 0.0 | 0.0 | 0.0 | -0.2 | -0.2 | 0.1 | 1.1 | -0.4 | 0.8 | -2.0 | 1.0 | -2.0 | 0.5 | 0.0 | 0.0 | 9.7 | 10.9 | 8.9 | 0.5 | 2.9 | 0.6 | 16.0 | 16.0 | 1.0 | -0.5 | 0.0 | 0.0 | -0.2 | -0.4 | 0.1 | 1.6 | -0.6 | 0.6 | -2.0 | -14.0 | 13.0 | 0.0 | 0.0 | 0.0 | 9.8 | 11.3 | 8.8 | 1.2 | 2.7 | 0.9 | 3.0 | 15.0 | 15.0 | 0.0 | 0.0 | 0.0 | -0.3 | -0.8 | 0.2 | 0.9 | -0.4 | 0.3 | 11.0 | -13.0 | -1.0 | ... | 0.0 | 11.5 | 0.209054 | 0.0 | 0.213123 | 0.830675 | 0.0 | 10.0 | 0.213123 | 0.0 | 11.105697 | 6.529016 | -6.1 | 27.5 | 11.105697 | -167.75 | 12.908850 | 5.756066 | -1.4 | 27.5 | 12.908850 | -38.5 | 12.245575 | 6.104280 | -3.0 | 28.4 | 12.245575 | -85.2 | 2.683316 | 1.556318 | 0.0 | 11.2 | 2.683316 | 0.00 | 3.287080 | 1.379972 | 0.3 | 8.2 | 3.287080 | 2.46 | 1.478332 | 0.817158 | 0.0 | 4.7 | 1.478332 | 0.0 | 0.221142 | 0.81482 | 0.0 | 11.5 | 0.221142 | 0.0 | 0.346902 | 1.430341 | 0.0 | 21.5 | 0.346902 | 0.0 | 0.351762 | 1.199612 | 0.0 | 14.0 | 0.351762 | 0.0 | 8.993439 | 6.411261 | -6.5 | 27.5 | 8.993439 | -178.75 | 11.705225 | 5.986661 | -1.4 | 28.8 | 11.705225 | -40.32 | 10.035966 | 6.177933 | -2.9 | 29.7 | 10.035966 | -86.13 | 3.147388 | 2.141114 | 0.1 | 13.9 | 3.147388 | 1.39 | 3.515796 | 2.327821 | 0.2 | 14.4 | 3.515796 | 2.88 | 1.365249 | 0.805476 | 0.3 | 5.1 | 1.365249 | 1.53 |
1 | 2020040102 | 0.0 | 0.0 | 0.0 | 9.2 | 10.3 | 9.0 | 2 | 16 | 14 | 1.4 | 2.7 | 0.8 | 0.0 | 0.0 | 0.0 | 2020-04-01 | 2020 | 4 | 1 | 2 | 2 | 91 | 0.999991 | 0.004304 | 0.943174 | -0.975352 | -0.716131 | 0.0 | 0.0 | 0.0 | 9.5 | 10.5 | 9.0 | 2.1 | 2.3 | 1.2 | 14.0 | 2.0 | 14.0 | 0.0 | 0.0 | 0.0 | -0.3 | -0.2 | 0.0 | -0.7 | 0.4 | -0.4 | -12.0 | 14.0 | 0.0 | 0.0 | 0.0 | 0.0 | 9.7 | 10.7 | 8.9 | 1.0 | 2.7 | 0.4 | 16.0 | 1.0 | 16.0 | 0.0 | 0.0 | 0.0 | -0.5 | -0.4 | 0.1 | 0.4 | 0.0 | 0.4 | -14.0 | 15.0 | -2.0 | 0.5 | 0.0 | 0.0 | 9.7 | 10.9 | 8.9 | 0.5 | 2.9 | 0.6 | 16.0 | 16.0 | 1.0 | -0.5 | 0.0 | 0.0 | -0.5 | -0.6 | 0.1 | 0.9 | -0.2 | 0.2 | -14.0 | 0.0 | 13.0 | ... | 0.0 | 9.5 | 0.292402 | 0.0 | 0.248982 | 0.832885 | 0.0 | 6.5 | 0.248982 | 0.0 | 7.495387 | 6.633393 | -4.7 | 27.5 | 7.495387 | -129.25 | 9.779376 | 5.937938 | -0.2 | 29.5 | 9.779376 | -5.9 | 8.547490 | 6.377572 | -4.0 | 30.6 | 8.547490 | -122.4 | 3.129715 | 1.955545 | 0.2 | 11.6 | 3.129715 | 2.32 | 2.229851 | 1.017832 | 0.3 | 7.1 | 2.229851 | 2.13 | 1.297015 | 0.680167 | 0.0 | 4.2 | 1.297015 | 0.0 | 0.221142 | 0.81482 | 0.0 | 11.5 | 0.221142 | 0.0 | 0.346902 | 1.430341 | 0.0 | 21.5 | 0.346902 | 0.0 | 0.351762 | 1.199612 | 0.0 | 14.0 | 0.351762 | 0.0 | 8.993439 | 6.411261 | -6.5 | 27.5 | 8.993439 | -178.75 | 11.705225 | 5.986661 | -1.4 | 28.8 | 11.705225 | -40.32 | 10.035966 | 6.177933 | -2.9 | 29.7 | 10.035966 | -86.13 | 3.147388 | 2.141114 | 0.1 | 13.9 | 3.147388 | 1.39 | 3.515796 | 2.327821 | 0.2 | 14.4 | 3.515796 | 2.88 | 1.365249 | 0.805476 | 0.3 | 5.1 | 1.365249 | 1.53 |
2 | 2020040103 | 0.0 | 0.0 | 0.0 | 9.2 | 10.2 | 9.1 | 16 | 16 | 12 | 3.3 | 2.5 | 0.5 | 0.0 | 0.0 | 0.0 | 2020-04-01 | 2020 | 4 | 1 | 3 | 2 | 91 | 0.999991 | 0.004304 | -1.192097 | -0.903104 | -0.480913 | 0.0 | 0.0 | 0.0 | 9.2 | 10.3 | 9.0 | 1.4 | 2.7 | 0.8 | 2.0 | 16.0 | 14.0 | 0.0 | 0.0 | 0.0 | 0.0 | -0.1 | 0.1 | 1.9 | -0.2 | -0.3 | 14.0 | 0.0 | -2.0 | 0.0 | 0.0 | 0.0 | 9.5 | 10.5 | 9.0 | 2.1 | 2.3 | 1.2 | 14.0 | 2.0 | 14.0 | 0.0 | 0.0 | 0.0 | -0.3 | -0.3 | 0.1 | 1.2 | 0.2 | -0.7 | 2.0 | 14.0 | -2.0 | 0.0 | 0.0 | 0.0 | 9.7 | 10.7 | 8.9 | 1.0 | 2.7 | 0.4 | 16.0 | 1.0 | 16.0 | 0.0 | 0.0 | 0.0 | -0.5 | -0.5 | 0.2 | 2.3 | -0.2 | 0.1 | 0.0 | 15.0 | -4.0 | ... | 0.0 | 9.5 | 0.292402 | 0.0 | 0.248982 | 0.832885 | 0.0 | 6.5 | 0.248982 | 0.0 | 7.495387 | 6.633393 | -4.7 | 27.5 | 7.495387 | -129.25 | 9.779376 | 5.937938 | -0.2 | 29.5 | 9.779376 | -5.9 | 8.547490 | 6.377572 | -4.0 | 30.6 | 8.547490 | -122.4 | 3.129715 | 1.955545 | 0.2 | 11.6 | 3.129715 | 2.32 | 2.229851 | 1.017832 | 0.3 | 7.1 | 2.229851 | 2.13 | 1.297015 | 0.680167 | 0.0 | 4.2 | 1.297015 | 0.0 | 0.031306 | 0.41737 | 0.0 | 9.0 | 0.031306 | 0.0 | 0.039356 | 0.504699 | 0.0 | 11.0 | 0.039356 | 0.0 | 0.075134 | 1.035953 | 0.0 | 23.5 | 0.075134 | 0.0 | 8.798748 | 7.142278 | -5.0 | 28.8 | 8.798748 | -144.00 | 12.214311 | 6.118142 | 0.0 | 27.1 | 12.214311 | 0.00 | 9.931664 | 6.817144 | -4.0 | 31.2 | 9.931664 | -124.80 | 2.359034 | 1.559301 | 0.1 | 11.8 | 2.359034 | 1.18 | 2.782826 | 2.078830 | 0.2 | 14.4 | 2.782826 | 2.88 | 1.005546 | 0.538471 | 0.3 | 4.8 | 1.005546 | 1.44 |
3 rows × 557 columns
from sklearn.metrics import mean_absolute_error as mae
import lightgbm as lgb
import os
import random
import tensorflow as tf
from tqdm.notebook import tqdm
import warnings
warnings.filterwarnings('ignore')
# param
# seed: random seed; plot_mode: value passed as `plot` when a ModelLgb is created.
seed, plot_mode = 42, False
def set_seed(seed):
    """Seed Python's ``random``, NumPy and TensorFlow, and set ``PYTHONHASHSEED``."""
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed, tf.random.set_seed):
        seeder(seed)
# LightGBM
class ModelLgb:
    """Small wrapper around ``lgb.train`` for MAE-scored regression.

    Attributes:
        model: the trained booster, or ``None`` before ``fit`` is called.
        plot: when true, ``fit`` draws a bar chart of the 30 largest feature
            importances after training.
    """

    def __init__(self, plot: bool):
        self.model = None
        self.plot = plot

    def fit(self, tr_x, tr_y, va_x=None, va_y=None):
        """Train the booster.

        With a validation set (``va_x`` is not ``None``) up to 10000 rounds
        are run with early stopping after 50 rounds without improvement;
        without one, exactly 100 rounds are run.

        ``tr_x`` must expose ``.columns`` when ``self.plot`` is true.
        The random seed is read from the module-level ``seed``.
        """
        params = {
            'objective':'regression',
            'boosting':'gbdt',
            'metric':'mae',
            'seed': seed,
            'verbosity':-1,
            'learning_rate':0.1,
        }
        num_round = 10000
        early_stopping_rounds = 50

        # `verbose_eval` was removed from lgb.train() in LightGBM 4.0 and
        # raises a TypeError there. Releases that provide the log_evaluation
        # callback (3.3+) no longer print per-round metrics once callbacks
        # are supplied, so the keyword is only forwarded to older releases.
        quiet_kwargs = {} if hasattr(lgb, 'log_evaluation') else {'verbose_eval': False}

        lgb_train = lgb.Dataset(tr_x, tr_y)
        if va_x is not None:
            # validation
            lgb_eval = lgb.Dataset(va_x, va_y)
            self.model = lgb.train(
                params, lgb_train, valid_sets=lgb_eval, num_boost_round=num_round,
                callbacks=[lgb.early_stopping(stopping_rounds=early_stopping_rounds, verbose=False)],
                **quiet_kwargs
            )
        else:
            # No validation
            self.model = lgb.train(params, lgb_train, num_boost_round=100, **quiet_kwargs)

        # plot feature importance
        if self.plot:
            f_importance = np.array(self.model.feature_importance())
            df_importance = pd.DataFrame({'feat':tr_x.columns, 'importance':f_importance})
            df_importance = df_importance.sort_values('importance', ascending=True)
            plt.figure(figsize=(8,12))
            # Ascending sort + last 30 rows = the 30 most important features.
            plt.barh('feat', 'importance', data=df_importance.iloc[-30:])
            plt.show()

    def predict(self, x):
        """Predict with the trained booster, using its ``best_iteration``."""
        pred = self.model.predict(x, num_iteration=self.model.best_iteration)
        return pred
# Histogram of each target (0-50 range, one bin per unit), one panel per target.
plot_col = ['pollen_utsunomiya', 'pollen_chiba', 'pollen_tokyo']
color = ['red','green','blue']
ncols = len(plot_col)
# squeeze=False keeps `axes` two-dimensional regardless of ncols.
fig, axes = plt.subplots(1, ncols, sharey=True, sharex=True, figsize=(30, 5), squeeze=False)
for i, col in enumerate(plot_col):
    ax = axes[0][i]
    # Plot the column being iterated; previously every panel drew 'pollen_chiba'.
    train_df[col].hist(ax=ax, range=(0,50), bins=50, alpha=1, color=color[i], label=col)
    ax.legend()
plt.show()
Targetの値はどれも4の倍数になっているため、予測後の後処理で4の倍数に揃えるとよさそう
単純な予測と4の倍数のみの提出スコアを比較
target_columns = ['pollen_utsunomiya', 'pollen_chiba', 'pollen_tokyo']
non_feature_columns = target_columns + ['datetime', 'time', 'year', 'month', 'weekday']

# Sub1 (simple prediction): train on target/4, average three year-wise models,
# then round and multiply by 4 so the submission holds multiples of 4.
results_sub1 = dict()
set_seed(seed)
for tcol in tqdm(target_columns):
    train_tmp = train_df.copy()
    test_tmp = test_df.copy()
    feature_columns = [c for c in train_tmp.columns if c not in non_feature_columns]
    test_x = test_tmp[feature_columns]
    test_preds = []
    for year in [2017, 2018, 2019]:
        # train: everything before April 1st of `year`; validation: April 1-14 of `year`
        is_before_april = train_tmp['datetime'] < year*1e6+40101
        is_early_april = (train_tmp['year']==year) & (train_tmp['month']==4) & (train_tmp['day']<15)
        tr_df = train_tmp[is_before_april]
        va_df = train_tmp[is_early_april]
        # training
        model = ModelLgb(plot=plot_mode)
        model.fit(tr_df[feature_columns], tr_df[tcol]/4, va_df[feature_columns], va_df[tcol]/4)
        # predict, flooring negative predictions at zero
        test_pred = model.predict(test_x).reshape(-1)
        test_preds.append(np.where(test_pred < 0, 0, test_pred))
    # average over the three models and keep per target
    results_sub1[tcol] = np.mean(test_preds, axis=0)
sub[target_columns] = pd.DataFrame(results_sub1).round()*4
display(sub.head(3))
sub.to_csv(os.path.join(OUTPUT, "sub1.csv"), index=False)

# Sub2 / Sub3 / Sub4: constant submissions of 0, 4 and 8 respectively
for file_name, constant in (("sub2.csv", 0), ("sub3.csv", 4), ("sub4.csv", 8)):
    sub[target_columns] = constant
    display(sub.head(3))
    sub.to_csv(os.path.join(OUTPUT, file_name), index=False)
0%| | 0/3 [00:00<?, ?it/s]
datetime | pollen_utsunomiya | pollen_chiba | pollen_tokyo | |
---|---|---|---|---|
0 | 2020040101 | 84.0 | 32.0 | 32.0 |
1 | 2020040102 | 80.0 | 36.0 | 32.0 |
2 | 2020040103 | 80.0 | 36.0 | 32.0 |
datetime | pollen_utsunomiya | pollen_chiba | pollen_tokyo | |
---|---|---|---|---|
0 | 2020040101 | 0 | 0 | 0 |
1 | 2020040102 | 0 | 0 | 0 |
2 | 2020040103 | 0 | 0 | 0 |
datetime | pollen_utsunomiya | pollen_chiba | pollen_tokyo | |
---|---|---|---|---|
0 | 2020040101 | 4 | 4 | 4 |
1 | 2020040102 | 4 | 4 | 4 |
2 | 2020040103 | 4 | 4 | 4 |
datetime | pollen_utsunomiya | pollen_chiba | pollen_tokyo | |
---|---|---|---|---|
0 | 2020040101 | 8 | 8 | 8 |
1 | 2020040102 | 8 | 8 | 8 |
2 | 2020040103 | 8 | 8 | 8 |
各ファイルのPublic LBスコアは以下のようになった
過去のデータからも学習ができるように補正して学習する方法を取ってみる
2017-2019から各4月前半を予測する際の補正値(4刻み)を変えていく -> 2020の予測から閾値を推定
# Target summary statistics: Feb-Mar of every year except 2020 vs. April 1-14.
# Masks are built from train_df itself rather than from `train_tmp`, a loop
# leftover of an earlier cell that may not exist when this cell runs on its own.
is_feb_mar = (train_df['year'] != 2020) & (train_df['month'].isin([2, 3]))
is_early_april = (train_df['month'] == 4) & (train_df['day'] < 15)
print('2-3月\n', train_df[is_feb_mar].describe()[target_columns])
print('4月前半\n', train_df[is_early_april].describe()[target_columns])
2-3月 pollen_utsunomiya pollen_chiba pollen_tokyo count 4185.000000 4185.000000 4185.000000 mean 170.580167 53.825806 44.210275 std 542.361827 157.080851 110.309489 min 0.000000 0.000000 0.000000 25% 8.000000 4.000000 4.000000 50% 28.000000 12.000000 12.000000 75% 122.000000 45.000000 36.000000 max 12193.000000 4141.000000 2209.000000 4月前半 pollen_utsunomiya pollen_chiba pollen_tokyo count 1008.000000 1008.000000 1008.000000 mean 152.183532 46.733135 46.874008 std 341.681206 104.334555 88.650295 min 0.000000 0.000000 0.000000 25% 20.000000 4.000000 4.000000 50% 57.000000 16.000000 16.000000 75% 151.000000 45.000000 49.000000 max 5629.000000 2119.000000 746.000000
# run
# For each candidate threshold q (4, 8, ..., 36) keep only train/validation rows
# whose target is <= q, fit one model per validation year, and score the
# predictions against the year-2020 rows of train_df with MAE.
target_columns = ['pollen_utsunomiya', 'pollen_chiba', 'pollen_tokyo']
q_range = np.arange(4, 40, 4).round()
scores = {name: [] for name in target_columns}
for q in tqdm(q_range):
    for tcol in target_columns:
        set_seed(seed)
        train_tmp = train_df.copy()
        feature_columns = [
            c for c in train_tmp.columns
            if c not in target_columns
            if c not in ['datetime', 'time', 'year', 'month', 'weekday']
        ]
        # The 2020 hold-out does not depend on the validation year.
        te_df = train_tmp[(train_tmp['year']==2020)]
        test_x = te_df[feature_columns]
        test_y = te_df[tcol]
        losses = []
        for year in [2017, 2018, 2019]:
            tr_df = train_tmp[train_tmp['datetime']<year*1e6+40101]
            va_df = train_tmp[(train_tmp['year']==year)&(train_tmp['month']==4)&(train_tmp['day']<15)]
            # threshold correction: drop rows above the candidate threshold
            qth = q
            tr_df = tr_df[tr_df[tcol] <= qth].reset_index(drop=True)
            va_df = va_df[va_df[tcol] <= qth].reset_index(drop=True)
            # training
            model = ModelLgb(plot=plot_mode)
            model.fit(tr_df[feature_columns], tr_df[tcol], va_df[feature_columns], va_df[tcol])
            # predict, flooring negative predictions at zero
            test_pred = model.predict(test_x)
            test_pred = np.where(test_pred < 0, 0, test_pred)
            # loss
            losses.append(mae(test_y, test_pred))
        # mean loss over the three validation years
        scores[tcol].append(np.mean(losses))
# table of mean MAE per threshold, minimum highlighted per column
df_qth = pd.DataFrame(scores).set_index(q_range)
display(df_qth.style.highlight_min())
# Line plot of mean MAE against threshold, one panel per target.
plot_col = ['pollen_utsunomiya', 'pollen_chiba', 'pollen_tokyo']
color = ['red','green','blue']
ncols = len(plot_col)
plt.subplots(1, ncols, sharey=True, sharex=True, figsize=(30, 5))
for panel, (col, line_color) in enumerate(zip(plot_col, color), start=1):
    plt.subplot(1, ncols, panel)
    plt.plot(df_qth.index, df_qth[col], alpha=1, color=line_color, label=col)
    plt.xlabel(col)
    plt.legend()
    plt.grid()
plt.show()
0%| | 0/9 [00:00<?, ?it/s]
pollen_utsunomiya | pollen_chiba | pollen_tokyo | |
---|---|---|---|
4 | 25.270670 | 16.169495 | 12.431046 |
8 | 24.584037 | 15.180790 | 11.975406 |
12 | 23.421559 | 14.710188 | 11.983310 |
16 | 23.221837 | 14.546360 | 12.066856 |
20 | 22.754685 | 14.516001 | 12.376283 |
24 | 22.371299 | 14.574958 | 12.675389 |
28 | 22.328795 | 14.691928 | 13.030755 |
32 | 22.505439 | 14.739105 | 13.535812 |
36 | 22.616658 | 14.950690 | 13.818950 |
# Per-target threshold, chosen taking the public leaderboard into account as well.
# Rows whose target exceeds this value are dropped from train/validation in the run below.
vq = {'pollen_utsunomiya':20, 'pollen_chiba':20, 'pollen_tokyo':8}
testと同じ期間を時系列順に検証データとする
# run
# Year-wise validation on April 1-14 of 2017-2019 with the per-target threshold
# from `vq`; test predictions of the three models are averaged per target.
target_columns = ['pollen_utsunomiya', 'pollen_chiba', 'pollen_tokyo']
plot_mode = True
results = dict()
score = []
set_seed(seed)
for tcol in tqdm(target_columns):
    print('='*10+tcol+'='*10)
    train_tmp = train_df.copy()
    test_tmp = test_df.copy()
    feature_columns = [
        c for c in train_tmp.columns
        if c not in target_columns
        if c not in ['datetime', 'time', 'year', 'month', 'weekday']
    ]
    test_x = test_tmp[feature_columns]
    # threshold correction value for this target
    qth = vq[tcol]
    va_preds, test_preds, losses = [], [], []
    for year in [2017, 2018, 2019]:
        print(f'<year : {year}>')
        tr_df = train_tmp[train_tmp['datetime']<year*1e6+40101]
        va_df = train_tmp[(train_tmp['year']==year)&(train_tmp['month']==4)&(train_tmp['day']<15)]
        # drop rows whose target exceeds the threshold
        tr_df = tr_df[tr_df[tcol] <= qth].reset_index(drop=True)
        va_df = va_df[va_df[tcol] <= qth].reset_index(drop=True)
        # train / validation (targets are modelled in units of 4)
        tr_x, tr_y = tr_df[feature_columns], tr_df[tcol]/4
        va_x, va_y = va_df[feature_columns], va_df[tcol]/4
        # training
        model = ModelLgb(plot=plot_mode)
        model.fit(tr_x, tr_y, va_x, va_y)
        # valid / test predict, flooring negative predictions at zero
        va_pred = model.predict(va_x).reshape(-1)
        va_pred = np.where(va_pred < 0, 0, va_pred)
        va_preds.append(va_pred)
        test_pred = model.predict(test_x).reshape(-1)
        test_pred = np.where(test_pred < 0, 0, test_pred)
        test_preds.append(test_pred)
        # valid loss
        va_loss = mae(va_y.values, va_pred)
        print(f'LOSS : {va_loss}')
        losses.append(va_loss)
        # plot valid / pred
        if plot_mode:
            plt.figure(figsize=(10,5))
            plt.plot(va_y.values, label='original', linestyle='-')
            plt.plot(va_pred, label='pred', linestyle='-')
            plt.title(f'{tcol} : {va_loss}')
            plt.legend()
            plt.show()
    # mean loss
    mean_loss = np.mean(losses)
    print(f'Mean LOSS : {mean_loss}\n')
    # save per target: average of the three test predictions
    results[tcol] = np.mean(test_preds, axis=0)
    score.append(mean_loss)
# score
print(f'Score : {np.array(score).mean()}')
# Score : 1.11005112558556
0%| | 0/3 [00:00<?, ?it/s]
==========pollen_utsunomiya========== <year : 2017>
LOSS : 1.1938153630467006
<year : 2018>
LOSS : 1.4568659271147195
<year : 2019>
LOSS : 1.5079497032585705
Mean LOSS : 1.3862103311399967 ==========pollen_chiba========== <year : 2017>
LOSS : 1.0321580800023542
<year : 2018>
LOSS : 1.3894891307069823
<year : 2019>
LOSS : 1.3890817102532345
Mean LOSS : 1.2702429736541903 ==========pollen_tokyo========== <year : 2017>
LOSS : 0.624015822869251
<year : 2018>
LOSS : 0.7684184340492826
<year : 2019>
LOSS : 0.6286659589689456
Mean LOSS : 0.673700071962493 Score : 1.11005112558556
# Post-processing: round the predictions (made in units of 4) and scale back,
# so every submitted value is a multiple of 4.
results_df = pd.DataFrame(results).round()*4
sub[target_columns] = results_df
display(sub)
submission_path = os.path.join(OUTPUT, f"{exp_name}.csv")
sub.to_csv(submission_path, index=False)
datetime | pollen_utsunomiya | pollen_chiba | pollen_tokyo | |
---|---|---|---|---|
0 | 2020040101 | 4.0 | 8.0 | 4.0 |
1 | 2020040102 | 8.0 | 8.0 | 4.0 |
2 | 2020040103 | 8.0 | 8.0 | 4.0 |
3 | 2020040104 | 8.0 | 8.0 | 4.0 |
4 | 2020040105 | 8.0 | 8.0 | 4.0 |
... | ... | ... | ... | ... |
331 | 2020041420 | 8.0 | 8.0 | 4.0 |
332 | 2020041421 | 8.0 | 8.0 | 4.0 |
333 | 2020041422 | 8.0 | 8.0 | 4.0 |
334 | 2020041423 | 8.0 | 8.0 | 4.0 |
335 | 2020041424 | 8.0 | 4.0 | 4.0 |
336 rows × 4 columns
(train_df[(train_df['year']==2020)][target_columns].quantile(q)
での調整でq=0.6-0.7程度など)
kotrying
データが更新(v2)されたので、とりあえず動く状態にするために追加でこちらをお試しください。
異常値の対応
こちらのトピックにあるように、v2では異常値が存在し、それぞれの数値には降雪や黄砂などの意味があるようです。
https://comp.probspace.com/competitions/pollen_counts/discussions/tanuking0-Post352b748b3b6d376b837e
前処理
上記部分を削除して以下に変更
補正値
全て16に設定しました。
モデル部分
実行部分