Second competition co-hosted with Ridgelinez! Take on pollen forecasting!
kotrying
Organize the directories as follows:
MyDrive
└<pollen_counts>
  ├<notebook>
  │ └run.ipynb
  ├<input>
  │ ├train.csv
  │ ├submission.csv
  │ └test.csv
  └<output>
# Library
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from tqdm.auto import tqdm
import warnings
warnings.simplefilter('ignore')

# mount
from google.colab import drive
if not os.path.isdir('/content/drive'):
    drive.mount('/content/drive')
Mounted at /content/drive
# Config
DRIVE_PATH = "/content/drive/MyDrive/ML/PROBSPACE/pollen_counts"
INPUT = os.path.join(DRIVE_PATH, "input")
OUTPUT = os.path.join(DRIVE_PATH, "output")
TRAIN_FILE = os.path.join(INPUT, "train.csv")
TEST_FILE = os.path.join(INPUT, "test.csv")
SUB_FILE = os.path.join(INPUT, "submission.csv")

exp_name = 'exp000'
seed = 42

# plot style
pd.set_option('display.max_columns', 200)
plt.rcParams['axes.facecolor'] = '#EEFFFE'  # hex color strings need a leading '#'
# Data
train = pd.read_csv(TRAIN_FILE)
test = pd.read_csv(TEST_FILE)
sub = pd.read_csv(SUB_FILE)
The following is based on the processing from the EDA post: https://comp.probspace.com/competitions/pollen_counts/discussions/kotrying-Post50e89902d5f42593d900
# object columns (the string '欠測' = missing) -> float
import lightgbm as lgb
from lightgbm import LGBMRegressor
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

train_df = train.replace('欠測', np.nan)
lgb_imp = IterativeImputer(
    estimator=LGBMRegressor(n_estimators=1000, random_state=seed),  # n_estimators is the sklearn-API name for num_boost_round
    max_iter=10,
    initial_strategy='mean',
    imputation_order='ascending',
    verbose=1,
    random_state=seed)
train_df = pd.DataFrame(lgb_imp.fit_transform(train_df), columns=train_df.columns)

# restore the original precision of each imputed column
train_df[['winddirection_chiba', 'winddirection_tokyo']] = \
    train_df[['winddirection_chiba', 'winddirection_tokyo']].round().astype(int)
train_df[['precipitation_tokyo', 'temperature_chiba', 'temperature_tokyo',
          'windspeed_chiba', 'windspeed_tokyo']] = \
    train_df[['precipitation_tokyo', 'temperature_chiba', 'temperature_tokyo',
              'windspeed_chiba', 'windspeed_tokyo']].round(1)

# write the imputed values back into the original frame
train[train.select_dtypes(object).columns] = train_df[train.select_dtypes(object).columns]
train.head(3)
[IterativeImputer] Completing matrix with shape (12168, 16)
[IterativeImputer] Change: 8.828120105598833, scaled tolerance: 2020033.124
[IterativeImputer] Early stopping criterion reached.
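As a quick sanity check (a minimal sketch, not part of the original notebook), you can confirm that the imputation left no missing values behind and that the formerly object-typed columns are now numeric:

# sanity check (illustrative): no '欠測' strings or NaNs should remain
assert not train.isin(['欠測']).any().any()
assert train.isna().sum().sum() == 0
print(train.dtypes.value_counts())  # expect only numeric dtypes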
# Basic time features
def add_time_feat(df):
    df['time'] = pd.to_datetime(df.datetime.astype(str).str[:-2])
    df['year'] = df['time'].dt.year
    df['month'] = df['time'].dt.month
    df['day'] = df['time'].dt.day
    df['hour'] = df.datetime.astype(str).str[-2:].astype(int)
    df['weekday'] = df['time'].dt.weekday
    df['day_of_year'] = df['time'].dt.dayofyear
    # align 2020 (a leap year) with the other years after Feb 29
    df['day_of_year'] = df.apply(
        lambda x: x['day_of_year'] - 1 if (x['time'] > pd.Timestamp('2020-02-29')) else x['day_of_year'],
        axis=1)
    df['day_sin'] = np.sin(df['day_of_year'] * (2 * np.pi / 365))
    df['day_cos'] = np.cos(df['day_of_year'] * (2 * np.pi / 365))
    return df

# Lag features
def add_lag_feat(df, feat: list, group: str):
    outputs = [df]
    grp_df = df.groupby(group)
    for lag in [1, 2, 3, 4, 5]:
        # shift
        outputs.append(grp_df[feat].shift(lag).add_prefix(f'shift{lag}_'))
        # diff
        outputs.append(grp_df[feat].diff(lag).add_prefix(f'diff{lag}_'))
    # rolling
    for window in [3, 24]:
        tmp_df = grp_df[feat].rolling(window, min_periods=1)
        tmp_df = tmp_df.mean().add_prefix(f'rolling{window}_mean_')
        outputs.append(tmp_df.reset_index(drop=True))
    return pd.concat(outputs, axis=1)

# Aggregate features
def additional_encoding(train, test, cat_col: list, num_col: list):
    trdf = train.copy()
    tedf = test.copy()
    # Count Encoding (fitted on the first half of April, the same period as test)
    for ccol in cat_col:
        encoder = trdf[(trdf['month'] == 4) & (trdf['day'] < 15)][ccol].value_counts()
        trdf[f'ce_{ccol}'] = trdf[ccol].map(encoder)
        tedf[f'ce_{ccol}'] = tedf[ccol].map(encoder)
    # Add Aggregate Features
    # note: column names repeat across num_col, so pandas disambiguates
    # them with _x/_y suffixes when merging
    agg_cols = ['mean', 'std', 'min', 'max']
    for ccol in cat_col:
        for ncol in num_col:
            agg_df = trdf.groupby(ccol)[ncol].agg(agg_cols)
            agg_df['abs_mean'] = np.abs(agg_df['mean'])
            agg_df['min_max'] = agg_df['min'] * agg_df['max']
            agg_df.columns = [f'{ccol}_{c}' for c in agg_df.columns]
            trdf = trdf.merge(agg_df, on=ccol, how='left')
            tedf = tedf.merge(agg_df, on=ccol, how='left')
    return trdf, tedf
def run_add_feat(train, test):
    # concatenate so features are computed over the full data
    df = pd.concat([train, test]).reset_index(drop=True)

    # basic time features
    df = add_time_feat(df)

    # manual features: sine of the 16-sector wind direction, scaled by wind speed
    windd_col = ['winddirection_utsunomiya', 'winddirection_chiba', 'winddirection_tokyo']
    winds_col = ['windspeed_utsunomiya', 'windspeed_chiba', 'windspeed_tokyo']
    for d, s in zip(windd_col, winds_col):
        df[f'{d}_{s}'] = np.sin(df[windd_col] * (2 * np.pi / 17))[d] * df[s]
    precipitation_col = ['precipitation_utsunomiya', 'precipitation_chiba', 'precipitation_tokyo']

    # lag features
    feat = [
        'precipitation_utsunomiya', 'precipitation_chiba', 'precipitation_tokyo',
        'temperature_utsunomiya', 'temperature_chiba', 'temperature_tokyo',
        'windspeed_utsunomiya', 'windspeed_chiba', 'windspeed_tokyo',
        'winddirection_utsunomiya', 'winddirection_chiba', 'winddirection_tokyo'
    ]
    df = add_lag_feat(df, feat, 'year')

    # re-split into train/test and drop rows left incomplete by the lags
    train_df = df[:len(train)]
    test_df = df[len(train):]
    train_df = train_df.dropna().reset_index(drop=True)

    # aggregate features
    cat_columns = ['year', 'month', 'day', 'hour',
                   'winddirection_utsunomiya', 'winddirection_chiba', 'winddirection_tokyo']
    num_columns = ['precipitation_utsunomiya', 'precipitation_chiba', 'precipitation_tokyo',
                   'temperature_utsunomiya', 'temperature_chiba', 'temperature_tokyo',
                   'windspeed_utsunomiya', 'windspeed_chiba', 'windspeed_tokyo']
    train_df, test_df = additional_encoding(train_df, test_df, cat_columns, num_columns)
    return train_df, test_df

train_df, test_df = run_add_feat(train, test)
print(train_df.shape)
display(train_df.head(3))
print(test_df.shape)
display(test_df.head(3))
(12148, 557)
3 rows × 557 columns
(336, 557)
from sklearn.metrics import mean_absolute_error as mae
import lightgbm as lgb
import os
import random
import tensorflow as tf
from tqdm.notebook import tqdm
import warnings
warnings.filterwarnings('ignore')

# param
seed = 42
plot_mode = False

def set_seed(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)
# LightGBM
class ModelLgb:
    def __init__(self, plot: bool):
        self.model = None
        self.plot = plot

    def fit(self, tr_x, tr_y, va_x=None, va_y=None):
        params = {
            'objective': 'regression',
            'boosting': 'gbdt',
            'metric': 'mae',
            'seed': seed,
            'verbosity': -1,
            'learning_rate': 0.1,
        }
        num_round = 10000
        early_stopping_rounds = 50
        # validation
        if va_x is not None:
            lgb_train = lgb.Dataset(tr_x, tr_y)
            lgb_eval = lgb.Dataset(va_x, va_y)
            self.model = lgb.train(
                params, lgb_train,
                valid_sets=lgb_eval,
                num_boost_round=num_round,
                # log_evaluation(0) silences per-round logs (verbose_eval
                # was removed in recent LightGBM versions)
                callbacks=[lgb.early_stopping(stopping_rounds=early_stopping_rounds, verbose=False),
                           lgb.log_evaluation(0)])
        else:
            # No validation
            lgb_train = lgb.Dataset(tr_x, tr_y)
            self.model = lgb.train(params, lgb_train, num_boost_round=100,
                                   callbacks=[lgb.log_evaluation(0)])
        # plot feature importance
        if self.plot:
            f_importance = np.array(self.model.feature_importance())
            df_importance = pd.DataFrame({'feat': tr_x.columns, 'importance': f_importance})
            df_importance = df_importance.sort_values('importance', ascending=True)
            plt.figure(figsize=(8, 12))
            plt.barh('feat', 'importance', data=df_importance.iloc[-30:])
            plt.show()

    def predict(self, x):
        pred = self.model.predict(x, num_iteration=self.model.best_iteration)
        return pred
plot_col = ['pollen_utsunomiya', 'pollen_chiba', 'pollen_tokyo']
color = ['red', 'green', 'blue']
ncols = len(plot_col)
plt.subplots(1, ncols, sharey=True, sharex=True, figsize=(30, 5))
plt.grid()
for i, col in enumerate(plot_col):
    plt.subplot(1, ncols, i + 1)
    # plot each target in its own panel (the original plotted
    # 'pollen_chiba' in every panel despite the per-target labels)
    train_df[col].hist(range=(0, 50), bins=50, alpha=1, color=color[i], label=col)
    plt.legend()
plt.show()
Every target value is a multiple of 4, so it looks worthwhile to snap predictions to multiples of 4 as a post-processing step.
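As a minimal sketch of that post-processing (the helper name round_to_multiple is hypothetical, not from the notebook):

import numpy as np

def round_to_multiple(pred, base=4):
    # clip negatives to zero, then snap to the nearest multiple of `base`
    pred = np.clip(np.asarray(pred, dtype=float), 0, None)
    return (np.round(pred / base) * base).astype(int)

print(round_to_multiple([1.2, 5.9, 9.7]))  # -> [0 4 8]

The notebook achieves the same effect by training on the target divided by 4, rounding the prediction, and multiplying back by 4.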
Compare the submission scores of a plain prediction against constant submissions of multiples of 4.
target_columns = ['pollen_utsunomiya', 'pollen_chiba', 'pollen_tokyo']

# Sub1 (plain prediction)
results_sub1 = dict()
set_seed(seed)
for tcol in tqdm(target_columns):
    train_tmp = train_df.copy()
    test_tmp = test_df.copy()
    va_preds = []
    test_preds = []
    losses = []
    for i, year in enumerate([2017, 2018, 2019]):
        tr_df = train_tmp[train_tmp['datetime'] < year * 1e6 + 40101]
        va_df = train_tmp[(train_tmp['year'] == year) & (train_tmp['month'] == 4) & (train_tmp['day'] < 15)]
        feature_columns = [c for c in tr_df.columns
                           if c not in target_columns
                           if c not in ['datetime', 'time', 'year', 'month', 'weekday']]
        # train / validation / test
        tr_x = tr_df[feature_columns]
        tr_y = tr_df[tcol] / 4
        va_x = va_df[feature_columns]
        va_y = va_df[tcol] / 4
        test_x = test_tmp[feature_columns]
        # training
        model = ModelLgb(plot=plot_mode)
        model.fit(tr_x, tr_y, va_x, va_y)
        # predict
        test_pred = model.predict(test_x).reshape(-1)
        test_pred = np.where(test_pred < 0, 0, test_pred)  # post-processing
        test_preds.append(test_pred)
    # preds
    preds = np.mean(test_preds, axis=0)
    # save per target
    results_sub1[tcol] = preds

sub[target_columns] = pd.DataFrame(results_sub1).round() * 4
display(sub.head(3))
sub.to_csv(os.path.join(OUTPUT, "sub1.csv"), index=False)

# Sub2 (all 0)
sub[target_columns] = 0
display(sub.head(3))
sub.to_csv(os.path.join(OUTPUT, "sub2.csv"), index=False)

# Sub3 (all 4)
sub[target_columns] = 4
display(sub.head(3))
sub.to_csv(os.path.join(OUTPUT, "sub3.csv"), index=False)

# Sub4 (all 8)
sub[target_columns] = 8
display(sub.head(3))
sub.to_csv(os.path.join(OUTPUT, "sub4.csv"), index=False)
The Public LB scores of these files came out as follows.
Next, try a method that corrects the training data so the model can also learn from past data: vary the correction value (in steps of 4) when predicting the first half of April for each of 2017-2019 -> estimate the best threshold from the predictions on 2020.
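A minimal sketch of what this "correction" does (the threshold value here is illustrative; the search loop below scans multiples of 4):

# drop training rows whose target exceeds a candidate threshold qth,
# so rare extreme pollen spikes do not dominate the fit
qth = 20  # illustrative candidate
clipped = train_df[train_df['pollen_utsunomiya'] <= qth]
print(f'{len(train_df)} rows -> {len(clipped)} rows after thresholding')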
print('Feb-Mar\n', train_df[(train_df['year'] != 2020) & (train_df['month'].isin([2, 3]))].describe()[target_columns])
print('Apr 1-14\n', train_df[(train_df['month'] == 4) & (train_df['day'] < 15)].describe()[target_columns])
Feb-Mar
        pollen_utsunomiya  pollen_chiba  pollen_tokyo
count         4185.000000   4185.000000   4185.000000
mean           170.580167     53.825806     44.210275
std            542.361827    157.080851    110.309489
min              0.000000      0.000000      0.000000
25%              8.000000      4.000000      4.000000
50%             28.000000     12.000000     12.000000
75%            122.000000     45.000000     36.000000
max          12193.000000   4141.000000   2209.000000
Apr 1-14
        pollen_utsunomiya  pollen_chiba  pollen_tokyo
count         1008.000000   1008.000000   1008.000000
mean           152.183532     46.733135     46.874008
std            341.681206    104.334555     88.650295
min              0.000000      0.000000      0.000000
25%             20.000000      4.000000      4.000000
50%             57.000000     16.000000     16.000000
75%            151.000000     45.000000     49.000000
max           5629.000000   2119.000000    746.000000
# run
target_columns = ['pollen_utsunomiya', 'pollen_chiba', 'pollen_tokyo']
q_range = np.arange(4, 40, 4).round()
scores = {'pollen_utsunomiya': [], 'pollen_chiba': [], 'pollen_tokyo': []}
for q in tqdm(q_range):
    score = []
    for tcol in target_columns:
        set_seed(seed)
        train_tmp = train_df.copy()
        feature_columns = [c for c in train_tmp.columns
                           if c not in target_columns
                           if c not in ['datetime', 'time', 'year', 'month', 'weekday']]
        test_preds = []
        losses = []
        for i, year in enumerate([2017, 2018, 2019]):
            tr_df = train_tmp[train_tmp['datetime'] < year * 1e6 + 40101]
            va_df = train_tmp[(train_tmp['year'] == year) & (train_tmp['month'] == 4) & (train_tmp['day'] < 15)]
            te_df = train_tmp[(train_tmp['year'] == 2020)]
            # correction: drop rows whose target exceeds the threshold
            qth = q
            tr_df = tr_df[tr_df[tcol] <= qth].reset_index(drop=True)
            va_df = va_df[va_df[tcol] <= qth].reset_index(drop=True)
            # train / validation / test
            tr_x = tr_df[feature_columns]
            tr_y = tr_df[tcol]
            va_x = va_df[feature_columns]
            va_y = va_df[tcol]
            test_x = te_df[feature_columns]
            test_y = te_df[tcol]
            # training
            model = ModelLgb(plot=plot_mode)
            model.fit(tr_x, tr_y, va_x, va_y)
            # predict
            test_pred = model.predict(test_x)
            test_pred = np.where(test_pred < 0, 0, test_pred)  # post-processing
            test_preds.append(test_pred)
            # loss
            test_loss = mae(test_y, test_pred)
            losses.append(test_loss)
        # mean loss
        mean_loss = np.mean(losses)
        scores[tcol].append(mean_loss)

# plot
df_qth = pd.DataFrame(scores).set_index(q_range)
display(df_qth.style.highlight_min())

plot_col = ['pollen_utsunomiya', 'pollen_chiba', 'pollen_tokyo']
color = ['red', 'green', 'blue']
ncols = len(plot_col)
plt.subplots(1, ncols, sharey=True, sharex=True, figsize=(30, 5))
for i, col in enumerate(plot_col):
    plt.subplot(1, ncols, i + 1)
    plt.plot(df_qth.index, df_qth[col], alpha=1, color=color[i], label=col)
    plt.xlabel(col)
    plt.legend()
    plt.grid()
plt.show()
# thresholds chosen with the LB also taken into account
vq = {'pollen_utsunomiya': 20, 'pollen_chiba': 20, 'pollen_tokyo': 8}
Use the same period as the test set, in chronological order, as validation data.
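To make the split concrete, here is a small sketch (print statements only, assuming the train_df built above) of the expanding-window scheme the run below uses:

# expanding-window validation: train on everything before April 1 of each
# year, validate on April 1-14 of that year (the same period as the test set)
for year in [2017, 2018, 2019]:
    tr_mask = train_df['datetime'] < year * 1e6 + 40101
    va_mask = (train_df['year'] == year) & (train_df['month'] == 4) & (train_df['day'] < 15)
    print(year, 'train rows:', tr_mask.sum(), 'valid rows:', va_mask.sum())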
# run
target_columns = ['pollen_utsunomiya', 'pollen_chiba', 'pollen_tokyo']
plot_mode = True
results = dict()
score = []
set_seed(seed)
for tcol in tqdm(target_columns):
    print('=' * 10 + tcol + '=' * 10)
    train_tmp = train_df.copy()
    test_tmp = test_df.copy()
    va_preds = []
    test_preds = []
    losses = []
    for i, year in enumerate([2017, 2018, 2019]):
        print(f'<year : {year}>')
        tr_df = train_tmp[train_tmp['datetime'] < year * 1e6 + 40101]
        va_df = train_tmp[(train_tmp['year'] == year) & (train_tmp['month'] == 4) & (train_tmp['day'] < 15)]
        feature_columns = [c for c in tr_df.columns
                           if c not in target_columns
                           if c not in ['datetime', 'time', 'year', 'month', 'weekday']]
        # correction: clip train/validation rows at the chosen threshold
        qth = vq[tcol]
        tr_df = tr_df[tr_df[tcol] <= qth].reset_index(drop=True)
        va_df = va_df[va_df[tcol] <= qth].reset_index(drop=True)
        # train / validation / test
        tr_x = tr_df[feature_columns]
        tr_y = tr_df[tcol] / 4
        va_x = va_df[feature_columns]
        va_y = va_df[tcol] / 4
        test_x = test_tmp[feature_columns]
        # training
        model = ModelLgb(plot=plot_mode)
        model.fit(tr_x, tr_y, va_x, va_y)
        # valid / test predict
        va_pred = model.predict(va_x).reshape(-1)
        va_pred = np.where(va_pred < 0, 0, va_pred)  # post-processing
        va_preds.append(va_pred)
        test_pred = model.predict(test_x).reshape(-1)
        test_pred = np.where(test_pred < 0, 0, test_pred)  # post-processing
        test_preds.append(test_pred)
        # valid loss
        va_loss = mae(va_y.values, va_pred)
        print(f'LOSS : {va_loss}')
        losses.append(va_loss)
        # plot valid / pred
        if plot_mode:
            plt.figure(figsize=(10, 5))
            plt.plot(va_y.values, label='original', linestyle='-')
            plt.plot(va_pred, label='pred', linestyle='-')
            plt.title(f'{tcol} : {va_loss}')
            plt.legend()
            plt.show()
    # preds
    preds = np.mean(test_preds, axis=0)
    # mean loss
    mean_loss = np.mean(losses)
    print(f'Mean LOSS : {mean_loss}\n')
    # save per target
    results[tcol] = preds
    score.append(mean_loss)

# score
print(f'Score : {np.array(score).mean()}')
# Score : 1.11005112558556
==========pollen_utsunomiya==========
<year : 2017>
LOSS : 1.1938153630467006
<year : 2018>
LOSS : 1.4568659271147195
<year : 2019>
LOSS : 1.5079497032585705
Mean LOSS : 1.3862103311399967

==========pollen_chiba==========
<year : 2017>
LOSS : 1.0321580800023542
<year : 2018>
LOSS : 1.3894891307069823
<year : 2019>
LOSS : 1.3890817102532345
Mean LOSS : 1.2702429736541903

==========pollen_tokyo==========
<year : 2017>