EDA & Model

Library & Data

# Library
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from tqdm.auto import tqdm
import statsmodels
import statsmodels.api as sm

import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_pinball_loss
from lightgbm import LGBMRegressor

import warnings
warnings.simplefilter('ignore')

# mount
from google.colab import drive
if not os.path.isdir('/content/drive'):
    drive.mount('/content/drive')
Mounted at /content/drive

構成

MyDrive
├<weather_merchandising>
   ├<notebook>
   │ └run.ipynb
   ├<data>
   │ ├train_data.csv
   │ ├submission.csv
   │ └test_data.csv
   └<output>
# Config
# Paths into the mounted Google Drive project folder.
DRIVE_PATH = "/content/drive/MyDrive/ML/PROBSPACE/weather_merchandising"
INPUT = os.path.join(DRIVE_PATH, "data")
OUTPUT = os.path.join(DRIVE_PATH, "output")

TRAIN_FILE = os.path.join(INPUT, "train_data.csv")
TEST_FILE = os.path.join(INPUT, "test_data.csv")
SUB_FILE = os.path.join(INPUT, "submission.csv")

# Global random seed used by every LightGBM model below, for reproducibility.
seed = 42

# plot style
pd.set_option('display.max_columns', 200)
# Pale-green axes background; '#' makes the hex color explicit (the bare
# 'EEFFFE' form only works through rcParams' lenient matplotlibrc validator).
plt.rcParams['axes.facecolor'] = '#EEFFFE'
# Data
# Load the raw competition files: train has weather features plus the 39
# per-item sales targets; test has weather features only.
train = pd.read_csv(TRAIN_FILE)
test = pd.read_csv(TEST_FILE)
# Sample-submission file; presumably defines the expected output layout — TODO confirm
sub = pd.read_csv(SUB_FILE)
# Target Columns
# The 39 per-item daily sales counts to forecast, grouped by product
# category (items are numbered 1..n within each category).
_CATEGORY_ITEMS = [
    ('ice', 3), ('oden', 4), ('hot', 3), ('dessert', 5), ('drink', 6),
    ('alcol', 3), ('snack', 3), ('bento', 4), ('tild', 2), ('men', 6),
]
target_columns = [f'{cat}{i}' for cat, n in _CATEGORY_ITEMS for i in range(1, n + 1)]

EDA

def preprocessing(df, mode='train'):
    """Return a copy of *df* with calendar features added.

    The raw 'date' column only carries month/day, so a year is assigned:
    the train split runs Apr 2018 - Mar 2019 (months > 3 -> 2018,
    months <= 3 -> 2019); the test split is entirely 2019.
    Adds columns: time (datetime), year, month, day, weekday (Mon=0).
    """
    out = df.copy()
    base_year = 2018

    parsed = pd.to_datetime(out.date, format='%m/%d')
    out['time'] = parsed
    out['year'] = parsed.dt.year
    out['month'] = parsed.dt.month
    out['day'] = parsed.dt.day
    if mode == 'train':
        is_jan_to_mar = out['month'] <= 3
        out.loc[~is_jan_to_mar, 'year'] = base_year
        out.loc[is_jan_to_mar, 'year'] = base_year + 1
    else:
        out['year'] = base_year + 1
    # Rebuild 'time' with the corrected year so weekday is accurate.
    out['time'] = pd.to_datetime({'year': out.year, 'month': out.month, 'day': out.day})
    out['weekday'] = out['time'].dt.weekday
    return out

# Add calendar features to both splits and keep a combined frame for EDA.
train_df = preprocessing(train, mode='train')
test_df = preprocessing(test, mode='test')
# NOTE(review): all_df is rebuilt by the adversarial-validation cell later.
all_df = pd.concat([train_df, test_df]).reset_index(drop=True)

display(train_df.head(3))
display(test_df.head(3))
id date highest lowest rain ice1 ice2 ice3 oden1 oden2 oden3 oden4 hot1 hot2 hot3 dessert1 dessert2 dessert3 dessert4 dessert5 drink1 drink2 drink3 drink4 drink5 drink6 alcol1 alcol2 alcol3 snack1 snack2 snack3 bento1 bento2 bento3 bento4 tild1 tild2 men1 men2 men3 men4 men5 men6 time year month day weekday
0 1 4/11 21.9 12.4 0.0 25 72 26 10 23 52 35 180 254 270 42 58 50 59 67 54 45 28 49 22 8 63 51 59 26 21 35 56 46 70 27 12 12 57 30 41 38 37 35 2018-04-11 2018 4 11 2
1 2 4/12 25.9 13.9 0.0 30 85 33 9 18 42 26 202 219 235 22 36 5 28 37 69 54 35 58 22 9 77 66 72 36 32 63 8 14 23 9 5 8 19 9 13 26 4 16 2018-04-12 2018 4 12 3
2 3 4/13 20.9 11.9 0.0 21 68 28 12 22 57 31 164 210 223 20 41 5 30 32 46 38 24 45 26 9 81 69 74 36 25 57 9 12 19 6 4 9 23 9 11 33 4 13 2018-04-13 2018 4 13 4
id date highest lowest rain time year month day weekday
0 1 3/27 19.7 7.3 0.0 2019-03-27 2019 3 27 2
1 2 3/28 16.9 9.0 0.0 2019-03-28 2019 3 28 3
2 3 3/29 9.3 6.8 0.0 2019-03-29 2019 3 29 4
# Plot every numeric series (weather + all 39 targets) over the full year
# to eyeball seasonality, trend, and weekday patterns.
plot_col = [c for c in train.columns if c not in ['id', 'date', 'time', 'year', 'month', 'day', 'weekday']]
ncols = len(plot_col) // 13
plt.subplots(14, ncols, sharey=True, sharex=True, figsize=(30, 80))
for i, col in enumerate(plot_col):
    plt.subplot(14, ncols, i+1)
    plt.plot(train_df.date, train_df[col], alpha=1, color='orange', label=col)
    # Beginning of each month; the last line marks the train/validation split.
    # Fixed typo: `173.204` merged the two boundaries 173 and 204.
    for x in [20, 51, 81, 112, 143, 173, 204, 234, 265, 296, 324, 350]:
        plt.axvline(x)
    plt.xlabel(col)
    plt.legend()
    plt.xticks([])
plt.show()

# アイスは夏の分布が大きく異なるものがある>学習から除外
# おでんは7月から9月は提供していない>学習から除外
# ホットは周期性と小さい下降トレンド
# アイスやドリンクは気温とかなり相関があるが、ドリンクは売り上げの下限値がありそう
# 横ばいのものにも曜日の周期性は見られる

曜日の周期性を調べる

# Mean sales per weekday. Selecting the target columns *before* .mean()
# avoids aggregating the non-numeric date/time columns, which raises a
# TypeError in pandas >= 2.0 (and wastes work on older versions).
train_df.groupby('weekday')[target_columns].mean().style.background_gradient()

# アイスは0で売れる
# おでんは2,3で売れない
# ホットは1,2,5,6で売れる
# デザートは2,6で売れる
# ドリンク145は0で、ドリンク23は4で、ドリンク6は0,2,3で売れる
# アルコールは0から6にかけて売れ行きが増加
# スナックは3,4,0の順で売れている
# 弁当は1,2,5,6で大きく売れる
# チルドは2で大きく売れる
# 麺は1,2,5,6で大きく売れる
  ice1 ice2 ice3 oden1 oden2 oden3 oden4 hot1 hot2 hot3 dessert1 dessert2 dessert3 dessert4 dessert5 drink1 drink2 drink3 drink4 drink5 drink6 alcol1 alcol2 alcol3 snack1 snack2 snack3 bento1 bento2 bento3 bento4 tild1 tild2 men1 men2 men3 men4 men5 men6
weekday                                                                              
0 55.360000 74.300000 41.920000 29.720000 36.020000 69.940000 42.220000 141.800000 210.760000 213.620000 25.060000 44.580000 7.040000 29.500000 38.200000 56.900000 44.560000 28.520000 50.300000 25.340000 10.000000 42.460000 26.440000 37.440000 31.180000 24.980000 45.860000 9.900000 12.700000 20.200000 8.180000 6.720000 9.020000 21.800000 10.500000 14.340000 32.420000 4.960000 18.840000
1 41.740000 72.420000 38.680000 26.980000 36.700000 67.080000 39.460000 161.000000 236.660000 236.760000 25.000000 44.160000 7.060000 29.520000 37.980000 54.660000 43.700000 27.840000 49.320000 24.920000 9.620000 51.080000 37.920000 48.220000 21.920000 10.960000 35.260000 49.860000 42.380000 69.760000 27.420000 6.780000 9.000000 52.400000 32.120000 38.760000 41.500000 41.360000 39.600000
2 40.860000 71.380000 36.680000 27.920000 34.100000 66.380000 40.800000 159.940000 229.140000 236.620000 45.420000 65.400000 49.520000 58.140000 66.860000 53.040000 42.320000 27.380000 48.680000 24.780000 10.140000 61.820000 55.340000 58.880000 26.080000 19.720000 43.560000 50.820000 43.440000 69.160000 28.360000 14.080000 11.740000 51.800000 30.160000 38.900000 41.220000 40.200000 38.420000
3 42.900000 71.460000 37.840000 27.400000 34.980000 68.740000 39.900000 142.360000 213.920000 215.600000 24.780000 44.540000 7.020000 29.360000 38.000000 53.960000 43.320000 27.880000 48.540000 25.000000 9.900000 73.560000 67.440000 70.800000 35.160000 33.600000 54.640000 10.160000 13.640000 20.160000 8.420000 6.340000 8.980000 21.200000 10.380000 13.980000 31.140000 4.920000 18.080000
4 41.380000 70.860000 37.060000 31.440000 35.840000 71.340000 44.280000 139.720000 211.040000 212.440000 24.660000 43.380000 6.660000 29.280000 36.760000 52.760000 47.080000 31.200000 48.900000 24.760000 9.260000 80.560000 77.980000 79.700000 32.280000 25.100000 47.740000 10.100000 13.240000 19.680000 8.080000 6.520000 8.740000 21.380000 10.520000 14.120000 32.240000 5.000000 18.520000
5 43.660000 71.800000 38.500000 31.340000 37.100000 78.760000 46.180000 159.000000 232.000000 233.600000 25.400000 45.100000 7.180000 29.760000 38.280000 54.940000 43.500000 28.160000 49.440000 24.740000 9.540000 91.160000 92.380000 90.660000 21.300000 10.680000 36.420000 50.560000 44.340000 70.300000 28.400000 6.020000 8.960000 51.340000 30.040000 39.420000 41.760000 41.140000 38.860000
6 44.680000 71.680000 38.620000 30.860000 39.080000 73.280000 45.360000 155.080000 231.540000 235.220000 45.100000 65.820000 48.520000 57.980000 67.940000 54.000000 43.840000 27.980000 49.220000 24.920000 9.700000 101.520000 109.120000 103.080000 25.380000 20.020000 44.460000 49.640000 43.840000 68.900000 27.860000 7.360000 9.660000 52.280000 31.080000 40.480000 42.080000 40.840000 40.580000

日付の周期性を調べる

# Mean sales per day-of-month. Column selection before .mean() avoids
# aggregating the non-numeric date/time columns (TypeError in pandas >= 2.0).
train_df.groupby('day')[target_columns].mean().style.background_gradient()
# There may be patterns near the start and end of the month
  ice1 ice2 ice3 oden1 oden2 oden3 oden4 hot1 hot2 hot3 dessert1 dessert2 dessert3 dessert4 dessert5 drink1 drink2 drink3 drink4 drink5 drink6 alcol1 alcol2 alcol3 snack1 snack2 snack3 bento1 bento2 bento3 bento4 tild1 tild2 men1 men2 men3 men4 men5 men6
day                                                                              
1 54.454545 78.545455 40.727273 29.545455 32.454545 69.454545 43.272727 146.909091 214.818182 222.363636 28.727273 47.272727 14.090909 33.090909 41.000000 65.909091 49.454545 32.090909 55.181818 24.909091 9.000000 74.272727 70.909091 71.454545 27.090909 19.818182 41.545455 31.454545 29.909091 47.636364 19.636364 6.272727 9.000000 35.818182 20.909091 26.454545 37.727273 22.818182 28.181818
2 64.090909 74.818182 43.454545 30.545455 35.727273 67.636364 50.454545 142.000000 228.000000 228.636364 31.000000 50.454545 20.636364 39.181818 47.090909 57.272727 45.272727 29.000000 48.727273 24.636364 8.727273 76.000000 70.000000 74.636364 26.181818 19.181818 40.727273 39.454545 37.272727 51.181818 23.000000 6.818182 9.181818 43.727273 24.727273 31.909091 40.727273 30.909091 32.727273
3 48.090909 74.909091 39.363636 27.636364 47.727273 65.545455 41.636364 143.545455 220.454545 219.636364 30.363636 50.181818 19.636364 39.000000 46.181818 58.272727 45.636364 29.636364 50.909091 24.818182 9.181818 74.272727 70.636364 72.000000 26.454545 22.000000 46.636364 30.454545 29.363636 44.909091 18.454545 5.727273 9.090909 37.090909 20.454545 27.181818 34.181818 23.636364 26.636364
4 40.000000 75.090909 37.090909 22.727273 28.909091 57.636364 35.818182 149.272727 228.727273 225.909091 26.727273 44.272727 13.545455 33.909091 40.272727 58.545455 46.181818 29.181818 50.545455 25.090909 9.181818 66.363636 59.727273 64.272727 28.545455 21.454545 43.181818 26.727273 26.454545 44.454545 17.000000 8.636364 9.909091 37.636364 20.727273 28.181818 36.909091 24.636364 30.000000
5 43.181818 75.636364 39.454545 25.636364 29.636364 63.272727 36.272727 159.636364 227.000000 231.818182 27.636364 45.090909 16.363636 35.636364 44.000000 58.454545 48.181818 28.818182 49.363636 24.545455 9.363636 68.818182 60.363636 68.363636 25.363636 18.363636 42.363636 39.454545 35.181818 56.454545 24.090909 8.545455 9.818182 45.272727 25.363636 33.636364 37.909091 32.363636 36.636364
6 39.090909 71.090909 38.454545 24.000000 27.909091 64.818182 36.909091 156.818182 225.545455 229.727273 30.272727 49.363636 24.090909 41.272727 46.363636 51.272727 42.000000 27.000000 50.727273 26.090909 12.454545 74.727273 75.363636 75.090909 27.818182 21.818182 42.545455 36.181818 32.181818 51.636364 21.909091 9.000000 9.818182 41.818182 21.909091 30.181818 37.272727 26.727273 31.727273
7 33.727273 72.636364 35.636364 27.000000 34.181818 68.909091 39.727273 151.545455 218.818182 226.545455 24.818182 43.090909 12.181818 33.454545 41.818182 54.363636 45.000000 28.727273 50.545455 25.090909 9.636364 69.363636 65.272727 68.000000 29.090909 24.090909 50.636364 25.545455 23.818182 39.090909 15.909091 5.909091 8.454545 30.909091 17.454545 21.636364 32.818182 16.818182 22.090909
8 35.636364 67.727273 34.818182 31.454545 38.000000 73.363636 40.727273 144.545455 221.181818 221.454545 24.909091 44.454545 13.272727 32.272727 41.181818 51.545455 45.272727 28.909091 46.818182 24.545455 9.545455 73.727273 69.000000 72.545455 27.363636 19.909091 41.090909 32.454545 31.000000 45.090909 17.272727 8.454545 10.363636 37.545455 21.909091 28.545455 39.363636 24.727273 34.090909
9 42.727273 69.454545 38.181818 35.454545 41.909091 83.454545 46.909091 142.181818 222.000000 227.000000 27.636364 46.000000 19.727273 38.272727 46.454545 53.909091 44.090909 27.545455 50.909091 26.909091 12.636364 76.272727 74.454545 74.545455 27.000000 19.545455 42.363636 38.818182 36.363636 56.636364 21.181818 10.000000 10.545455 46.272727 24.636364 33.636364 44.454545 33.272727 37.000000
10 44.000000 68.545455 37.454545 34.818182 40.818182 77.000000 44.363636 138.818182 218.181818 214.181818 27.909091 48.363636 18.909091 36.909091 46.090909 52.363636 42.727273 26.363636 49.545455 26.909091 12.909091 76.090909 76.363636 80.909091 27.909091 22.454545 46.909091 31.363636 29.545455 48.363636 19.454545 7.181818 9.545455 39.363636 21.727273 29.090909 35.545455 24.818182 28.818182
11 39.500000 67.666667 36.000000 32.916667 40.000000 75.666667 47.250000 143.166667 220.750000 217.750000 26.000000 43.416667 15.583333 34.750000 43.333333 48.916667 40.416667 29.666667 48.000000 25.583333 12.000000 68.333333 60.333333 63.333333 28.833333 19.916667 45.166667 30.083333 28.500000 42.750000 18.000000 7.833333 9.250000 36.833333 20.750000 27.666667 38.750000 24.000000 28.750000
12 29.166667 68.833333 33.416667 27.916667 45.083333 80.750000 39.666667 158.083333 227.416667 229.916667 25.500000 45.083333 14.833333 35.750000 43.500000 53.333333 41.500000 27.500000 48.083333 24.583333 10.000000 69.750000 61.083333 64.083333 28.583333 18.750000 41.333333 37.083333 33.833333 53.416667 20.250000 8.083333 9.916667 40.416667 24.250000 31.416667 39.166667 29.583333 32.583333
13 41.166667 69.166667 36.166667 27.166667 33.083333 69.083333 40.500000 158.166667 226.583333 226.750000 28.250000 49.166667 22.083333 39.416667 46.416667 51.250000 42.833333 26.833333 48.166667 24.750000 9.666667 76.083333 71.750000 72.083333 28.083333 21.583333 47.666667 35.583333 28.916667 48.083333 18.500000 7.666667 10.166667 40.000000 22.583333 28.666667 37.666667 25.666667 31.083333
14 47.833333 69.333333 41.166667 33.416667 41.666667 79.916667 48.750000 148.666667 217.666667 225.583333 23.583333 44.000000 12.000000 32.166667 40.416667 51.250000 41.500000 27.833333 47.666667 25.083333 9.000000 70.500000 63.583333 68.083333 27.750000 23.083333 51.000000 26.750000 24.250000 39.250000 16.333333 8.750000 10.583333 34.916667 19.250000 25.166667 37.750000 19.833333 27.583333
15 40.416667 67.083333 37.416667 36.250000 43.750000 78.000000 51.000000 149.916667 225.000000 230.500000 25.083333 47.666667 15.583333 34.166667 41.500000 48.083333 40.000000 25.500000 45.083333 25.166667 10.916667 76.666667 74.500000 74.916667 28.833333 20.083333 42.833333 35.333333 29.083333 49.166667 19.333333 5.666667 8.250000 38.083333 22.750000 28.916667 36.583333 25.750000 27.083333
16 39.333333 68.500000 38.333333 33.333333 38.083333 82.500000 51.000000 147.333333 223.750000 225.916667 42.833333 67.500000 30.666667 49.500000 68.166667 52.750000 41.500000 26.666667 47.250000 24.916667 9.416667 70.916667 67.250000 71.583333 28.166667 20.083333 39.666667 37.083333 33.250000 51.000000 21.500000 9.250000 10.333333 42.000000 25.916667 32.583333 42.166667 30.250000 35.333333
17 40.583333 70.916667 39.083333 32.750000 37.416667 77.000000 45.333333 152.333333 219.166667 222.750000 40.083333 62.000000 26.583333 44.416667 53.416667 51.666667 42.833333 26.416667 47.250000 24.000000 8.916667 70.166667 67.750000 70.416667 29.333333 20.666667 45.083333 32.583333 31.416667 50.166667 19.833333 6.750000 8.583333 38.000000 22.083333 26.666667 36.500000 24.166667 29.166667
18 39.416667 68.333333 36.916667 31.000000 36.166667 69.833333 45.250000 151.083333 220.500000 229.583333 35.583333 56.000000 20.083333 38.416667 46.416667 47.166667 40.083333 25.250000 44.833333 24.666667 9.250000 66.166667 58.166667 61.083333 27.750000 20.250000 45.250000 30.416667 27.083333 45.416667 18.250000 9.083333 10.083333 39.333333 21.083333 28.583333 37.250000 24.500000 31.333333
19 38.916667 72.583333 36.416667 26.250000 35.166667 68.000000 41.833333 160.416667 230.750000 231.000000 33.666667 50.500000 18.666667 36.083333 44.166667 53.166667 44.083333 28.166667 48.333333 24.416667 8.666667 67.083333 59.083333 65.333333 26.500000 20.000000 41.916667 36.083333 31.416667 53.000000 22.416667 8.416667 9.916667 40.750000 23.666667 32.000000 38.416667 29.916667 31.500000
20 33.166667 71.666667 34.666667 25.750000 33.166667 71.166667 40.500000 157.583333 218.916667 232.083333 34.916667 52.250000 23.083333 40.833333 47.916667 48.666667 41.833333 26.750000 47.916667 24.083333 9.666667 72.583333 69.916667 70.083333 26.750000 21.583333 44.750000 33.750000 31.000000 48.916667 19.250000 6.833333 8.666667 38.000000 21.750000 27.166667 35.333333 25.333333 27.500000
21 44.500000 73.750000 38.833333 26.583333 34.416667 69.166667 40.750000 158.250000 228.000000 225.000000 28.666667 46.833333 13.000000 33.833333 41.250000 56.583333 43.083333 31.083333 50.750000 24.583333 8.833333 70.583333 64.583333 70.083333 28.750000 21.916667 47.083333 27.416667 26.333333 39.083333 17.083333 6.416667 9.166667 33.750000 18.250000 24.416667 35.500000 19.583333 25.500000
22 52.000000 77.083333 39.250000 26.083333 34.000000 69.416667 41.750000 155.000000 230.833333 221.833333 28.416667 47.250000 15.750000 34.083333 41.333333 60.916667 47.583333 33.333333 51.916667 24.416667 8.416667 73.500000 68.750000 72.833333 26.000000 19.166667 41.250000 32.083333 32.083333 50.333333 19.666667 8.583333 9.250000 40.500000 23.416667 29.500000 37.750000 26.500000 32.750000
23 80.416667 69.750000 46.583333 30.416667 35.583333 69.250000 43.250000 149.083333 223.250000 224.916667 30.916667 47.500000 20.416667 37.583333 45.000000 47.083333 41.500000 26.166667 45.583333 24.333333 9.500000 73.750000 68.833333 73.333333 25.916667 19.416667 42.250000 36.750000 34.083333 54.666667 21.666667 9.833333 10.416667 43.166667 24.666667 32.416667 39.666667 29.500000 37.000000
24 42.500000 71.416667 38.083333 35.250000 39.416667 78.500000 48.333333 151.750000 218.500000 233.083333 29.500000 49.166667 18.916667 37.083333 44.583333 51.333333 41.833333 26.750000 49.416667 24.416667 9.416667 72.416667 65.166667 67.833333 29.083333 21.333333 43.500000 34.000000 31.333333 49.583333 20.416667 6.000000 9.000000 36.000000 23.500000 28.916667 33.583333 22.583333 28.750000
25 49.916667 74.250000 41.500000 31.333333 37.583333 69.666667 40.916667 145.916667 221.000000 225.166667 27.250000 45.166667 16.000000 35.333333 43.750000 57.333333 45.916667 28.583333 50.416667 25.166667 9.083333 66.583333 58.583333 62.000000 26.583333 20.333333 43.500000 29.750000 27.750000 43.833333 18.416667 6.500000 8.583333 35.500000 20.916667 25.166667 34.416667 22.416667 27.666667
26 45.500000 71.833333 38.750000 25.666667 35.333333 63.583333 37.666667 155.000000 228.500000 232.666667 42.000000 65.916667 27.666667 46.416667 65.666667 54.583333 43.916667 28.083333 48.500000 24.333333 8.916667 70.166667 61.250000 67.666667 24.500000 19.166667 40.833333 35.833333 34.000000 54.000000 21.666667 8.500000 9.583333 41.250000 23.416667 30.000000 40.333333 29.500000 33.916667
27 47.818182 73.000000 41.000000 23.818182 31.909091 60.090909 37.272727 163.000000 229.181818 229.727273 38.909091 64.000000 27.636364 43.909091 55.272727 54.818182 44.272727 27.454545 48.909091 24.818182 9.363636 74.000000 74.545455 75.363636 31.090909 22.818182 45.545455 31.272727 30.363636 50.545455 19.090909 7.181818 9.090909 38.000000 21.181818 26.454545 35.272727 23.545455 29.454545
28 32.545455 71.636364 34.272727 29.090909 34.454545 67.000000 40.636364 155.363636 224.272727 223.636364 34.000000 56.000000 17.818182 36.636364 44.090909 55.454545 43.909091 32.272727 50.636364 25.454545 10.545455 73.818182 67.363636 69.636364 28.090909 21.000000 48.272727 28.818182 27.545455 44.272727 17.000000 7.363636 9.181818 36.636364 18.909091 24.636364 37.909091 22.000000 28.454545
29 39.900000 76.000000 37.400000 23.500000 28.500000 62.000000 34.300000 152.500000 230.900000 234.000000 34.300000 53.100000 20.000000 38.500000 45.600000 61.200000 48.900000 29.300000 51.800000 24.600000 9.200000 75.300000 71.800000 71.500000 26.800000 18.600000 40.700000 38.500000 36.200000 54.100000 21.600000 6.800000 8.400000 43.900000 25.000000 30.700000 37.400000 28.800000 30.500000
30 48.400000 79.300000 40.100000 27.300000 31.100000 61.300000 37.500000 155.000000 223.600000 225.800000 34.000000 53.600000 24.400000 39.100000 48.500000 63.500000 50.200000 32.700000 53.900000 24.900000 9.000000 70.700000 63.100000 68.400000 27.300000 21.100000 41.500000 35.400000 31.700000 50.500000 20.300000 9.100000 9.700000 39.900000 22.900000 31.500000 38.200000 26.100000 32.700000
31 69.166667 75.333333 46.666667 38.500000 39.333333 80.833333 52.166667 139.833333 213.500000 212.333333 29.833333 45.833333 13.000000 34.000000 40.000000 60.500000 49.666667 29.333333 50.333333 25.333333 9.666667 61.666667 53.500000 61.333333 29.166667 25.166667 48.000000 22.333333 21.500000 34.500000 15.333333 6.666667 8.166667 29.833333 17.166667 18.333333 33.500000 15.666667 23.166667

テストデータに似ている訓練データを探す

# Adversarial Validation: train a classifier to separate train rows (label 0)
# from test rows (label 1). A high AUC means the feature distributions differ;
# the per-row predictions reveal which training rows resemble the test period.
train = pd.read_csv(TRAIN_FILE)
test = pd.read_csv(TEST_FILE)
train = train.drop(target_columns, axis=1)

train = preprocessing(train, mode='train')
test = preprocessing(test, mode='test')

train['target'] = 0
test['target'] = 1
all_df = pd.concat([train, test], axis=0)

train_, test_ = train_test_split(all_df, test_size=0.33, random_state=42, shuffle=True)
# Drop identifiers and the leaky calendar columns (year/month trivially
# separate the two splits); keep highest/lowest/rain/day/weekday.
drop_cols = ['id', 'date', 'time', 'year', 'month', 'target']
train_x = train_.drop(drop_cols, axis=1)
test_x = test_.drop(drop_cols, axis=1)
train_y = train_['target'].values
test_y = test_['target'].values

lgb_train = lgb.Dataset(train_x, label=train_y)
lgb_test = lgb.Dataset(test_x, label=test_y)

param = {'metric': 'auc',
         'seed': seed,
         'verbosity': -1}

num_round = 100
# LightGBM >= 4.0 removed the verbose_eval / early_stopping_rounds kwargs
# from lgb.train; the callbacks API below also works on 3.3+.
clf = lgb.train(param, lgb_train, num_round, valid_sets=[lgb_test],
                callbacks=[lgb.log_evaluation(50), lgb.early_stopping(50)])

feature_imp = pd.DataFrame(sorted(zip(clf.feature_importance(), train_x.columns)), columns=['Value','Feature'])
plt.figure(figsize=(6, 5))
sns.barplot(x="Value", y="Feature", data=feature_imp.sort_values(by="Value", ascending=False))
plt.title('LightGBM Features')
plt.tight_layout()
plt.show()

display(feature_imp.sort_values(by="Value", ascending=False))
# Highest-scoring training rows = the ones most similar to the test period.
print(np.sort(clf.predict(train_x))[::-1][:30])
print(np.argsort(clf.predict(train_x))[::-1][:30])

# No particularly similar period stands out either
Training until validation scores don't improve for 50 rounds.
[50]	valid_0's auc: 0.819469
Early stopping, best iteration is:
[14]	valid_0's auc: 0.891593
Value Feature
4 24 lowest
3 22 highest
2 17 day
1 5 weekday
0 1 rain
[0.24907411 0.24907411 0.24907411 0.24297665 0.24297665 0.24297665
 0.24054652 0.23205524 0.23205524 0.23205524 0.23205524 0.23205524
 0.23205524 0.23205524 0.22962511 0.2220307  0.21542067 0.17226214
 0.1697324  0.1697324  0.1697324  0.1697324  0.16401879 0.15614695
 0.15614695 0.15309783 0.15309783 0.14594867 0.13259713 0.13259713]
[178 240 151  79   5 138  54 123 184 193  92 172 216 133  89  42  72  74
 119 229 162  29 171  70 209  69 201 205 112 152]

MODEL

# Data
train = pd.read_csv(TRAIN_FILE)
test = pd.read_csv(TEST_FILE)
sub = pd.read_csv(SUB_FILE)

FE

def preprocessing(df, mode='train'):
    """Attach date-derived columns (time, year, month, day, weekday).

    'date' holds only month/day, so the year is inferred: train covers
    Apr 2018 - Mar 2019 (month > 3 maps to 2018, otherwise 2019), while
    every test row belongs to 2019. Returns a new DataFrame.
    """
    res = df.copy()
    input_year = 2018

    md = pd.to_datetime(res.date, format='%m/%d')
    res['time'] = md
    res['year'] = md.dt.year
    res['month'] = md.dt.month
    res['day'] = md.dt.day
    if mode == 'train':
        res['year'] = res['month'].map(lambda m: input_year if m > 3 else input_year + 1)
    else:
        res['year'] = input_year + 1
    # Recompute 'time' with the inferred year so weekday comes out right.
    res['time'] = pd.to_datetime({'year': res.year, 'month': res.month, 'day': res.day})
    res['weekday'] = res['time'].dt.weekday
    return res

train = preprocessing(train, mode='train')
test = preprocessing(test, mode='test')
train.columns
Index(['id', 'date', 'highest', 'lowest', 'rain', 'ice1', 'ice2', 'ice3',
       'oden1', 'oden2', 'oden3', 'oden4', 'hot1', 'hot2', 'hot3', 'dessert1',
       'dessert2', 'dessert3', 'dessert4', 'dessert5', 'drink1', 'drink2',
       'drink3', 'drink4', 'drink5', 'drink6', 'alcol1', 'alcol2', 'alcol3',
       'snack1', 'snack2', 'snack3', 'bento1', 'bento2', 'bento3', 'bento4',
       'tild1', 'tild2', 'men1', 'men2', 'men3', 'men4', 'men5', 'men6',
       'time', 'year', 'month', 'day', 'weekday'],
      dtype='object')

Run

# === Train one quantile model per (item, quantile) pair ===

# Validation rows: ids 297-350 cover Feb-Mar, mirroring the test period.
valid_index = range(297,351) # month:2,3

# Predictions keyed by (item, quantile); per-item validation scores.
results = dict({})
all_lgb_score = []

# Quantile levels required by the submission format.
qs = np.array([0.01, 0.1, 0.5, 0.9, 0.99])

# Weekday indicator features per product group (from the weekday EDA).
# Items not listed (drink6, alcol1-3) get no extra flags, as before.
flag_specs = [
    (target_columns[0:3], {'is_wday0': [0]}),                          # ice
    (target_columns[3:7], {'is_wday23': [2, 3]}),                      # oden
    (target_columns[7:10], {'is_wday034': [0, 3, 4]}),                 # hot
    (target_columns[10:15], {'is_wday26': [2, 6]}),                    # dessert
    ([target_columns[15], target_columns[18], target_columns[19]],
     {'is_wday0': [0]}),                                               # drink1/4/5
    (target_columns[16:18], {'is_wday4': [4]}),                        # drink2/3
    (target_columns[24:27], {'is_wday4': [4], 'is_wday15': [1, 5]}),   # snack
    (target_columns[27:31], {'is_wday034': [0, 3, 4]}),                # bento
    (target_columns[31:33], {'is_wday2': [2]}),                        # tild
    (target_columns[33:39], {'is_wday034': [0, 3, 4]}),                # men
]

# Build one prediction per item
for c in tqdm(target_columns):
    train_tmp = train.copy()
    test_tmp = test.copy()

    # Item-specific row filters reflecting seasonality (see EDA notes).
    if c in target_columns[0:3]:
        # ice: summer distribution differs a lot -> drop Jul-Sep
        train_tmp = train_tmp[~train_tmp['month'].isin([7,8,9])]
    elif c in target_columns[3:7]:
        # oden: not sold Jul-Sep -> keep Oct-Mar (plus the validation rows)
        train_tmp = train_tmp[(train_tmp['month'].isin([10,11,12,1,2,3]))|(train_tmp['id'].isin(valid_index))]

    # Weekday flags for this item's product group, if any.
    for items, flags in flag_specs:
        if c in items:
            for flag_name, days in flags.items():
                train_tmp[flag_name] = train_tmp['weekday'].isin(days).astype(int)
                test_tmp[flag_name] = test_tmp['weekday'].isin(days).astype(int)
            break

    # Features = everything except targets and the raw/derived date strings.
    feature_cols = [col for col in train_tmp.columns
                    if col not in target_columns and col not in ['date', 'time']]

    is_valid = train_tmp['id'].isin(valid_index)
    x_train = train_tmp.loc[~is_valid, feature_cols].drop(['id'], axis=1)
    y_train = train_tmp.loc[~is_valid, c]
    x_valid = train_tmp.loc[is_valid, feature_cols].drop(['id'], axis=1)
    y_valid = train_tmp.loc[is_valid, c]
    x_test = test_tmp[feature_cols].drop(['id'], axis=1)

    # One model per quantile. Unlike the original cell, each model is fitted
    # ONCE and used for both the test prediction and the validation score
    # (the second, identical training pass only recomputed the same scores,
    # doubling runtime for no change in output).
    lgb_scores = []
    for q in qs:
        # Named `model`, not `lgb`, so the lightgbm module is not shadowed.
        model = LGBMRegressor(
            objective='quantile',
            alpha=q,
            n_estimators=10000,
            max_depth=2,
            colsample_bytree=0.9,
            random_state=seed)
        # LightGBM >= 4.0 removed fit(early_stopping_rounds=..., verbose=...);
        # the callbacks API below also works on 3.3+.
        model.fit(x_train, y_train, eval_set=[(x_valid, y_valid)],
                  callbacks=[lgb.early_stopping(100, verbose=False)])
        lgb_scores.append(model.best_score_['valid_0']['quantile'])  # best validation pinball loss
        results[(c, q)] = model.predict(x_test)
    all_lgb_score.append(lgb_scores)
# Score: mean validation pinball loss across all items and quantiles
print(np.array(all_lgb_score).mean())
  0%|          | 0/39 [00:00<?, ?it/s]
0.8462863293289724

Submit

# Flatten the {(item, quantile): predictions} dict into labelled rows
# named "<item>_<quantile>", each followed by its test-period predictions.
submit_rows = [[f'{item}_{q}'] + preds.tolist() for (item, q), preds in results.items()]
# Keep the first 21 predictions per row (one per test id), rounded to ints.
row_labels = np.array(submit_rows)[:, 0]
row_values = np.array(submit_rows)[:, 1:22].astype(float).round()
submit_df = pd.DataFrame(row_values, index=row_labels)
submit_df.columns = list(range(1, 22))
# Transpose so rows are test ids (1..21) and columns are item_quantile.
submit_df = submit_df.transpose()
submit_df.index.name = 'id'
submit_df.to_csv(os.path.join(OUTPUT, "sub_exp00.csv"))
display(submit_df.head())
ice1_0.01 ice1_0.1 ice1_0.5 ice1_0.9 ice1_0.99 ice2_0.01 ice2_0.1 ice2_0.5 ice2_0.9 ice2_0.99 ice3_0.01 ice3_0.1 ice3_0.5 ice3_0.9 ice3_0.99 oden1_0.01 oden1_0.1 oden1_0.5 oden1_0.9 oden1_0.99 oden2_0.01 oden2_0.1 oden2_0.5 oden2_0.9 oden2_0.99 oden3_0.01 oden3_0.1 oden3_0.5 oden3_0.9 oden3_0.99 oden4_0.01 oden4_0.1 oden4_0.5 oden4_0.9 oden4_0.99 hot1_0.01 hot1_0.1 hot1_0.5 hot1_0.9 hot1_0.99 hot2_0.01 hot2_0.1 hot2_0.5 hot2_0.9 hot2_0.99 hot3_0.01 hot3_0.1 hot3_0.5 hot3_0.9 hot3_0.99 dessert1_0.01 dessert1_0.1 dessert1_0.5 dessert1_0.9 dessert1_0.99 dessert2_0.01 dessert2_0.1 dessert2_0.5 dessert2_0.9 dessert2_0.99 dessert3_0.01 dessert3_0.1 dessert3_0.5 dessert3_0.9 dessert3_0.99 dessert4_0.01 dessert4_0.1 dessert4_0.5 dessert4_0.9 dessert4_0.99 dessert5_0.01 dessert5_0.1 dessert5_0.5 dessert5_0.9 dessert5_0.99 drink1_0.01 drink1_0.1 drink1_0.5 drink1_0.9 drink1_0.99 drink2_0.01 drink2_0.1 drink2_0.5 drink2_0.9 drink2_0.99 drink3_0.01 drink3_0.1 drink3_0.5 drink3_0.9 drink3_0.99 drink4_0.01 drink4_0.1 drink4_0.5 drink4_0.9 drink4_0.99 drink5_0.01 drink5_0.1 drink5_0.5 drink5_0.9 drink5_0.99 drink6_0.01 drink6_0.1 drink6_0.5 drink6_0.9 drink6_0.99 alcol1_0.01 alcol1_0.1 alcol1_0.5 alcol1_0.9 alcol1_0.99 alcol2_0.01 alcol2_0.1 alcol2_0.5 alcol2_0.9 alcol2_0.99 alcol3_0.01 alcol3_0.1 alcol3_0.5 alcol3_0.9 alcol3_0.99 snack1_0.01 snack1_0.1 snack1_0.5 snack1_0.9 snack1_0.99 snack2_0.01 snack2_0.1 snack2_0.5 snack2_0.9 snack2_0.99 snack3_0.01 snack3_0.1 snack3_0.5 snack3_0.9 snack3_0.99 bento1_0.01 bento1_0.1 bento1_0.5 bento1_0.9 bento1_0.99 bento2_0.01 bento2_0.1 bento2_0.5 bento2_0.9 bento2_0.99 bento3_0.01 bento3_0.1 bento3_0.5 bento3_0.9 bento3_0.99 bento4_0.01 bento4_0.1 bento4_0.5 bento4_0.9 bento4_0.99 tild1_0.01 tild1_0.1 tild1_0.5 tild1_0.9 tild1_0.99 tild2_0.01 tild2_0.1 tild2_0.5 tild2_0.9 tild2_0.99 men1_0.01 men1_0.1 men1_0.5 men1_0.9 men1_0.99 men2_0.01 men2_0.1 men2_0.5 men2_0.9 men2_0.99 men3_0.01 men3_0.1 men3_0.5 men3_0.9 men3_0.99 men4_0.01 
men4_0.1 men4_0.5 men4_0.9 men4_0.99 men5_0.01 men5_0.1 men5_0.5 men5_0.9 men5_0.99 men6_0.01 men6_0.1 men6_0.5 men6_0.9 men6_0.99
id
1 16.0 19.0 20.0 21.0 28.0 51.0 60.0 60.0 63.0 90.0 24.0 26.0 30.0 33.0 38.0 10.0 34.0 45.0 52.0 103.0 46.0 45.0 59.0 58.0 129.0 82.0 91.0 108.0 150.0 249.0 55.0 59.0 63.0 68.0 147.0 88.0 139.0 158.0 174.0 217.0 178.0 216.0 219.0 234.0 269.0 214.0 223.0 236.0 257.0 281.0 20.0 49.0 53.0 53.0 57.0 46.0 67.0 78.0 81.0 78.0 11.0 15.0 55.0 62.0 58.0 25.0 53.0 62.0 65.0 66.0 56.0 67.0 73.0 81.0 87.0 28.0 29.0 36.0 40.0 68.0 18.0 28.0 29.0 34.0 49.0 12.0 18.0 22.0 23.0 33.0 30.0 36.0 37.0 40.0 49.0 22.0 22.0 24.0 26.0 30.0 7.0 7.0 8.0 8.0 16.0 54.0 56.0 61.0 64.0 76.0 31.0 45.0 47.0 52.0 75.0 52.0 54.0 57.0 61.0 66.0 18.0 22.0 27.0 32.0 40.0 9.0 17.0 21.0 24.0 37.0 26.0 37.0 43.0 53.0 59.0 23.0 44.0 51.0 54.0 74.0 31.0 37.0 43.0 48.0 49.0 15.0 62.0 70.0 78.0 81.0 8.0 24.0 28.0 30.0 33.0 4.0 11.0 13.0 20.0 21.0 8.0 10.0 11.0 16.0 18.0 26.0 43.0 50.0 63.0 66.0 21.0 24.0 30.0 34.0 35.0 27.0 31.0 37.0 45.0 54.0 26.0 33.0 40.0 48.0 59.0 30.0 32.0 41.0 46.0 51.0 13.0 31.0 37.0 51.0 61.0
2 16.0 18.0 20.0 21.0 29.0 51.0 54.0 57.0 58.0 90.0 24.0 26.0 30.0 32.0 38.0 10.0 14.0 25.0 48.0 103.0 27.0 27.0 37.0 50.0 129.0 63.0 65.0 69.0 143.0 249.0 37.0 41.0 54.0 56.0 147.0 88.0 138.0 147.0 159.0 212.0 178.0 211.0 225.0 228.0 256.0 202.0 206.0 222.0 238.0 281.0 20.0 29.0 31.0 33.0 39.0 37.0 49.0 54.0 55.0 66.0 5.0 5.0 10.0 10.0 18.0 25.0 28.0 31.0 35.0 48.0 32.0 38.0 38.0 46.0 65.0 24.0 23.0 27.0 30.0 69.0 18.0 26.0 28.0 29.0 49.0 12.0 15.0 20.0 25.0 33.0 30.0 30.0 33.0 39.0 49.0 22.0 22.0 23.0 26.0 30.0 7.0 7.0 8.0 8.0 16.0 64.0 66.0 73.0 76.0 86.0 31.0 57.0 60.0 67.0 77.0 62.0 63.0 69.0 78.0 90.0 18.0 29.0 35.0 38.0 40.0 9.0 27.0 32.0 36.0 37.0 26.0 43.0 52.0 60.0 65.0 8.0 9.0 10.0 12.0 31.0 11.0 12.0 14.0 18.0 22.0 15.0 17.0 21.0 23.0 24.0 6.0 7.0 8.0 10.0 14.0 4.0 4.0 5.0 15.0 16.0 7.0 7.0 9.0 13.0 18.0 16.0 18.0 21.0 30.0 47.0 8.0 8.0 10.0 15.0 16.0 11.0 11.0 14.0 22.0 54.0 24.0 26.0 30.0 41.0 59.0 3.0 4.0 4.0 11.0 16.0 13.0 14.0 16.0 36.0 61.0
3 14.0 15.0 16.0 18.0 29.0 25.0 34.0 38.0 39.0 90.0 24.0 25.0 29.0 32.0 38.0 10.0 48.0 49.0 54.0 103.0 53.0 63.0 56.0 60.0 129.0 125.0 129.0 112.0 184.0 372.0 68.0 70.0 69.0 80.0 147.0 84.0 125.0 133.0 149.0 212.0 178.0 198.0 208.0 220.0 256.0 195.0 206.0 220.0 232.0 257.0 20.0 25.0 28.0 31.0 38.0 37.0 43.0 47.0 51.0 66.0 5.0 5.0 7.0 8.0 18.0 25.0 27.0 28.0 33.0 48.0 32.0 34.0 34.0 46.0 65.0 16.0 18.0 19.0 22.0 70.0 17.0 22.0 23.0 28.0 54.0 11.0 14.0 15.0 39.0 53.0 27.0 28.0 30.0 33.0 49.0 22.0 23.0 24.0 30.0 33.0 7.0 8.0 9.0 19.0 22.0 73.0 74.0 79.0 85.0 89.0 31.0 69.0 72.0 81.0 83.0 70.0 74.0 78.0 84.0 90.0 20.0 27.0 30.0 37.0 44.0 9.0 22.0 25.0 27.0 37.0 26.0 44.0 49.0 60.0 65.0 8.0 9.0 10.0 12.0 28.0 11.0 12.0 14.0 18.0 22.0 16.0 16.0 21.0 23.0 22.0 6.0 7.0 8.0 10.0 14.0 4.0 4.0 5.0 15.0 20.0 6.0 7.0 9.0 13.0 18.0 17.0 18.0 21.0 30.0 57.0 9.0 9.0 10.0 14.0 16.0 11.0 11.0 14.0 22.0 54.0 24.0 27.0 30.0 41.0 59.0 3.0 4.0 4.0 8.0 16.0 13.0 14.0 16.0 34.0 61.0
4 16.0 16.0 17.0 19.0 29.0 39.0 41.0 42.0 43.0 90.0 24.0 25.0 28.0 32.0 38.0 10.0 38.0 44.0 54.0 103.0 51.0 50.0 57.0 60.0 129.0 97.0 99.0 112.0 176.0 250.0 60.0 64.0 67.0 78.0 147.0 88.0 132.0 153.0 165.0 216.0 178.0 215.0 224.0 231.0 268.0 208.0 225.0 238.0 251.0 257.0 20.0 25.0 28.0 32.0 39.0 37.0 43.0 47.0 51.0 67.0 5.0 5.0 7.0 8.0 18.0 25.0 27.0 30.0 33.0 48.0 32.0 34.0 35.0 46.0 65.0 14.0 14.0 14.0 21.0 69.0 18.0 19.0 19.0 25.0 56.0 11.0 11.0 11.0 16.0 29.0 27.0 28.0 30.0 33.0 48.0 23.0 24.0 25.0 27.0 30.0 8.0 10.0 11.0 11.0 16.0 73.0 83.0 89.0 98.0 98.0 31.0 86.0 92.0 103.0 110.0 70.0 84.0 90.0 98.0 98.0 17.0 17.0 22.0 28.0 42.0 9.0 10.0 11.0 13.0 37.0 26.0 32.0 35.0 45.0 65.0 23.0 48.0 51.0 54.0 71.0 35.0 37.0 45.0 49.0 56.0 15.0 61.0 71.0 75.0 77.0 8.0 26.0 28.0 31.0 33.0 4.0 4.0 5.0 9.0 18.0 7.0 7.0 8.0 9.0 18.0 26.0 45.0 49.0 56.0 77.0 22.0 25.0 30.0 35.0 34.0 27.0 33.0 38.0 41.0 54.0 26.0 34.0 40.0 43.0 59.0 30.0 35.0 40.0 39.0 50.0 13.0 31.0 36.0 38.0 63.0
5 16.0 17.0 18.0 20.0 29.0 49.0 50.0 52.0 54.0 90.0 24.0 26.0 30.0 32.0 38.0 10.0 33.0 42.0 52.0 103.0 46.0 48.0 57.0 60.0 129.0 82.0 91.0 112.0 157.0 250.0 59.0 62.0 68.0 74.0 147.0 88.0 136.0 155.0 171.0 216.0 178.0 214.0 221.0 235.0 267.0 214.0 223.0 237.0 256.0 281.0 20.0 45.0 47.0 51.0 57.0 46.0 60.0 67.0 71.0 84.0 11.0 13.0 48.0 52.0 60.0 25.0 52.0 59.0 62.0 66.0 56.0 62.0 66.0 69.0 83.0 21.0 16.0 19.0 25.0 69.0 18.0 20.0 22.0 25.0 56.0 12.0 12.0 15.0 17.0 32.0 30.0 30.0 32.0 34.0 42.0 23.0 24.0 25.0 27.0 30.0 8.0 9.0 9.0 10.0 16.0 72.0 90.0 98.0 110.0 110.0 31.0 96.0 103.0 115.0 115.0 70.0 93.0 99.0 111.0 114.0 18.0 23.0 25.0 33.0 40.0 9.0 18.0 20.0 22.0 37.0 26.0 38.0 44.0 54.0 65.0 23.0 47.0 51.0 54.0 74.0 34.0 37.0 44.0 49.0 52.0 15.0 63.0 70.0 75.0 77.0 8.0 26.0 28.0 30.0 33.0 4.0 4.0 5.0 9.0 18.0 7.0 7.0 8.0 9.0 18.0 26.0 45.0 49.0 56.0 71.0 21.0 27.0 30.0 36.0 34.0 27.0 33.0 38.0 43.0 54.0 26.0 34.0 40.0 44.0 59.0 30.0 35.0 40.0 41.0 45.0 13.0 31.0 36.0 40.0 63.0

目視確認

# Attach the 0.5-quantile (median) predictions to the test frame so they can
# be plotted alongside the training series under the original column names.
pred_median_col = [c for c in submit_df.columns if '_0.5' in c]
test_add_pred = test.merge(
    submit_df[pred_median_col].reset_index(), on='id', how='left'
)
# Strip the "_0.5" suffix so prediction columns line up with target_columns.
renamed_cols = []
for c in test_add_pred.columns:
    renamed_cols.append(c.replace('_0.5', '') if '_0.5' in c else c)
test_add_pred.columns = renamed_cols
test_add_pred[target_columns] = test_add_pred[target_columns].astype(float)
test_add_pred.head(3)
id date highest lowest rain time year month day weekday ice1 ice2 ice3 oden1 oden2 oden3 oden4 hot1 hot2 hot3 dessert1 dessert2 dessert3 dessert4 dessert5 drink1 drink2 drink3 drink4 drink5 drink6 alcol1 alcol2 alcol3 snack1 snack2 snack3 bento1 bento2 bento3 bento4 tild1 tild2 men1 men2 men3 men4 men5 men6
0 1 3/27 19.7 7.3 0.0 2019-03-27 2019 3 27 2 20.0 60.0 30.0 45.0 59.0 108.0 63.0 158.0 219.0 236.0 53.0 78.0 55.0 62.0 73.0 36.0 29.0 22.0 37.0 24.0 8.0 61.0 47.0 57.0 27.0 21.0 43.0 51.0 43.0 70.0 28.0 13.0 11.0 50.0 30.0 37.0 40.0 41.0 37.0
1 2 3/28 16.9 9.0 0.0 2019-03-28 2019 3 28 3 20.0 57.0 30.0 25.0 37.0 69.0 54.0 147.0 225.0 222.0 31.0 54.0 10.0 31.0 38.0 27.0 28.0 20.0 33.0 23.0 8.0 73.0 60.0 69.0 35.0 32.0 52.0 10.0 14.0 21.0 8.0 5.0 9.0 21.0 10.0 14.0 30.0 4.0 16.0
2 3 3/29 9.3 6.8 0.0 2019-03-29 2019 3 29 4 16.0 38.0 29.0 49.0 56.0 112.0 69.0 133.0 208.0 220.0 28.0 47.0 7.0 28.0 34.0 19.0 23.0 15.0 30.0 24.0 9.0 79.0 72.0 78.0 30.0 25.0 49.0 10.0 14.0 21.0 8.0 5.0 9.0 21.0 10.0 14.0 30.0 4.0 16.0
# 7-day moving averages (min_periods=1 so the first rows are not NaN):
# train alone, predicted test alone, and the concatenated full timeline.
window = 7
train_rolling = train.rolling(window, min_periods=1).mean()
test_rolling = test_add_pred.rolling(window, min_periods=1).mean()
combined = pd.concat([train, test_add_pred], axis=0).reset_index(drop=True)
all_rolling = combined.rolling(window, min_periods=1).mean()
# Visual check: overlay the 7-day rolling mean of each product series,
# train (blue) vs. the predicted test continuation (red).
plot_col = [c for c in train.columns
            if c not in ['id', 'date', 'time', 'year', 'month', 'day', 'weekday']]
ncols = len(plot_col) // 13
plt.subplots(14, ncols, sharey=True, sharex=True, figsize=(30, 80))
for i, col in enumerate(plot_col):
    plt.subplot(14, ncols, i + 1)
    plt.plot(train_rolling.index[window:], train_rolling[col][window:],
             alpha=1, color='blue', label='train')
    # The test segment is taken from the concatenated rolling series so the
    # window spans the train/test boundary smoothly.
    plt.plot(all_rolling.index[-len(test_rolling):],
             all_rolling[col][-len(test_rolling):],
             alpha=1, color='red', label='test')
    # Approximate month-boundary markers. Fixed typo: the original list had
    # 173.204 (period instead of comma), which silently dropped the 204 line.
    for x in [20, 51, 81, 112, 143, 173, 204, 234, 265, 296, 324, 350]:
        plt.axvline(x)
    plt.xlabel(col)
    plt.legend()
    plt.xticks([])
plt.show()

Baseline

sub_base19
MODEL

  • MODEL = LGBMRegressor(objective='quantile')
  • seed = 42
  • year = 2018
  • n_estimators=10000
  • max_depth=2
  • colsample_bytree=0.9

FE

  • validation : month2,3
  • ice: del month7-9, add is_wday0
  • oden: del month7-9, add is_wday23
  • hot: add is_wday034
  • dessert: add is_wday26
  • drink145: add is_wday0
  • drink23: add is_wday4
  • snack: add is_wday4, is_wday15
  • bento: add is_wday034
  • tild: add is_wday2
  • men: add is_wday034

Post-Processing

  • round predictions to integers

Score: 0.8462863293289724

second sub_base17

  • add quarter

添付データ

  • run.ipynb?X-Amz-Expires=10800&X-Amz-Date=20241221T132112Z&X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAIP7GCBGMWPMZ42PQ
  • Aws4 request&x amz signedheaders=host&x amz signature=27a7cae890c77b945771fb3a67f38ce338a868bdce08ff230fe75a2cbfe1c17e
    Sard

    ご共有ありがとうございます。
    2018年だと判断した理由を教えて頂いてもよろしいでしょうか。

    Aws4 request&x amz signedheaders=host&x amz signature=a1ba1a34f50eec146eac804d1e574449380f45a7127022165de435e8321f8891
    kotrying

    最初実データを使用していると思っていたため、コロナ流行の影響を考慮する必要があるかもしれないと考え、まずは影響が及んでいない最近である2018年をベースラインに選択しました。
    その後トピックにてデータがシミュレーションデータを使用していると知ったのですが、最初に設定した2018年を前提に分析や特徴量作成をしていたため、変更していないだけ、という形になります。
    なので2018年の設定自体に重要な意味はありません。

    Aws4 request&x amz signedheaders=host&x amz signature=27a7cae890c77b945771fb3a67f38ce338a868bdce08ff230fe75a2cbfe1c17e
    Sard

    kotrying-san ご説明ありがとうございます。理解しました。

    Favicon
    new user
    コメントするには 新規登録 もしくは ログイン が必要です。