EDA & Model

Library & Data

# Library
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from tqdm.auto import tqdm
import statsmodels
import statsmodels.api as sm

import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_pinball_loss
from lightgbm import LGBMRegressor

import warnings
warnings.simplefilter('ignore')

# mount
from google.colab import drive

# Mount Google Drive only when the mount point is not already present,
# so re-running this cell does not try to mount a second time.
drive_already_mounted = os.path.isdir('/content/drive')
if not drive_already_mounted:
    drive.mount('/content/drive')

Mounted at /content/drive

構成

MyDrive
├<weather_merchandising>
　　　├<notebook>
　　　│　└run.ipynb
　　　├<data>
　　　│　├train_data.csv
　　　│　├submission.csv
　　　│　└test_data.csv
　　　└<output>

# Config
# Root folder of this project on the mounted Google Drive.
DRIVE_PATH = "/content/drive/MyDrive/ML/PROBSPACE/weather_merchandising"
# Sub-folders holding the input CSV files and the generated outputs.
INPUT, OUTPUT = (os.path.join(DRIVE_PATH, folder) for folder in ("data", "output"))

# Full paths of the training, test and submission CSV files.
TRAIN_FILE, TEST_FILE, SUB_FILE = (
    os.path.join(INPUT, filename)
    for filename in ("train_data.csv", "test_data.csv", "submission.csv")
)

# Random seed shared by the models trained below.
seed = 42

# plot style
# Show up to 200 columns when a DataFrame is displayed.
pd.options.display.max_columns = 200
# Background colour used for every matplotlib axes.
plt.rcParams.update({'axes.facecolor': 'EEFFFE'})

# Data
# Load the training, test and submission CSV files, in that order.
train, test, sub = map(pd.read_csv, (TRAIN_FILE, TEST_FILE, SUB_FILE))

# Target Columns
# Each product category and how many numbered products it contains;
# the column names are "<category><number>" with numbers starting at 1.
_products_per_category = [
    ('ice', 3), ('oden', 4), ('hot', 3), ('dessert', 5), ('drink', 6),
    ('alcol', 3), ('snack', 3), ('bento', 4), ('tild', 2), ('men', 6),
]
target_columns = [
    f'{category}{number}'
    for category, count in _products_per_category
    for number in range(1, count + 1)
]

EDA

def preprocessing(df, mode='train', input_year=2018):
    """Add calendar features derived from the ``date`` column ('month/day').

    The data carries no year, so one is assigned: in ``'train'`` mode, months
    after March get ``input_year`` and January-March get ``input_year + 1``;
    in any other mode every row gets ``input_year + 1``.

    Args:
        df: DataFrame with a ``date`` column of 'month/day' strings. It is
            not modified; a copy is returned.
        mode: ``'train'`` for the year-spanning training period; any other
            value is treated as test data.
        input_year: Calendar year in which the training period starts.

    Returns:
        A copy of ``df`` with the columns ``time`` (full timestamp), ``year``,
        ``month``, ``day`` and ``weekday`` (Monday=0) appended, in that order.

    Raises:
        ValueError: If a date cannot be parsed, or does not exist in the
            year assigned to it.
    """
    df_tmp = df.copy()

    # Parse against a leap year so that a '2/29' row is accepted; only the
    # month and day of this intermediate timestamp are used.
    parsed = pd.to_datetime('2000/' + df_tmp['date'], format='%Y/%m/%d')

    # Columns are created in this exact order because later cells derive the
    # feature order from the column order.
    df_tmp['time'] = parsed
    df_tmp['year'] = input_year + 1
    df_tmp['month'] = parsed.dt.month
    df_tmp['day'] = parsed.dt.day
    if mode == 'train':
        # Training data starts in April of input_year and runs into the next year.
        df_tmp.loc[df_tmp['month'] > 3, 'year'] = input_year

    # Rebuild the timestamp now that the real year is known.
    df_tmp['time'] = pd.to_datetime(
        {'year': df_tmp['year'], 'month': df_tmp['month'], 'day': df_tmp['day']}
    )
    df_tmp['weekday'] = df_tmp['time'].dt.weekday
    return df_tmp

# Add the calendar features to both data sets and stack them into one frame
# with a fresh 0..n-1 index.
train_df, test_df = preprocessing(train, mode='train'), preprocessing(test, mode='test')
all_df = pd.concat([train_df, test_df], ignore_index=True)

# Preview the first rows of each processed frame.
for frame in (train_df, test_df):
    display(frame.head(3))

	id	date	highest	lowest	ice1	ice2	ice3	oden1	oden2	oden3	oden4	hot1	hot2	hot3	dessert1	dessert2	dessert3	dessert4	dessert5	drink1	drink2	drink3	drink4	drink5	drink6	alcol1	alcol2	alcol3	snack1	snack2	snack3	bento1	bento2	bento3	bento4	tild1	tild2	men1	men2	men3	men4	men5	men6	time	year	month	day	weekday
0	1	4/11	21.9	12.4	25	72	26	10	23	52	35	180	254	270	42	58	50	59	67	54	45	28	49	22	8	63	51	59	26	21	35	56	46	70	27	12	12	57	30	41	38	37	35	2018-04-11	2018	4	11	2
1	2	4/12	25.9	13.9	30	85	33	9	18	42	26	202	219	235	22	36	5	28	37	69	54	35	58	22	9	77	66	72	36	32	63	8	14	23	9	5	8	19	9	13	26	4	16	2018-04-12	2018	4	12	3
2	3	4/13	20.9	11.9	21	68	28	12	22	57	31	164	210	223	20	41	5	30	32	46	38	24	45	26	9	81	69	74	36	25	57	9	12	19	6	4	9	23	9	11	33	4	13	2018-04-13	2018	4	13	4

	id	date	highest	lowest	time	year	month	day	weekday
0	1	3/27	19.7	7.3	2019-03-27	2019	3	27	2
1	2	3/28	16.9	9.0	2019-03-28	2019	3	28	3
2	3	3/29	9.3	6.8	2019-03-29	2019	3	29	4

# Plot every non-calendar column of the training data over time, one panel each.
plot_col = [c for c in train.columns if c not in ['id', 'date', 'time', 'year', 'month', 'day', 'weekday']]
ncols = len(plot_col) // 13
# Row positions at which a new month begins; the last entry is the train/test split.
# (The original list contained "173.204", a typo for the two entries 173 and 204.)
month_starts = [20, 51, 81, 112, 143, 173, 204, 234, 265, 296, 324, 350]
plt.subplots(14, ncols, sharey=True, sharex=True, figsize=(30, 80))
for i, col in enumerate(plot_col):
    plt.subplot(14, ncols, i+1)
    plt.plot(train_df.date, train_df[col], alpha=1, color='orange', label=col)
    for x in month_starts:
        plt.axvline(x)
    plt.xlabel(col)
    plt.legend()
    plt.xticks([])
plt.show()

# Some ice products have a very different distribution in summer -> exclude from training
# Oden is not sold from July to September -> exclude from training
# Hot items show periodicity and a slight downward trend
# Ice and drinks correlate strongly with temperature, but drinks seem to have a lower bound on sales
# Even the flat series show a weekday periodicity

曜日の周期性を調べる

# Mean sales of every product per weekday (0 = Monday ... 6 = Sunday).
# The target columns are selected before aggregating so that non-numeric
# columns such as 'date' are never passed to mean().
train_df.groupby('weekday')[target_columns].mean().style.background_gradient()

# Ice sells well on weekday 0
# Oden sells poorly on weekdays 2 and 3
# Hot items sell well on weekdays 1, 2, 5 and 6
# Desserts sell well on weekdays 2 and 6
# drink1/4/5 sell well on weekday 0, drink2/3 on weekday 4, drink6 on weekdays 0, 2 and 3
# Alcohol sales increase steadily from weekday 0 to weekday 6
# Snacks sell best on weekdays 3, 4 and 0, in that order
# Bento sells much more on weekdays 1, 2, 5 and 6
# Chilled items (tild) sell much more on weekday 2
# Noodles (men) sell much more on weekdays 1, 2, 5 and 6

	ice1	ice2	ice3	oden1	oden2	oden3	oden4	hot1	hot2	hot3	dessert1	dessert2	dessert3	dessert4	dessert5	drink1	drink2	drink3	drink4	drink5	drink6	alcol1	alcol2	alcol3	snack1	snack2	snack3	bento1	bento2	bento3	bento4	tild1	tild2	men1	men2	men3	men4	men5	men6
weekday
0	55.360000	74.300000	41.920000	29.720000	36.020000	69.940000	42.220000	141.800000	210.760000	213.620000	25.060000	44.580000	7.040000	29.500000	38.200000	56.900000	44.560000	28.520000	50.300000	25.340000	10.000000	42.460000	26.440000	37.440000	31.180000	24.980000	45.860000	9.900000	12.700000	20.200000	8.180000	6.720000	9.020000	21.800000	10.500000	14.340000	32.420000	4.960000	18.840000
1	41.740000	72.420000	38.680000	26.980000	36.700000	67.080000	39.460000	161.000000	236.660000	236.760000	25.000000	44.160000	7.060000	29.520000	37.980000	54.660000	43.700000	27.840000	49.320000	24.920000	9.620000	51.080000	37.920000	48.220000	21.920000	10.960000	35.260000	49.860000	42.380000	69.760000	27.420000	6.780000	9.000000	52.400000	32.120000	38.760000	41.500000	41.360000	39.600000
2	40.860000	71.380000	36.680000	27.920000	34.100000	66.380000	40.800000	159.940000	229.140000	236.620000	45.420000	65.400000	49.520000	58.140000	66.860000	53.040000	42.320000	27.380000	48.680000	24.780000	10.140000	61.820000	55.340000	58.880000	26.080000	19.720000	43.560000	50.820000	43.440000	69.160000	28.360000	14.080000	11.740000	51.800000	30.160000	38.900000	41.220000	40.200000	38.420000
3	42.900000	71.460000	37.840000	27.400000	34.980000	68.740000	39.900000	142.360000	213.920000	215.600000	24.780000	44.540000	7.020000	29.360000	38.000000	53.960000	43.320000	27.880000	48.540000	25.000000	9.900000	73.560000	67.440000	70.800000	35.160000	33.600000	54.640000	10.160000	13.640000	20.160000	8.420000	6.340000	8.980000	21.200000	10.380000	13.980000	31.140000	4.920000	18.080000
4	41.380000	70.860000	37.060000	31.440000	35.840000	71.340000	44.280000	139.720000	211.040000	212.440000	24.660000	43.380000	6.660000	29.280000	36.760000	52.760000	47.080000	31.200000	48.900000	24.760000	9.260000	80.560000	77.980000	79.700000	32.280000	25.100000	47.740000	10.100000	13.240000	19.680000	8.080000	6.520000	8.740000	21.380000	10.520000	14.120000	32.240000	5.000000	18.520000
5	43.660000	71.800000	38.500000	31.340000	37.100000	78.760000	46.180000	159.000000	232.000000	233.600000	25.400000	45.100000	7.180000	29.760000	38.280000	54.940000	43.500000	28.160000	49.440000	24.740000	9.540000	91.160000	92.380000	90.660000	21.300000	10.680000	36.420000	50.560000	44.340000	70.300000	28.400000	6.020000	8.960000	51.340000	30.040000	39.420000	41.760000	41.140000	38.860000
6	44.680000	71.680000	38.620000	30.860000	39.080000	73.280000	45.360000	155.080000	231.540000	235.220000	45.100000	65.820000	48.520000	57.980000	67.940000	54.000000	43.840000	27.980000	49.220000	24.920000	9.700000	101.520000	109.120000	103.080000	25.380000	20.020000	44.460000	49.640000	43.840000	68.900000	27.860000	7.360000	9.660000	52.280000	31.080000	40.480000	42.080000	40.840000	40.580000

日付の周期性を調べる

# Mean sales of every product per day of the month. The target columns are
# selected before aggregating so that non-numeric columns such as 'date'
# are never passed to mean().
train_df.groupby('day')[target_columns].mean().style.background_gradient()
# There may be distinctive patterns at the beginning or end of the month

	ice1	ice2	ice3	oden1	oden2	oden3	oden4	hot1	hot2	hot3	dessert1	dessert2	dessert3	dessert4	dessert5	drink1	drink2	drink3	drink4	drink5	drink6	alcol1	alcol2	alcol3	snack1	snack2	snack3	bento1	bento2	bento3	bento4	tild1	tild2	men1	men2	men3	men4	men5	men6
day
1	54.454545	78.545455	40.727273	29.545455	32.454545	69.454545	43.272727	146.909091	214.818182	222.363636	28.727273	47.272727	14.090909	33.090909	41.000000	65.909091	49.454545	32.090909	55.181818	24.909091	9.000000	74.272727	70.909091	71.454545	27.090909	19.818182	41.545455	31.454545	29.909091	47.636364	19.636364	6.272727	9.000000	35.818182	20.909091	26.454545	37.727273	22.818182	28.181818
2	64.090909	74.818182	43.454545	30.545455	35.727273	67.636364	50.454545	142.000000	228.000000	228.636364	31.000000	50.454545	20.636364	39.181818	47.090909	57.272727	45.272727	29.000000	48.727273	24.636364	8.727273	76.000000	70.000000	74.636364	26.181818	19.181818	40.727273	39.454545	37.272727	51.181818	23.000000	6.818182	9.181818	43.727273	24.727273	31.909091	40.727273	30.909091	32.727273
3	48.090909	74.909091	39.363636	27.636364	47.727273	65.545455	41.636364	143.545455	220.454545	219.636364	30.363636	50.181818	19.636364	39.000000	46.181818	58.272727	45.636364	29.636364	50.909091	24.818182	9.181818	74.272727	70.636364	72.000000	26.454545	22.000000	46.636364	30.454545	29.363636	44.909091	18.454545	5.727273	9.090909	37.090909	20.454545	27.181818	34.181818	23.636364	26.636364
4	40.000000	75.090909	37.090909	22.727273	28.909091	57.636364	35.818182	149.272727	228.727273	225.909091	26.727273	44.272727	13.545455	33.909091	40.272727	58.545455	46.181818	29.181818	50.545455	25.090909	9.181818	66.363636	59.727273	64.272727	28.545455	21.454545	43.181818	26.727273	26.454545	44.454545	17.000000	8.636364	9.909091	37.636364	20.727273	28.181818	36.909091	24.636364	30.000000
5	43.181818	75.636364	39.454545	25.636364	29.636364	63.272727	36.272727	159.636364	227.000000	231.818182	27.636364	45.090909	16.363636	35.636364	44.000000	58.454545	48.181818	28.818182	49.363636	24.545455	9.363636	68.818182	60.363636	68.363636	25.363636	18.363636	42.363636	39.454545	35.181818	56.454545	24.090909	8.545455	9.818182	45.272727	25.363636	33.636364	37.909091	32.363636	36.636364
6	39.090909	71.090909	38.454545	24.000000	27.909091	64.818182	36.909091	156.818182	225.545455	229.727273	30.272727	49.363636	24.090909	41.272727	46.363636	51.272727	42.000000	27.000000	50.727273	26.090909	12.454545	74.727273	75.363636	75.090909	27.818182	21.818182	42.545455	36.181818	32.181818	51.636364	21.909091	9.000000	9.818182	41.818182	21.909091	30.181818	37.272727	26.727273	31.727273
7	33.727273	72.636364	35.636364	27.000000	34.181818	68.909091	39.727273	151.545455	218.818182	226.545455	24.818182	43.090909	12.181818	33.454545	41.818182	54.363636	45.000000	28.727273	50.545455	25.090909	9.636364	69.363636	65.272727	68.000000	29.090909	24.090909	50.636364	25.545455	23.818182	39.090909	15.909091	5.909091	8.454545	30.909091	17.454545	21.636364	32.818182	16.818182	22.090909
8	35.636364	67.727273	34.818182	31.454545	38.000000	73.363636	40.727273	144.545455	221.181818	221.454545	24.909091	44.454545	13.272727	32.272727	41.181818	51.545455	45.272727	28.909091	46.818182	24.545455	9.545455	73.727273	69.000000	72.545455	27.363636	19.909091	41.090909	32.454545	31.000000	45.090909	17.272727	8.454545	10.363636	37.545455	21.909091	28.545455	39.363636	24.727273	34.090909
9	42.727273	69.454545	38.181818	35.454545	41.909091	83.454545	46.909091	142.181818	222.000000	227.000000	27.636364	46.000000	19.727273	38.272727	46.454545	53.909091	44.090909	27.545455	50.909091	26.909091	12.636364	76.272727	74.454545	74.545455	27.000000	19.545455	42.363636	38.818182	36.363636	56.636364	21.181818	10.000000	10.545455	46.272727	24.636364	33.636364	44.454545	33.272727	37.000000
10	44.000000	68.545455	37.454545	34.818182	40.818182	77.000000	44.363636	138.818182	218.181818	214.181818	27.909091	48.363636	18.909091	36.909091	46.090909	52.363636	42.727273	26.363636	49.545455	26.909091	12.909091	76.090909	76.363636	80.909091	27.909091	22.454545	46.909091	31.363636	29.545455	48.363636	19.454545	7.181818	9.545455	39.363636	21.727273	29.090909	35.545455	24.818182	28.818182
11	39.500000	67.666667	36.000000	32.916667	40.000000	75.666667	47.250000	143.166667	220.750000	217.750000	26.000000	43.416667	15.583333	34.750000	43.333333	48.916667	40.416667	29.666667	48.000000	25.583333	12.000000	68.333333	60.333333	63.333333	28.833333	19.916667	45.166667	30.083333	28.500000	42.750000	18.000000	7.833333	9.250000	36.833333	20.750000	27.666667	38.750000	24.000000	28.750000
12	29.166667	68.833333	33.416667	27.916667	45.083333	80.750000	39.666667	158.083333	227.416667	229.916667	25.500000	45.083333	14.833333	35.750000	43.500000	53.333333	41.500000	27.500000	48.083333	24.583333	10.000000	69.750000	61.083333	64.083333	28.583333	18.750000	41.333333	37.083333	33.833333	53.416667	20.250000	8.083333	9.916667	40.416667	24.250000	31.416667	39.166667	29.583333	32.583333
13	41.166667	69.166667	36.166667	27.166667	33.083333	69.083333	40.500000	158.166667	226.583333	226.750000	28.250000	49.166667	22.083333	39.416667	46.416667	51.250000	42.833333	26.833333	48.166667	24.750000	9.666667	76.083333	71.750000	72.083333	28.083333	21.583333	47.666667	35.583333	28.916667	48.083333	18.500000	7.666667	10.166667	40.000000	22.583333	28.666667	37.666667	25.666667	31.083333
14	47.833333	69.333333	41.166667	33.416667	41.666667	79.916667	48.750000	148.666667	217.666667	225.583333	23.583333	44.000000	12.000000	32.166667	40.416667	51.250000	41.500000	27.833333	47.666667	25.083333	9.000000	70.500000	63.583333	68.083333	27.750000	23.083333	51.000000	26.750000	24.250000	39.250000	16.333333	8.750000	10.583333	34.916667	19.250000	25.166667	37.750000	19.833333	27.583333
15	40.416667	67.083333	37.416667	36.250000	43.750000	78.000000	51.000000	149.916667	225.000000	230.500000	25.083333	47.666667	15.583333	34.166667	41.500000	48.083333	40.000000	25.500000	45.083333	25.166667	10.916667	76.666667	74.500000	74.916667	28.833333	20.083333	42.833333	35.333333	29.083333	49.166667	19.333333	5.666667	8.250000	38.083333	22.750000	28.916667	36.583333	25.750000	27.083333
16	39.333333	68.500000	38.333333	33.333333	38.083333	82.500000	51.000000	147.333333	223.750000	225.916667	42.833333	67.500000	30.666667	49.500000	68.166667	52.750000	41.500000	26.666667	47.250000	24.916667	9.416667	70.916667	67.250000	71.583333	28.166667	20.083333	39.666667	37.083333	33.250000	51.000000	21.500000	9.250000	10.333333	42.000000	25.916667	32.583333	42.166667	30.250000	35.333333
17	40.583333	70.916667	39.083333	32.750000	37.416667	77.000000	45.333333	152.333333	219.166667	222.750000	40.083333	62.000000	26.583333	44.416667	53.416667	51.666667	42.833333	26.416667	47.250000	24.000000	8.916667	70.166667	67.750000	70.416667	29.333333	20.666667	45.083333	32.583333	31.416667	50.166667	19.833333	6.750000	8.583333	38.000000	22.083333	26.666667	36.500000	24.166667	29.166667
18	39.416667	68.333333	36.916667	31.000000	36.166667	69.833333	45.250000	151.083333	220.500000	229.583333	35.583333	56.000000	20.083333	38.416667	46.416667	47.166667	40.083333	25.250000	44.833333	24.666667	9.250000	66.166667	58.166667	61.083333	27.750000	20.250000	45.250000	30.416667	27.083333	45.416667	18.250000	9.083333	10.083333	39.333333	21.083333	28.583333	37.250000	24.500000	31.333333
19	38.916667	72.583333	36.416667	26.250000	35.166667	68.000000	41.833333	160.416667	230.750000	231.000000	33.666667	50.500000	18.666667	36.083333	44.166667	53.166667	44.083333	28.166667	48.333333	24.416667	8.666667	67.083333	59.083333	65.333333	26.500000	20.000000	41.916667	36.083333	31.416667	53.000000	22.416667	8.416667	9.916667	40.750000	23.666667	32.000000	38.416667	29.916667	31.500000
20	33.166667	71.666667	34.666667	25.750000	33.166667	71.166667	40.500000	157.583333	218.916667	232.083333	34.916667	52.250000	23.083333	40.833333	47.916667	48.666667	41.833333	26.750000	47.916667	24.083333	9.666667	72.583333	69.916667	70.083333	26.750000	21.583333	44.750000	33.750000	31.000000	48.916667	19.250000	6.833333	8.666667	38.000000	21.750000	27.166667	35.333333	25.333333	27.500000
21	44.500000	73.750000	38.833333	26.583333	34.416667	69.166667	40.750000	158.250000	228.000000	225.000000	28.666667	46.833333	13.000000	33.833333	41.250000	56.583333	43.083333	31.083333	50.750000	24.583333	8.833333	70.583333	64.583333	70.083333	28.750000	21.916667	47.083333	27.416667	26.333333	39.083333	17.083333	6.416667	9.166667	33.750000	18.250000	24.416667	35.500000	19.583333	25.500000
22	52.000000	77.083333	39.250000	26.083333	34.000000	69.416667	41.750000	155.000000	230.833333	221.833333	28.416667	47.250000	15.750000	34.083333	41.333333	60.916667	47.583333	33.333333	51.916667	24.416667	8.416667	73.500000	68.750000	72.833333	26.000000	19.166667	41.250000	32.083333	32.083333	50.333333	19.666667	8.583333	9.250000	40.500000	23.416667	29.500000	37.750000	26.500000	32.750000
23	80.416667	69.750000	46.583333	30.416667	35.583333	69.250000	43.250000	149.083333	223.250000	224.916667	30.916667	47.500000	20.416667	37.583333	45.000000	47.083333	41.500000	26.166667	45.583333	24.333333	9.500000	73.750000	68.833333	73.333333	25.916667	19.416667	42.250000	36.750000	34.083333	54.666667	21.666667	9.833333	10.416667	43.166667	24.666667	32.416667	39.666667	29.500000	37.000000
24	42.500000	71.416667	38.083333	35.250000	39.416667	78.500000	48.333333	151.750000	218.500000	233.083333	29.500000	49.166667	18.916667	37.083333	44.583333	51.333333	41.833333	26.750000	49.416667	24.416667	9.416667	72.416667	65.166667	67.833333	29.083333	21.333333	43.500000	34.000000	31.333333	49.583333	20.416667	6.000000	9.000000	36.000000	23.500000	28.916667	33.583333	22.583333	28.750000
25	49.916667	74.250000	41.500000	31.333333	37.583333	69.666667	40.916667	145.916667	221.000000	225.166667	27.250000	45.166667	16.000000	35.333333	43.750000	57.333333	45.916667	28.583333	50.416667	25.166667	9.083333	66.583333	58.583333	62.000000	26.583333	20.333333	43.500000	29.750000	27.750000	43.833333	18.416667	6.500000	8.583333	35.500000	20.916667	25.166667	34.416667	22.416667	27.666667
26	45.500000	71.833333	38.750000	25.666667	35.333333	63.583333	37.666667	155.000000	228.500000	232.666667	42.000000	65.916667	27.666667	46.416667	65.666667	54.583333	43.916667	28.083333	48.500000	24.333333	8.916667	70.166667	61.250000	67.666667	24.500000	19.166667	40.833333	35.833333	34.000000	54.000000	21.666667	8.500000	9.583333	41.250000	23.416667	30.000000	40.333333	29.500000	33.916667
27	47.818182	73.000000	41.000000	23.818182	31.909091	60.090909	37.272727	163.000000	229.181818	229.727273	38.909091	64.000000	27.636364	43.909091	55.272727	54.818182	44.272727	27.454545	48.909091	24.818182	9.363636	74.000000	74.545455	75.363636	31.090909	22.818182	45.545455	31.272727	30.363636	50.545455	19.090909	7.181818	9.090909	38.000000	21.181818	26.454545	35.272727	23.545455	29.454545
28	32.545455	71.636364	34.272727	29.090909	34.454545	67.000000	40.636364	155.363636	224.272727	223.636364	34.000000	56.000000	17.818182	36.636364	44.090909	55.454545	43.909091	32.272727	50.636364	25.454545	10.545455	73.818182	67.363636	69.636364	28.090909	21.000000	48.272727	28.818182	27.545455	44.272727	17.000000	7.363636	9.181818	36.636364	18.909091	24.636364	37.909091	22.000000	28.454545
29	39.900000	76.000000	37.400000	23.500000	28.500000	62.000000	34.300000	152.500000	230.900000	234.000000	34.300000	53.100000	20.000000	38.500000	45.600000	61.200000	48.900000	29.300000	51.800000	24.600000	9.200000	75.300000	71.800000	71.500000	26.800000	18.600000	40.700000	38.500000	36.200000	54.100000	21.600000	6.800000	8.400000	43.900000	25.000000	30.700000	37.400000	28.800000	30.500000
30	48.400000	79.300000	40.100000	27.300000	31.100000	61.300000	37.500000	155.000000	223.600000	225.800000	34.000000	53.600000	24.400000	39.100000	48.500000	63.500000	50.200000	32.700000	53.900000	24.900000	9.000000	70.700000	63.100000	68.400000	27.300000	21.100000	41.500000	35.400000	31.700000	50.500000	20.300000	9.100000	9.700000	39.900000	22.900000	31.500000	38.200000	26.100000	32.700000
31	69.166667	75.333333	46.666667	38.500000	39.333333	80.833333	52.166667	139.833333	213.500000	212.333333	29.833333	45.833333	13.000000	34.000000	40.000000	60.500000	49.666667	29.333333	50.333333	25.333333	9.666667	61.666667	53.500000	61.333333	29.166667	25.166667	48.000000	22.333333	21.500000	34.500000	15.333333	6.666667	8.166667	29.833333	17.166667	18.333333	33.500000	15.666667	23.166667

テストデータに似ている訓練データを探す

# Adversarial Validation
# Fit a model that tries to tell training rows (target=0) from test rows
# (target=1) using only the input features, then list the training rows it
# scores as most test-like.
train = preprocessing(pd.read_csv(TRAIN_FILE).drop(target_columns, axis=1), mode='train')
test = preprocessing(pd.read_csv(TEST_FILE), mode='test')

train['target'] = 0
test['target'] = 1
all_df = pd.concat([train, test], axis=0)
target = all_df['target'].values

# Columns that are identifiers, raw dates or the label itself, not features.
non_feature_cols = ['id', 'date', 'time', 'year', 'month', 'target']
train_, test_ = train_test_split(all_df, test_size=0.33, random_state=42, shuffle=True)
train_x, train_y = train_.drop(columns=non_feature_cols), train_['target'].values
test_x, test_y = test_.drop(columns=non_feature_cols), test_['target'].values

lgb_train = lgb.Dataset(train_x, label=train_y)
lgb_test = lgb.Dataset(test_x, label=test_y)

# NOTE(review): no 'objective' is given, so LightGBM's default objective is
# used for this 0/1 label - confirm that this is intended.
param = dict(metric='auc', seed=seed, verbosity=-1)

num_round = 100
clf = lgb.train(param, lgb_train, num_round, valid_sets = [lgb_test], verbose_eval=50, early_stopping_rounds = 50)

# Feature importances, ranked from most to least important.
feature_imp = pd.DataFrame(sorted(zip(clf.feature_importance(), train_x.columns)), columns=['Value','Feature'])
ranked_imp = feature_imp.sort_values(by="Value", ascending=False)
plt.figure(figsize=(6, 5))
sns.barplot(x="Value", y="Feature", data=ranked_imp.head(500))
plt.title('LightGBM Features')
plt.tight_layout()
plt.show()

display(ranked_imp)

# Top 30 scores on the fitting split and the row positions they belong to.
train_pred = clf.predict(train_x)
print(np.sort(train_pred)[::-1][:30])
print(np.argsort(train_pred)[::-1][:30])

# No training period stands out as particularly similar to the test data

Training until validation scores don't improve for 50 rounds.
[50]	valid_0's auc: 0.819469
Early stopping, best iteration is:
[14]	valid_0's auc: 0.891593

	Value	Feature
4	24	lowest
3	22	highest
2	17	day
1	5	weekday
0	1	rain

[0.24907411 0.24907411 0.24907411 0.24297665 0.24297665 0.24297665
 0.24054652 0.23205524 0.23205524 0.23205524 0.23205524 0.23205524
 0.23205524 0.23205524 0.22962511 0.2220307  0.21542067 0.17226214
 0.1697324  0.1697324  0.1697324  0.1697324  0.16401879 0.15614695
 0.15614695 0.15309783 0.15309783 0.14594867 0.13259713 0.13259713]
[178 240 151  79   5 138  54 123 184 193  92 172 216 133  89  42  72  74
 119 229 162  29 171  70 209  69 201 205 112 152]

MODEL

# Data
# Reload the raw CSV files so modelling starts from unmodified frames.
train, test, sub = (pd.read_csv(path) for path in (TRAIN_FILE, TEST_FILE, SUB_FILE))

FE

def preprocessing(df, mode='train', input_year=2018):
    """Add calendar features derived from the ``date`` column ('month/day').

    The data carries no year, so one is assigned: in ``'train'`` mode, months
    after March get ``input_year`` and January-March get ``input_year + 1``;
    in any other mode every row gets ``input_year + 1``.

    Args:
        df: DataFrame with a ``date`` column of 'month/day' strings. It is
            not modified; a copy is returned.
        mode: ``'train'`` for the year-spanning training period; any other
            value is treated as test data.
        input_year: Calendar year in which the training period starts.

    Returns:
        A copy of ``df`` with the columns ``time`` (full timestamp), ``year``,
        ``month``, ``day`` and ``weekday`` (Monday=0) appended, in that order.

    Raises:
        ValueError: If a date cannot be parsed, or does not exist in the
            year assigned to it.
    """
    df_tmp = df.copy()

    # Parse against a leap year so that a '2/29' row is accepted; only the
    # month and day of this intermediate timestamp are used.
    parsed = pd.to_datetime('2000/' + df_tmp['date'], format='%Y/%m/%d')

    # Columns are created in this exact order because the training loop
    # derives the feature order from the column order.
    df_tmp['time'] = parsed
    df_tmp['year'] = input_year + 1
    df_tmp['month'] = parsed.dt.month
    df_tmp['day'] = parsed.dt.day
    if mode == 'train':
        # Training data starts in April of input_year and runs into the next year.
        df_tmp.loc[df_tmp['month'] > 3, 'year'] = input_year

    # Rebuild the timestamp now that the real year is known.
    df_tmp['time'] = pd.to_datetime(
        {'year': df_tmp['year'], 'month': df_tmp['month'], 'day': df_tmp['day']}
    )
    df_tmp['weekday'] = df_tmp['time'].dt.weekday
    return df_tmp

# Add the calendar features to both frames, then show the resulting
# training columns.
train, test = preprocessing(train, mode='train'), preprocessing(test, mode='test')

train.columns

Index(['id', 'date', 'highest', 'lowest', 'rain', 'ice1', 'ice2', 'ice3',
       'oden1', 'oden2', 'oden3', 'oden4', 'hot1', 'hot2', 'hot3', 'dessert1',
       'dessert2', 'dessert3', 'dessert4', 'dessert5', 'drink1', 'drink2',
       'drink3', 'drink4', 'drink5', 'drink6', 'alcol1', 'alcol2', 'alcol3',
       'snack1', 'snack2', 'snack3', 'bento1', 'bento2', 'bento3', 'bento4',
       'tild1', 'tild2', 'men1', 'men2', 'men3', 'men4', 'men5', 'men6',
       'time', 'year', 'month', 'day', 'weekday'],
      dtype='object')

Run

# Ids of the training rows that are held out as the validation set
valid_index = range(297,351) # month:2,3

# Quantiles predicted for every product
qs = np.array([0.01, 0.1, 0.5, 0.9, 0.99])

# Weekday indicator features added per product group, keyed by the
# column-name prefix: {feature name: weekdays for which the flag is 1}.
_WDAY_FLAGS_BY_GROUP = {
    'ice': {'is_wday0': [0]},
    'oden': {'is_wday23': [2, 3]},
    'hot': {'is_wday034': [0, 3, 4]},
    'dessert': {'is_wday26': [2, 6]},
    'snack': {'is_wday4': [4], 'is_wday15': [1, 5]},
    'bento': {'is_wday034': [0, 3, 4]},
    'tild': {'is_wday2': [2]},
    'men': {'is_wday034': [0, 3, 4]},
}
# Drinks get their flag per individual product; products listed in neither
# table (drink6 and the alcol products) get no extra feature.
_WDAY_FLAGS_BY_PRODUCT = {
    'drink1': {'is_wday0': [0]},
    'drink4': {'is_wday0': [0]},
    'drink5': {'is_wday0': [0]},
    'drink2': {'is_wday4': [4]},
    'drink3': {'is_wday4': [4]},
}


def _make_product_frames(product):
    """Return (train_tmp, test_tmp) copies tailored to one product column."""
    train_tmp = train.copy()
    test_tmp = test.copy()
    group = product.rstrip('0123456789')

    # Drop the months excluded from training for this product group.
    if group == 'ice':
        train_tmp = train_tmp[~train_tmp['month'].isin([7, 8, 9])].copy()
    elif group == 'oden':
        keep = (train_tmp['month'].isin([10, 11, 12, 1, 2, 3])
                | train_tmp['id'].isin(valid_index))
        train_tmp = train_tmp[keep].copy()

    flags = _WDAY_FLAGS_BY_PRODUCT.get(product) or _WDAY_FLAGS_BY_GROUP.get(group, {})
    for flag_name, weekdays in flags.items():
        train_tmp[flag_name] = train_tmp['weekday'].isin(weekdays).astype(int)
        test_tmp[flag_name] = test_tmp['weekday'].isin(weekdays).astype(int)
    return train_tmp, test_tmp


# Dictionary that stores the test predictions, keyed by (product, quantile)
results = dict({})
# Validation scores: one list of per-quantile scores for each product
all_lgb_score = []
# Build the predictions product by product
for c in tqdm(target_columns):
    # Generate features that reflect the characteristics of this product
    train_tmp, test_tmp = _make_product_frames(c)

    feature_columns = [col for col in train_tmp.columns
                       if col not in target_columns and col not in ('id', 'date', 'time')]

    is_valid = train_tmp['id'].isin(valid_index)
    x_train = train_tmp.loc[~is_valid, feature_columns]
    y_train = train_tmp.loc[~is_valid, c]
    x_valid = train_tmp.loc[is_valid, feature_columns]
    y_valid = train_tmp.loc[is_valid, c]
    x_test = test_tmp[feature_columns]

    lgb_scores = []
    # Fit one model per quantile. Each model is trained once and used for both
    # the validation score and the test prediction. The variable is named
    # `model` so that the imported `lgb` module is not overwritten.
    for q in qs:
        model = LGBMRegressor(
            objective='quantile',
            alpha=q,
            n_estimators=10000,
            max_depth=2,
            colsample_bytree=0.9,
            random_state=seed)
        # NOTE(review): the early_stopping_rounds/verbose fit arguments were
        # removed in LightGBM 4.0 - keep in sync with the installed version.
        model.fit(x_train, y_train, eval_set=(x_valid, y_valid), early_stopping_rounds=100, verbose=False)
        # Best validation score of this model
        lgb_scores.append(model.best_score_['valid_0']['quantile'])
        # Store the test prediction
        results[(c, q)] = model.predict(x_test)
    all_lgb_score.append(lgb_scores)
# Score
print(np.array(all_lgb_score).mean())

  0%|          | 0/39 [00:00<?, ?it/s]

0.8462863293289724

Submit

# Turn the predictions into one column per "<product>_<quantile>", rounded to
# whole numbers. The frame is built directly from the float arrays, without
# a float -> string -> float round trip, and covers every predicted row
# instead of a hard-coded 21.
submit_df = pd.DataFrame({f'{k[0]}_{k[1]}': np.round(v) for k, v in results.items()})

# Write the test predictions; rows are numbered from 1 in an 'id' index
submit_df.index = pd.RangeIndex(1, len(submit_df) + 1, name='id')
submit_df.to_csv(os.path.join(OUTPUT, "sub_exp00.csv"))
display(submit_df.head())

	ice1_0.01	ice1_0.1	ice1_0.5	ice1_0.9	ice1_0.99	ice2_0.01	ice2_0.1	ice2_0.5	ice2_0.9	ice2_0.99	ice3_0.01	ice3_0.1	ice3_0.5	ice3_0.9	ice3_0.99	oden1_0.01	oden1_0.1	oden1_0.5	oden1_0.9	oden1_0.99	oden2_0.01	oden2_0.1	oden2_0.5	oden2_0.9	oden2_0.99	oden3_0.01	oden3_0.1	oden3_0.5	oden3_0.9	oden3_0.99	oden4_0.01	oden4_0.1	oden4_0.5	oden4_0.9	oden4_0.99	hot1_0.01	hot1_0.1	hot1_0.5	hot1_0.9	hot1_0.99	hot2_0.01	hot2_0.1	hot2_0.5	hot2_0.9	hot2_0.99	hot3_0.01	hot3_0.1	hot3_0.5	hot3_0.9	hot3_0.99	dessert1_0.01	dessert1_0.1	dessert1_0.5	dessert1_0.9	dessert1_0.99	dessert2_0.01	dessert2_0.1	dessert2_0.5	dessert2_0.9	dessert2_0.99	dessert3_0.01	dessert3_0.1	dessert3_0.5	dessert3_0.9	dessert3_0.99	dessert4_0.01	dessert4_0.1	dessert4_0.5	dessert4_0.9	dessert4_0.99	dessert5_0.01	dessert5_0.1	dessert5_0.5	dessert5_0.9	dessert5_0.99	drink1_0.01	drink1_0.1	drink1_0.5	drink1_0.9	drink1_0.99	drink2_0.01	drink2_0.1	drink2_0.5	drink2_0.9	drink2_0.99	drink3_0.01	drink3_0.1	drink3_0.5	drink3_0.9	drink3_0.99	drink4_0.01	drink4_0.1	drink4_0.5	drink4_0.9	drink4_0.99	drink5_0.01	drink5_0.1	drink5_0.5	drink5_0.9	drink5_0.99	drink6_0.01	drink6_0.1	drink6_0.5	drink6_0.9	drink6_0.99	alcol1_0.01	alcol1_0.1	alcol1_0.5	alcol1_0.9	alcol1_0.99	alcol2_0.01	alcol2_0.1	alcol2_0.5	alcol2_0.9	alcol2_0.99	alcol3_0.01	alcol3_0.1	alcol3_0.5	alcol3_0.9	alcol3_0.99	snack1_0.01	snack1_0.1	snack1_0.5	snack1_0.9	snack1_0.99	snack2_0.01	snack2_0.1	snack2_0.5	snack2_0.9	snack2_0.99	snack3_0.01	snack3_0.1	snack3_0.5	snack3_0.9	snack3_0.99	bento1_0.01	bento1_0.1	bento1_0.5	bento1_0.9	bento1_0.99	bento2_0.01	bento2_0.1	bento2_0.5	bento2_0.9	bento2_0.99	bento3_0.01	bento3_0.1	bento3_0.5	bento3_0.9	bento3_0.99	bento4_0.01	bento4_0.1	bento4_0.5	bento4_0.9	bento4_0.99	tild1_0.01	tild1_0.1	tild1_0.5	tild1_0.9	tild1_0.99	tild2_0.01	tild2_0.1	tild2_0.5	tild2_0.9	tild2_0.99	men1_0.01	men1_0.1	men1_0.5	men1_0.9	men1_0.99	men2_0.01	men2_0.1	men2_0.5	men2_0.9	men2_0.99	men3_0.01	men3_0.1	men3_0.5	men3_0.9	men3_0.99	men4_0.01	
men4_0.1	men4_0.5	men4_0.9	men4_0.99	men5_0.01	men5_0.1	men5_0.5	men5_0.9	men5_0.99	men6_0.01	men6_0.1	men6_0.5	men6_0.9	men6_0.99
id
1	16.0	19.0	20.0	21.0	28.0	51.0	60.0	60.0	63.0	90.0	24.0	26.0	30.0	33.0	38.0	10.0	34.0	45.0	52.0	103.0	46.0	45.0	59.0	58.0	129.0	82.0	91.0	108.0	150.0	249.0	55.0	59.0	63.0	68.0	147.0	88.0	139.0	158.0	174.0	217.0	178.0	216.0	219.0	234.0	269.0	214.0	223.0	236.0	257.0	281.0	20.0	49.0	53.0	53.0	57.0	46.0	67.0	78.0	81.0	78.0	11.0	15.0	55.0	62.0	58.0	25.0	53.0	62.0	65.0	66.0	56.0	67.0	73.0	81.0	87.0	28.0	29.0	36.0	40.0	68.0	18.0	28.0	29.0	34.0	49.0	12.0	18.0	22.0	23.0	33.0	30.0	36.0	37.0	40.0	49.0	22.0	22.0	24.0	26.0	30.0	7.0	7.0	8.0	8.0	16.0	54.0	56.0	61.0	64.0	76.0	31.0	45.0	47.0	52.0	75.0	52.0	54.0	57.0	61.0	66.0	18.0	22.0	27.0	32.0	40.0	9.0	17.0	21.0	24.0	37.0	26.0	37.0	43.0	53.0	59.0	23.0	44.0	51.0	54.0	74.0	31.0	37.0	43.0	48.0	49.0	15.0	62.0	70.0	78.0	81.0	8.0	24.0	28.0	30.0	33.0	4.0	11.0	13.0	20.0	21.0	8.0	10.0	11.0	16.0	18.0	26.0	43.0	50.0	63.0	66.0	21.0	24.0	30.0	34.0	35.0	27.0	31.0	37.0	45.0	54.0	26.0	33.0	40.0	48.0	59.0	30.0	32.0	41.0	46.0	51.0	13.0	31.0	37.0	51.0	61.0
2	16.0	18.0	20.0	21.0	29.0	51.0	54.0	57.0	58.0	90.0	24.0	26.0	30.0	32.0	38.0	10.0	14.0	25.0	48.0	103.0	27.0	27.0	37.0	50.0	129.0	63.0	65.0	69.0	143.0	249.0	37.0	41.0	54.0	56.0	147.0	88.0	138.0	147.0	159.0	212.0	178.0	211.0	225.0	228.0	256.0	202.0	206.0	222.0	238.0	281.0	20.0	29.0	31.0	33.0	39.0	37.0	49.0	54.0	55.0	66.0	5.0	5.0	10.0	10.0	18.0	25.0	28.0	31.0	35.0	48.0	32.0	38.0	38.0	46.0	65.0	24.0	23.0	27.0	30.0	69.0	18.0	26.0	28.0	29.0	49.0	12.0	15.0	20.0	25.0	33.0	30.0	30.0	33.0	39.0	49.0	22.0	22.0	23.0	26.0	30.0	7.0	7.0	8.0	8.0	16.0	64.0	66.0	73.0	76.0	86.0	31.0	57.0	60.0	67.0	77.0	62.0	63.0	69.0	78.0	90.0	18.0	29.0	35.0	38.0	40.0	9.0	27.0	32.0	36.0	37.0	26.0	43.0	52.0	60.0	65.0	8.0	9.0	10.0	12.0	31.0	11.0	12.0	14.0	18.0	22.0	15.0	17.0	21.0	23.0	24.0	6.0	7.0	8.0	10.0	14.0	4.0	4.0	5.0	15.0	16.0	7.0	7.0	9.0	13.0	18.0	16.0	18.0	21.0	30.0	47.0	8.0	8.0	10.0	15.0	16.0	11.0	11.0	14.0	22.0	54.0	24.0	26.0	30.0	41.0	59.0	3.0	4.0	4.0	11.0	16.0	13.0	14.0	16.0	36.0	61.0
3	14.0	15.0	16.0	18.0	29.0	25.0	34.0	38.0	39.0	90.0	24.0	25.0	29.0	32.0	38.0	10.0	48.0	49.0	54.0	103.0	53.0	63.0	56.0	60.0	129.0	125.0	129.0	112.0	184.0	372.0	68.0	70.0	69.0	80.0	147.0	84.0	125.0	133.0	149.0	212.0	178.0	198.0	208.0	220.0	256.0	195.0	206.0	220.0	232.0	257.0	20.0	25.0	28.0	31.0	38.0	37.0	43.0	47.0	51.0	66.0	5.0	5.0	7.0	8.0	18.0	25.0	27.0	28.0	33.0	48.0	32.0	34.0	34.0	46.0	65.0	16.0	18.0	19.0	22.0	70.0	17.0	22.0	23.0	28.0	54.0	11.0	14.0	15.0	39.0	53.0	27.0	28.0	30.0	33.0	49.0	22.0	23.0	24.0	30.0	33.0	7.0	8.0	9.0	19.0	22.0	73.0	74.0	79.0	85.0	89.0	31.0	69.0	72.0	81.0	83.0	70.0	74.0	78.0	84.0	90.0	20.0	27.0	30.0	37.0	44.0	9.0	22.0	25.0	27.0	37.0	26.0	44.0	49.0	60.0	65.0	8.0	9.0	10.0	12.0	28.0	11.0	12.0	14.0	18.0	22.0	16.0	16.0	21.0	23.0	22.0	6.0	7.0	8.0	10.0	14.0	4.0	4.0	5.0	15.0	20.0	6.0	7.0	9.0	13.0	18.0	17.0	18.0	21.0	30.0	57.0	9.0	9.0	10.0	14.0	16.0	11.0	11.0	14.0	22.0	54.0	24.0	27.0	30.0	41.0	59.0	3.0	4.0	4.0	8.0	16.0	13.0	14.0	16.0	34.0	61.0
4	16.0	16.0	17.0	19.0	29.0	39.0	41.0	42.0	43.0	90.0	24.0	25.0	28.0	32.0	38.0	10.0	38.0	44.0	54.0	103.0	51.0	50.0	57.0	60.0	129.0	97.0	99.0	112.0	176.0	250.0	60.0	64.0	67.0	78.0	147.0	88.0	132.0	153.0	165.0	216.0	178.0	215.0	224.0	231.0	268.0	208.0	225.0	238.0	251.0	257.0	20.0	25.0	28.0	32.0	39.0	37.0	43.0	47.0	51.0	67.0	5.0	5.0	7.0	8.0	18.0	25.0	27.0	30.0	33.0	48.0	32.0	34.0	35.0	46.0	65.0	14.0	14.0	14.0	21.0	69.0	18.0	19.0	19.0	25.0	56.0	11.0	11.0	11.0	16.0	29.0	27.0	28.0	30.0	33.0	48.0	23.0	24.0	25.0	27.0	30.0	8.0	10.0	11.0	11.0	16.0	73.0	83.0	89.0	98.0	98.0	31.0	86.0	92.0	103.0	110.0	70.0	84.0	90.0	98.0	98.0	17.0	17.0	22.0	28.0	42.0	9.0	10.0	11.0	13.0	37.0	26.0	32.0	35.0	45.0	65.0	23.0	48.0	51.0	54.0	71.0	35.0	37.0	45.0	49.0	56.0	15.0	61.0	71.0	75.0	77.0	8.0	26.0	28.0	31.0	33.0	4.0	4.0	5.0	9.0	18.0	7.0	7.0	8.0	9.0	18.0	26.0	45.0	49.0	56.0	77.0	22.0	25.0	30.0	35.0	34.0	27.0	33.0	38.0	41.0	54.0	26.0	34.0	40.0	43.0	59.0	30.0	35.0	40.0	39.0	50.0	13.0	31.0	36.0	38.0	63.0
5	16.0	17.0	18.0	20.0	29.0	49.0	50.0	52.0	54.0	90.0	24.0	26.0	30.0	32.0	38.0	10.0	33.0	42.0	52.0	103.0	46.0	48.0	57.0	60.0	129.0	82.0	91.0	112.0	157.0	250.0	59.0	62.0	68.0	74.0	147.0	88.0	136.0	155.0	171.0	216.0	178.0	214.0	221.0	235.0	267.0	214.0	223.0	237.0	256.0	281.0	20.0	45.0	47.0	51.0	57.0	46.0	60.0	67.0	71.0	84.0	11.0	13.0	48.0	52.0	60.0	25.0	52.0	59.0	62.0	66.0	56.0	62.0	66.0	69.0	83.0	21.0	16.0	19.0	25.0	69.0	18.0	20.0	22.0	25.0	56.0	12.0	12.0	15.0	17.0	32.0	30.0	30.0	32.0	34.0	42.0	23.0	24.0	25.0	27.0	30.0	8.0	9.0	9.0	10.0	16.0	72.0	90.0	98.0	110.0	110.0	31.0	96.0	103.0	115.0	115.0	70.0	93.0	99.0	111.0	114.0	18.0	23.0	25.0	33.0	40.0	9.0	18.0	20.0	22.0	37.0	26.0	38.0	44.0	54.0	65.0	23.0	47.0	51.0	54.0	74.0	34.0	37.0	44.0	49.0	52.0	15.0	63.0	70.0	75.0	77.0	8.0	26.0	28.0	30.0	33.0	4.0	4.0	5.0	9.0	18.0	7.0	7.0	8.0	9.0	18.0	26.0	45.0	49.0	56.0	71.0	21.0	27.0	30.0	36.0	34.0	27.0	33.0	38.0	43.0	54.0	26.0	34.0	40.0	44.0	59.0	30.0	35.0	40.0	41.0	45.0	13.0	31.0	36.0	40.0	63.0

目視確認

# Attach the median (0.5-quantile) predictions to the test rows so they can be
# inspected side by side with the test features.
pred_median_col = [name for name in submit_df.columns if '_0.5' in name]

# submit_df carries `id` as its index; reset_index() exposes it as a column
# so it can serve as the merge key.
median_preds = submit_df[pred_median_col].reset_index()
test_add_pred = test.merge(median_preds, on='id', how='left')

# Drop the quantile suffix so the prediction columns use the plain target names.
# (replace() leaves names without the suffix untouched.)
test_add_pred.columns = [name.replace('_0.5', '') for name in test_add_pred.columns]

# Store the predictions as floats.
test_add_pred[target_columns] = test_add_pred[target_columns].astype(float)
test_add_pred.head(3)

	id	date	highest	lowest	time	year	month	day	weekday	ice1	ice2	ice3	oden1	oden2	oden3	oden4	hot1	hot2	hot3	dessert1	dessert2	dessert3	dessert4	dessert5	drink1	drink2	drink3	drink4	drink5	drink6	alcol1	alcol2	alcol3	snack1	snack2	snack3	bento1	bento2	bento3	bento4	tild1	tild2	men1	men2	men3	men4	men5	men6
0	1	3/27	19.7	7.3	2019-03-27	2019	3	27	2	20.0	60.0	30.0	45.0	59.0	108.0	63.0	158.0	219.0	236.0	53.0	78.0	55.0	62.0	73.0	36.0	29.0	22.0	37.0	24.0	8.0	61.0	47.0	57.0	27.0	21.0	43.0	51.0	43.0	70.0	28.0	13.0	11.0	50.0	30.0	37.0	40.0	41.0	37.0
1	2	3/28	16.9	9.0	2019-03-28	2019	3	28	3	20.0	57.0	30.0	25.0	37.0	69.0	54.0	147.0	225.0	222.0	31.0	54.0	10.0	31.0	38.0	27.0	28.0	20.0	33.0	23.0	8.0	73.0	60.0	69.0	35.0	32.0	52.0	10.0	14.0	21.0	8.0	5.0	9.0	21.0	10.0	14.0	30.0	4.0	16.0
2	3	3/29	9.3	6.8	2019-03-29	2019	3	29	4	16.0	38.0	29.0	49.0	56.0	112.0	69.0	133.0	208.0	220.0	28.0	47.0	7.0	28.0	34.0	19.0	23.0	15.0	30.0	24.0	9.0	79.0	72.0	78.0	30.0	25.0	49.0	10.0	14.0	21.0	8.0	5.0	9.0	21.0	10.0	14.0	30.0	4.0	16.0

# Moving average
window = 7

# Smooth only the numeric columns. The frames also hold a string column
# (`date`, e.g. '3/27') and a datetime column (`time`); a rolling mean is not
# defined for those, and newer pandas releases raise on them instead of
# silently dropping them. Selecting numeric columns up front keeps the result
# the same across pandas versions.
train_rolling = (
    train.select_dtypes(include='number')
    .rolling(window, min_periods=1)
    .mean()
)
test_rolling = (
    test_add_pred.select_dtypes(include='number')
    .rolling(window, min_periods=1)
    .mean()
)
# Train rows followed by test rows (with median predictions) on one continuous
# 0..N-1 index, so the window at the start of the test part reaches back into
# the end of the training part.
all_rolling = (
    pd.concat([train, test_add_pred], axis=0)
    .reset_index(drop=True)
    .select_dtypes(include='number')
    .rolling(window, min_periods=1)
    .mean()
)

# Visual check: for every non-calendar column, draw the smoothed training
# series (blue) and the smoothed series over the test rows (red).
plot_col = [c for c in train.columns if c not in ['id', 'date', 'time', 'year', 'month', 'day', 'weekday']]

# Grid with a fixed number of rows; use ceiling division for the column count
# so there is always at least one cell per plotted column.
nrows = 14
ncols = max(1, -(-len(plot_col) // nrows))
fig, axes = plt.subplots(nrows, ncols, sharey=True, sharex=True, figsize=(30, 80))

# x positions of the vertical guide lines. The original list contained
# `173.204`, a typo for the two separate values 173 and 204 (a comma typed as
# a dot), which dropped one guide line and misplaced another.
# NOTE(review): the spacing (31, 30, 31, ...) suggests these are month
# boundaries in row-index units - confirm against the data.
guide_positions = [20, 51, 81, 112, 143, 173, 204, 234, 265, 296, 324, 350]

n_test_rows = len(test_rolling)
for ax, col in zip(axes.flat, plot_col):
    # Skip the first `window` rows of the training curve.
    ax.plot(train_rolling.index[window:], train_rolling[col][window:], alpha=1, color='blue', label='train')
    # The last `n_test_rows` rows of the combined frame are the test rows.
    ax.plot(all_rolling.index[-n_test_rows:], all_rolling[col][-n_test_rows:], alpha=1, color='red', label='test')
    for x in guide_positions:
        ax.axvline(x)
    # The column name is shown as the x label; tick marks are hidden.
    ax.set_xlabel(col)
    ax.legend()
    ax.set_xticks([])
plt.show()

Baseline

sub_base19
MODEL

MODEL = LGBMRegressor(objective='quantile')
seed = 42
year = 2018
n_estimators=10000
max_depth=2
colsample_bytree=0.9

FE

validation: months 2 and 3
ice: del month7-9, add is_wday0
oden: del month7-9, add is_wday23
hot: add is_wday034
dessert: add is_wday26
drink145: add is_wday0
drink23: add is_wday4
snack: add is_wday4, is_wday15
bento: add is_wday034
tild: add is_wday2
men: add is_wday034

Post-Processing

Convert predictions to integers

Score: 0.8462863293289724

second sub_base17

add quarter

Library & Data

EDA

MODEL

FE

Run

Submit

目視確認

Baseline

添付データ

Sard

kotrying

Sard

new user