EDA & Model

Library & Data

# Library
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from tqdm.auto import tqdm
import statsmodels
import statsmodels.api as sm

import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_pinball_loss
from lightgbm import LGBMRegressor

import warnings
warnings.simplefilter('ignore')

# Mount Google Drive (requires the Colab runtime, which provides google.colab).
# The mount call is skipped when the /content/drive directory already exists.
from google.colab import drive
if not os.path.isdir('/content/drive'):
    drive.mount('/content/drive')
Mounted at /content/drive

構成 (directory layout)

MyDrive
└<ML>
   └<PROBSPACE>
      └<weather_merchandising>
         ├<notebook>
         │ └run.ipynb
         ├<data>
         │ ├train_data.csv
         │ ├submission.csv
         │ └test_data.csv
         └<output>
# Config
# Project root on the mounted drive; inputs live under data/, results under output/.
DRIVE_PATH = "/content/drive/MyDrive/ML/PROBSPACE/weather_merchandising"
INPUT, OUTPUT = (
    os.path.join(DRIVE_PATH, folder) for folder in ("data", "output")
)

# Full paths of the three CSV files inside the input folder.
TRAIN_FILE, TEST_FILE, SUB_FILE = (
    os.path.join(INPUT, csv_name)
    for csv_name in ("train_data.csv", "test_data.csv", "submission.csv")
)

# Seed value (presumably for reproducible splits/models later in the notebook).
seed = 42

# Display / plot style
# Let pandas show up to 200 columns before it truncates wide frames.
pd.set_option('display.max_columns', 200)
# Axes background colour, written as hex RGB without a leading '#'.
# NOTE(review): confirm matplotlib resolves 'EEFFFE' to '#EEFFFE'.
plt.rcParams['axes.facecolor'] = 'EEFFFE'
# Data
# Read the train, test and submission CSVs (in that order) into DataFrames.
train, test, sub = (
    pd.read_csv(csv_path) for csv_path in (TRAIN_FILE, TEST_FILE, SUB_FILE)
)
# Target Columns
# One (prefix, count) pair per product category; each column name is the
# prefix followed by 1..count. 'alcol' and 'tild' are kept exactly as they
# are spelled in the train data header.
_TARGET_GROUPS = [
    ('ice', 3),
    ('oden', 4),
    ('hot', 3),
    ('dessert', 5),
    ('drink', 6),
    ('alcol', 3),
    ('snack', 3),
    ('bento', 4),
    ('tild', 2),
    ('men', 6),
]
target_columns = [
    f'{prefix}{number}'
    for prefix, count in _TARGET_GROUPS
    for number in range(1, count + 1)
]

EDA

def preprocessing(df, mode='train', input_year=2018):
    """Add calendar features to a frame whose ``date`` column holds 'M/D' strings.

    The raw dates carry no year, so one is assigned here:

    * ``mode='train'``: rows in April-December get ``input_year`` and rows
      in January-March get ``input_year + 1``.
    * any other ``mode``: every row gets ``input_year + 1``.

    Args:
        df: DataFrame with a ``date`` column formatted as ``%m/%d``.
        mode: ``'train'`` for the training layout; any other value is
            handled as test data.
        input_year: Year given to training rows from April onward.
            Defaults to 2018, the value that was previously hard-coded.

    Returns:
        A copy of ``df`` with ``time``, ``year``, ``month``, ``day`` and
        ``weekday`` (Monday=0) columns appended, in that order. ``df``
        itself is not modified.

    NOTE(review): a '2/29' date probably cannot be parsed, because the
    year-less parse falls back to a non-leap placeholder year - confirm
    before using this on leap-year data.
    """
    df_tmp = df.copy()

    # First pass: parse without a year purely to extract month and day.
    # The year stored here is a placeholder and is overwritten below.
    df_tmp['time'] = pd.to_datetime(df_tmp['date'], format='%m/%d')
    df_tmp['year'] = df_tmp['time'].dt.year
    df_tmp['month'] = df_tmp['time'].dt.month
    df_tmp['day'] = df_tmp['time'].dt.day

    if mode == 'train':
        # Training data spans April of input_year through March of the next year.
        df_tmp.loc[df_tmp['month'] > 3, 'year'] = input_year
        df_tmp.loc[df_tmp['month'] <= 3, 'year'] = input_year + 1
    else:
        df_tmp['year'] = input_year + 1

    # Second pass: rebuild the timestamp now that the real year is known.
    df_tmp['time'] = pd.to_datetime(
        {'year': df_tmp['year'], 'month': df_tmp['month'], 'day': df_tmp['day']}
    )
    df_tmp['weekday'] = df_tmp['time'].dt.weekday
    return df_tmp

# Derive the calendar features for both splits.
train_df = preprocessing(train, mode='train')
test_df = preprocessing(test, mode='test')

# Stack train on top of test and renumber the rows 0..n-1.
all_df = pd.concat([train_df, test_df], ignore_index=True)

# Preview the first three rows of each split.
display(train_df.head(3))
display(test_df.head(3))
id date highest lowest rain ice1 ice2 ice3 oden1 oden2 oden3 oden4 hot1 hot2 hot3 dessert1 dessert2 dessert3 dessert4 dessert5 drink1 drink2 drink3 drink4 drink5 drink6 alcol1 alcol2 alcol3 snack1 snack2 snack3 bento1 bento2 bento3 bento4 tild1 tild2 men1 men2 men3 men4 men5 men6 time year month day weekday
0 1 4/11 21.9 12.4 0.0 25 72 26 10 23 52 35 180 254 270 42 58 50 59 67 54 45 28 49 22 8 63 51 59 26 21 35 56 46 70 27 12 12 57 30 41 38 37 35 2018-04-11 2018 4 11 2
1 2 4/12 25.9 13.9 0.0 30 85 33 9 18 42 26 202 219 235 22 36 5 28 37 69 54 35 58 22 9 77 66 72 36 32 63 8 14 23 9 5 8 19 9 13 26 4 16 2018-04-12 2018 4 12 3
2 3 4/13 20.9 11.9 0.0 21 68 28 12 22 57 31 164 210 223 20 41 5 30 32 46 38 24 45 26 9 81 69 74 36 25 57 9 12 19 6 4 9 23 9 11 33 4 13 2018-04-13 2018 4 13 4
id date highest lowest rain time year month day weekday
0 1 3/27 19.7 7.3 0.0 2019-03-27 2019 3 27 2
1 2 3/28 16.9 9.0 0.0 2019-03-28 2019 3 28 3
2 3 3/29 9.3 6.8 0.0 2019-03-29 2019 3 29 4
# Plot every non-key column of the raw train data against the date,
# one panel per column, on a shared x/y grid.
plot_col = [c for c in train.columns if c not in ['id', 'date', 'time', 'year', 'month', 'day', 'weekday']]

# Fix the column count and derive the row count, so the grid always has room
# for every series instead of relying on a hard-coded 14 x (len // 13) layout.
ncols = 3
nrows = int(np.ceil(len(plot_col) / ncols))

# Row positions where a new month begins (assuming one row per day, starting
# from the first train row); the last entry marks the train/test split.
month_starts = [20, 51, 81, 112, 143, 173, 204, 234, 265, 296, 324, 350]

fig, axes = plt.subplots(nrows, ncols, sharey=True, sharex=True, figsize=(30, 80), squeeze=False)
flat_axes = axes.ravel()
for ax, col in zip(flat_axes, plot_col):
    ax.plot(train_df.date, train_df[col], alpha=1, color='orange', label=col)
    for x in month_starts:
        ax.axvline(x)
    ax.set_xlabel(col)
    ax.legend()
    ax.set_xticks([])
# Hide any panels left over when the grid is larger than the number of series.
for ax in flat_axes[len(plot_col):]:
    ax.set_visible(False)
plt.show()

# Observations:
# - Some ice-cream items have a very different distribution in summer -> exclude from training
# - Oden is not sold from July through September -> exclude from training
# - Hot items show periodicity and a slight downward trend
# - Ice cream and drinks correlate strongly with temperature, but drink sales seem to have a lower bound
# - Even the flat series show a day-of-week periodicity