LightGBM Baseline (LB: 22.54002)

This is a sample solution for the taxi demand prediction task using LightGBM.

It uses only train_data.csv; the supplementary district data and weather data are not used.

* Runs on Google Colab; no GPU is required (see the Drive-mount sketch below)
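
If you run this in Colab, the input_path below assumes Google Drive is mounted at /gdrive. A minimal mounting sketch (standard Colab API; the mount point simply matches this notebook's path):

# Mount Google Drive so that input_path below resolves (Colab only).
from google.colab import drive
drive.mount('/gdrive')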

import lightgbm as lgb
from sklearn.model_selection import train_test_split

import numpy as np
import pandas as pd
import random
import datetime
import os
# Configuration
input_path = "/gdrive/MyDrive/Colab Notebooks/ProbSpace/taxi_demand/input"
num_estimators = 1000
seed = 1111
output_file = 'output.csv'
# Initialize random seeds
random.seed(seed)
np.random.seed(seed)
# Load the training data
train_df = pd.read_csv(os.path.join(input_path, 'train_data.csv'), index_col='tpep_pickup_datetime')
train_df.index = pd.to_datetime(train_df.index)
train_df.head()
0 1 2 3 4 5 6 7 8 9 ... 69 70 71 72 73 74 75 76 77 78
tpep_pickup_datetime
2017-01-01 00:00:00 53.0 16.0 45.0 38.0 12.0 6.0 2.0 47.0 31.0 238.0 ... 260.0 12.0 139.0 253.0 17.0 33.0 5.0 14.0 92.0 201.0
2017-01-01 00:30:00 83.0 62.0 59.0 56.0 19.0 26.0 8.0 91.0 48.0 165.0 ... 357.0 45.0 159.0 280.0 46.0 50.0 10.0 30.0 110.0 349.0
2017-01-01 01:00:00 69.0 83.0 58.0 45.0 21.0 27.0 23.0 102.0 62.0 113.0 ... 355.0 74.0 193.0 232.0 71.0 54.0 10.0 34.0 124.0 386.0
2017-01-01 01:30:00 76.0 87.0 56.0 36.0 27.0 18.0 25.0 84.0 87.0 81.0 ... 328.0 61.0 137.0 208.0 68.0 75.0 12.0 44.0 91.0 373.0
2017-01-01 02:00:00 101.0 113.0 43.0 33.0 32.0 24.0 14.0 88.0 69.0 68.0 ... 320.0 63.0 125.0 179.0 87.0 78.0 22.0 53.0 85.0 341.0

5 rows × 79 columns

# Create the test-data (submission) template
test_df = pd.DataFrame(None,
                      index = pd.date_range('2019-12-01 00:00:00', '2019-12-07 23:30:00', freq='30T'),
                      columns=[f'{n}' for n in range(0,79)])
test_df.index.name = 'tpep_pickup_datetime'
test_df.head()
0 1 2 3 4 5 6 7 8 9 ... 69 70 71 72 73 74 75 76 77 78
tpep_pickup_datetime
2019-12-01 00:00:00 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
2019-12-01 00:30:00 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
2019-12-01 01:00:00 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
2019-12-01 01:30:00 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
2019-12-01 02:00:00 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN

5 rows × 79 columns
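
As a quick sanity check (my addition, not in the original notebook): the template should cover 7 days × 48 half-hour slots = 336 rows across the 79 area columns.

# One test week at 30-minute resolution: 7 * 48 = 336 rows, 79 areas.
assert test_df.shape == (336, 79)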

# Unpivot the training data's 'area' columns into rows
train_df = train_df.stack(dropna=False).reset_index().rename(columns={0:'data', 'level_1': 'area'})
train_df.head()
tpep_pickup_datetime area data
0 2017-01-01 0 53.0
1 2017-01-01 1 16.0
2 2017-01-01 2 45.0
3 2017-01-01 3 38.0
4 2017-01-01 4 12.0
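
For reference, the same wide-to-long reshape on a tiny standalone frame (illustrative only; the first column is named level_0 here because the demo index is unnamed):

# stack() turns the area columns into rows, one per (timestamp, area) pair.
demo = pd.DataFrame([[1.0, 2.0]],
                    index=pd.to_datetime(['2017-01-01']),
                    columns=['0', '1'])
demo.stack().reset_index().rename(columns={0: 'data', 'level_1': 'area'})
#      level_0 area  data
# 0 2017-01-01    0   1.0
# 1 2017-01-01    1   2.0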
# Unpivot the test data's 'area' columns into rows
test_df = test_df.stack(dropna=False).reset_index().rename(columns={0:'data', 'level_1': 'area'})
test_df.head()
tpep_pickup_datetime area data
0 2019-12-01 0 NaN
1 2019-12-01 1 NaN
2 2019-12-01 2 NaN
3 2019-12-01 3 NaN
4 2019-12-01 4 NaN
# Concatenate the training and test data
all_df = pd.concat([train_df, test_df], axis=0, ignore_index=True)
all_df
tpep_pickup_datetime area data
0 2017-01-01 00:00:00 0 53.0
1 2017-01-01 00:00:00 1 16.0
2 2017-01-01 00:00:00 2 45.0
3 2017-01-01 00:00:00 3 38.0
4 2017-01-01 00:00:00 4 12.0
... ... ... ...
4061227 2019-12-07 23:30:00 74 NaN
4061228 2019-12-07 23:30:00 75 NaN
4061229 2019-12-07 23:30:00 76 NaN
4061230 2019-12-07 23:30:00 77 NaN
4061231 2019-12-07 23:30:00 78 NaN

4061232 rows × 3 columns

train_df.shape, test_df.shape, all_df.shape
((4034688, 3), (26544, 3), (4061232, 3))
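
These shapes line up with the calendar (a quick check of my own): 4,034,688 / (48 * 79) = 1,064 days, i.e. training data from 2017-01-01 through 2019-11-30, and the test set is exactly one week.

# 1,064 training days * 48 half-hour slots * 79 areas.
days = (pd.Timestamp('2019-11-30') - pd.Timestamp('2017-01-01')).days + 1
assert days * 48 * 79 == 4_034_688  # train rows
assert 7 * 48 * 79 == 26_544        # test rows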
# Add datetime-derived feature columns
all_df['year'] = all_df.tpep_pickup_datetime.dt.year
all_df['month'] = all_df.tpep_pickup_datetime.dt.month
all_df['weekday'] = all_df.tpep_pickup_datetime.dt.weekday
all_df['day'] = all_df.tpep_pickup_datetime.dt.day
all_df['hour'] = all_df.tpep_pickup_datetime.dt.hour
all_df['minute'] = all_df.tpep_pickup_datetime.dt.minute
all_df.head()
tpep_pickup_datetime area data year month weekday day hour minute
0 2017-01-01 0 53.0 2017 1 6 1 0 0
1 2017-01-01 1 16.0 2017 1 6 1 0 0
2 2017-01-01 2 45.0 2017 1 6 1 0 0
3 2017-01-01 3 38.0 2017 1 6 1 0 0
4 2017-01-01 4 12.0 2017 1 6 1 0 0
# Add per-area monthly mean/min/max
tmp_df = all_df.groupby(['month', 'area']).agg(monthly_average_per_area=('data', 'mean'),
                                               monthly_min_value_per_area=('data', 'min'),
                                               monthly_max_value_per_area=('data', 'max')).reset_index()
all_df = pd.merge(all_df, tmp_df, on=['month', 'area'], how='left')
# Add per-area hourly mean/min/max
tmp_df = all_df.groupby(['hour', 'area']).agg(hourly_average_per_area=('data', 'mean'),
                                              hourly_min_value_per_area=('data', 'min'),
                                              hourly_max_value_per_area=('data', 'max')).reset_index()
all_df = pd.merge(all_df, tmp_df, on=['hour', 'area'], how='left')
# Add per-area day-of-week mean/min/max
tmp_df = all_df.groupby(['weekday', 'area']).agg(dayofweek_average_per_area=('data', 'mean'),
                                                 dayofweek_min_value_per_area=('data', 'min'),
                                                 dayofweek_max_value_per_area=('data', 'max')).reset_index()
all_df = pd.merge(all_df, tmp_df, on=['weekday', 'area'], how='left')
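
Note that these group statistics are computed over all_df, which includes the test rows; however, their data values are NaN, and pandas' mean/min/max skip NaN by default, so only training values actually contribute. A tiny standalone illustration:

# NaN (standing in for an unlabeled test row) is ignored by the aggregations.
s = pd.Series([10.0, 20.0, np.nan])
print(s.mean(), s.min(), s.max())  # 15.0 10.0 20.0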
# Convert the date/time features to the category dtype
all_df['area'] = all_df['area'].astype('category')
all_df['year'] = all_df['year'].astype('category')
all_df['month'] = all_df['month'].astype('category')
all_df['weekday'] = all_df['weekday'].astype('category')
all_df['day'] = all_df['day'].astype('category')
all_df['hour'] = all_df['hour'].astype('category')
all_df['minute'] = all_df['minute'].astype('category')
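
The category cast matters for the model: lgb.Dataset treats pandas category columns as categorical features by default (categorical_feature='auto'), so LightGBM splits on them natively rather than as ordered numbers. A minimal standalone demo:

# LightGBM picks up pandas 'category' dtypes automatically.
demo_x = pd.DataFrame({'area': pd.Series([0, 1, 0, 1]).astype('category')})
demo_y = pd.Series([1.0, 2.0, 3.0, 4.0])
demo_ds = lgb.Dataset(demo_x, demo_y)  # 'area' is used as a categorical feature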
# Display the full dataframe
all_df
tpep_pickup_datetime area data year month weekday day hour minute monthly_average_per_area monthly_min_value_per_area monthly_max_value_per_area hourly_average_per_area hourly_min_value_per_area hourly_max_value_per_area dayofweek_average_per_area dayofweek_min_value_per_area dayofweek_max_value_per_area
0 2017-01-01 00:00:00 0 53.0 2017 1 6 1 0 0 13.712142 0.0 108.0 19.604323 0.0 105.0 16.595532 0.0 160.0
1 2017-01-01 00:00:00 1 16.0 2017 1 6 1 0 0 9.730287 0.0 113.0 11.660244 1.0 62.0 12.023849 0.0 113.0
2 2017-01-01 00:00:00 2 45.0 2017 1 6 1 0 0 50.358647 0.0 189.0 18.289944 0.0 61.0 37.474644 0.0 134.0
3 2017-01-01 00:00:00 3 38.0 2017 1 6 1 0 0 17.027554 0.0 58.0 9.224624 0.0 56.0 16.848684 0.0 65.0
4 2017-01-01 00:00:00 4 12.0 2017 1 6 1 0 0 5.450493 0.0 38.0 8.089286 0.0 53.0 5.900630 0.0 54.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
4061227 2019-12-07 23:30:00 74 NaN 2019 12 5 7 23 30 4.454301 0.0 59.0 12.611842 0.0 56.0 8.400082 0.0 71.0
4061228 2019-12-07 23:30:00 75 NaN 2019 12 5 7 23 30 3.347110 0.0 37.0 2.109492 0.0 11.0 4.022889 0.0 25.0
4061229 2019-12-07 23:30:00 76 NaN 2019 12 5 7 23 30 37.886089 0.0 142.0 26.794173 1.0 88.0 40.242188 0.0 134.0
4061230 2019-12-07 23:30:00 77 NaN 2019 12 5 7 23 30 72.863911 0.0 252.0 21.752350 2.0 100.0 63.004386 0.0 221.0
4061231 2019-12-07 23:30:00 78 NaN 2019 12 5 7 23 30 115.580309 3.0 400.0 79.685150 16.0 274.0 118.393914 5.0 378.0

4061232 rows × 18 columns

# Show the dtype of every feature
all_df.dtypes.rename('dtype').to_frame()
dtype
tpep_pickup_datetime datetime64[ns]
area category
data float64
year category
month category
weekday category
day category
hour category
minute category
monthly_average_per_area float64
monthly_min_value_per_area float64
monthly_max_value_per_area float64
hourly_average_per_area float64
hourly_min_value_per_area float64
hourly_max_value_per_area float64
dayofweek_average_per_area float64
dayofweek_min_value_per_area float64
dayofweek_max_value_per_area float64
# Show summary statistics for each feature
all_df.describe()
data monthly_average_per_area monthly_min_value_per_area monthly_max_value_per_area hourly_average_per_area hourly_min_value_per_area hourly_max_value_per_area dayofweek_average_per_area dayofweek_min_value_per_area dayofweek_max_value_per_area
count 4.034688e+06 4.061232e+06 4.061232e+06 4.061232e+06 4.061232e+06 4.061232e+06 4.061232e+06 4.061232e+06 4.061232e+06 4.061232e+06
mean 7.110392e+01 7.111683e+01 1.020104e+00 2.488821e+02 7.110392e+01 1.134546e+01 1.828101e+02 7.110392e+01 1.009042e+00 2.463635e+02
std 9.300114e+01 6.828550e+01 2.699027e+00 2.234952e+02 8.435561e+01 1.831785e+01 1.780070e+02 6.941478e+01 2.687778e+00 2.171023e+02
min 0.000000e+00 2.244848e+00 0.000000e+00 1.600000e+01 2.509398e-01 0.000000e+00 4.000000e+00 2.393777e+00 0.000000e+00 1.700000e+01
25% 6.000000e+00 8.141353e+00 0.000000e+00 5.900000e+01 7.249648e+00 0.000000e+00 4.100000e+01 8.011787e+00 0.000000e+00 6.400000e+01
50% 2.700000e+01 5.099126e+01 0.000000e+00 1.770000e+02 3.150681e+01 2.000000e+00 1.190000e+02 4.940433e+01 0.000000e+00 1.870000e+02
75% 1.080000e+02 1.186290e+02 1.000000e+00 3.790000e+02 1.133950e+02 1.600000e+01 2.822500e+02 1.188420e+02 1.000000e+00 3.730000e+02
max 1.721000e+03 2.634612e+02 2.400000e+01 1.721000e+03 4.157904e+02 1.050000e+02 1.721000e+03 2.776369e+02 2.800000e+01 1.721000e+03
# Re-split into train_df and test_df (simply by whether 'data' is NaN)
train_df = all_df[all_df.data.notna()]
test_df = all_df[all_df.data.isna()]
train_df.shape, test_df.shape, all_df.shape
((4034688, 18), (26544, 18), (4061232, 18))
# Split train_df into training and validation sets
train_x , valid_x, train_y , valid_y = train_test_split(
    train_df.drop(['tpep_pickup_datetime', 'data'], axis=1),
    train_df['data'],
    shuffle=True,
    train_size=0.7,
    random_state=seed)

# Build the LightGBM datasets
lgb_train = lgb.Dataset(train_x, train_y)
lgb_valid = lgb.Dataset(valid_x, valid_y)

# Build the test feature matrix from test_df
test_x = test_df.drop(['tpep_pickup_datetime', 'data'], axis=1)

# Set the hyperparameters
lgb_params = {
    'task': 'train',
    'objective': 'regression',
    'metric': 'rmse',
    'learning_rate': 0.1,
    'random_state': seed,
    'verbose': -1,
}

# Train
model = lgb.train(lgb_params,
                  lgb_train,
                  valid_sets=[lgb_valid],
                  num_boost_round=num_estimators,
                  callbacks=[lgb.log_evaluation(100),])

# Predict on the test data
result = model.predict(test_x)

# Store the predictions in a dataframe
result_df = pd.DataFrame({'result': result},
                         index=test_df.set_index(['tpep_pickup_datetime', 'area']).index)
[100]	valid_0's rmse: 23.5798
[200]	valid_0's rmse: 21.4718
[300]	valid_0's rmse: 20.3439
[400]	valid_0's rmse: 19.6296
[500]	valid_0's rmse: 19.1288
[600]	valid_0's rmse: 18.7623
[700]	valid_0's rmse: 18.4089
[800]	valid_0's rmse: 18.1754
[900]	valid_0's rmse: 17.9937
[1000]	valid_0's rmse: 17.8213
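
The validation RMSE is still falling at round 1,000, so the fixed num_estimators leaves some headroom. If you would rather let the validation set pick the round count, LightGBM's early_stopping callback is the standard tool. A variant sketch of the training call above (the patience of 50 is an arbitrary choice, you may also want a larger num_boost_round, and this is not the run that produced the score):

# Variant: stop once the validation RMSE has not improved for 50 rounds.
model = lgb.train(lgb_params,
                  lgb_train,
                  valid_sets=[lgb_valid],
                  num_boost_round=num_estimators,
                  callbacks=[lgb.early_stopping(50),
                             lgb.log_evaluation(100)])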
# Show the predictions
result_df.head()
result
tpep_pickup_datetime area
2019-12-01 0 30.381623
1 14.226875
2 16.777371
3 11.478026
4 8.742821
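
Since the target is a non-negative count, model.predict can return small negative values; flooring the predictions at zero is a cheap optional post-processing step (my addition, not applied in the score above):

# Optional: demand is a count, so clip negative predictions to 0.
result_df['result'] = result_df['result'].clip(lower=0)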
# Show the validation score
score = model.best_score['valid_0']['rmse']
f'RMSE : {score:0.6f}'
'RMSE : 17.821345'
# Plot feature importance
lgb.plot_importance(model)
<Axes: title={'center': 'Feature importance'}, xlabel='Feature importance', ylabel='Features'>
# Reshape the predictions into the submission format
result_df = result_df.reset_index('area')
result_df['area'] = result_df['area'].astype(int)
result_df = result_df.pivot(columns='area', values='result')
result_df.head()
area 0 1 2 3 4 5 6 7 8 9 ... 69 70 71 72 73 74 75 76 77 78
tpep_pickup_datetime
2019-12-01 00:00:00 30.381623 14.226875 16.777371 11.478026 8.742821 5.154717 3.418391 21.173734 10.173627 41.067771 ... 99.884228 6.951456 80.057519 245.432535 24.356545 18.865365 4.283819 21.201495 24.517502 95.133767
2019-12-01 00:30:00 25.631803 11.665351 11.051193 9.645737 7.130247 3.931056 2.548995 18.657961 8.478401 32.042728 ... 82.517022 5.604167 73.386051 231.312328 19.861427 16.233227 3.384932 17.924183 19.482027 86.477271
2019-12-01 01:00:00 30.971865 15.091661 9.600159 9.606890 6.206393 3.984010 2.625970 16.478715 8.767537 24.569169 ... 69.362340 5.296543 79.165217 206.271113 20.064091 18.107689 4.780220 14.125106 15.708135 76.388142
2019-12-01 01:30:00 27.791511 13.532622 5.718441 8.740958 5.846688 3.539878 2.181839 15.157938 8.325181 18.990956 ... 56.451068 4.936838 73.875655 187.656475 17.715436 16.463136 4.306597 12.005302 12.635795 70.187652
2019-12-01 02:00:00 26.120436 14.669071 5.444948 7.837083 4.513673 3.060245 2.192699 12.098949 7.454615 15.637754 ... 40.653569 4.786732 86.643541 187.924685 19.566583 16.625940 4.610008 9.722015 11.851747 56.631752

5 rows × 79 columns

# Write the results to a CSV file
result_df.to_csv(output_file)
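
A last sanity check on the written file (my addition): the submission should contain 336 half-hour rows and 79 area columns.

# Read the submission back and confirm its layout.
check_df = pd.read_csv(output_file, index_col='tpep_pickup_datetime')
assert check_df.shape == (336, 79)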
