magichan
タクシー需要予測のLGBMを使用したサンプルです。
データとしては train_data.csv のみを使用しており「地区データ」「気象データ」は使用しておりません。
* Google Colabで動作可、GPUは不要となります
import lightgbm as lgb
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import random
import datetime
import os
# Configuration
# NOTE(review): input_path assumes Google Drive is mounted at /gdrive (Colab) — adjust elsewhere.
input_path = "/gdrive/MyDrive/Colab Notebooks/ProbSpace/taxi_demand/input"
num_estimators = 1000  # number of LightGBM boosting rounds
seed = 1111            # shared seed for random / numpy / train_test_split / LightGBM
output_file = 'output.csv'
# Initialize random number generators for reproducibility
random.seed(seed)
np.random.seed(seed)
# Load the training data, using the pickup-timestamp column as the index.
train_df = pd.read_csv(os.path.join(input_path, 'train_data.csv'), index_col='tpep_pickup_datetime')
# read_csv leaves the index as strings; convert to datetime64 for the .dt accessors used below.
train_df.index = pd.to_datetime(train_df.index)
train_df.head()
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
tpep_pickup_datetime | |||||||||||||||||||||
2017-01-01 00:00:00 | 53.0 | 16.0 | 45.0 | 38.0 | 12.0 | 6.0 | 2.0 | 47.0 | 31.0 | 238.0 | ... | 260.0 | 12.0 | 139.0 | 253.0 | 17.0 | 33.0 | 5.0 | 14.0 | 92.0 | 201.0 |
2017-01-01 00:30:00 | 83.0 | 62.0 | 59.0 | 56.0 | 19.0 | 26.0 | 8.0 | 91.0 | 48.0 | 165.0 | ... | 357.0 | 45.0 | 159.0 | 280.0 | 46.0 | 50.0 | 10.0 | 30.0 | 110.0 | 349.0 |
2017-01-01 01:00:00 | 69.0 | 83.0 | 58.0 | 45.0 | 21.0 | 27.0 | 23.0 | 102.0 | 62.0 | 113.0 | ... | 355.0 | 74.0 | 193.0 | 232.0 | 71.0 | 54.0 | 10.0 | 34.0 | 124.0 | 386.0 |
2017-01-01 01:30:00 | 76.0 | 87.0 | 56.0 | 36.0 | 27.0 | 18.0 | 25.0 | 84.0 | 87.0 | 81.0 | ... | 328.0 | 61.0 | 137.0 | 208.0 | 68.0 | 75.0 | 12.0 | 44.0 | 91.0 | 373.0 |
2017-01-01 02:00:00 | 101.0 | 113.0 | 43.0 | 33.0 | 32.0 | 24.0 | 14.0 | 88.0 | 69.0 | 68.0 | ... | 320.0 | 63.0 | 125.0 | 179.0 | 87.0 | 78.0 | 22.0 | 53.0 | 85.0 | 341.0 |
5 rows × 79 columns
# Build an empty template frame for the prediction (submission) period:
# one row per 30-minute slot over 2019-12-01 .. 2019-12-07, one column per area id (0-78).
# Values start as NaN; that NaN target is later used to tell test rows from train rows.
# Fix: freq='30T' uses the deprecated 'T' offset alias (removed in pandas 3.0) — use '30min'.
test_df = pd.DataFrame(
    None,
    index=pd.date_range('2019-12-01 00:00:00', '2019-12-07 23:30:00', freq='30min'),
    columns=[str(n) for n in range(79)])
test_df.index.name = 'tpep_pickup_datetime'
test_df.head()
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
tpep_pickup_datetime | |||||||||||||||||||||
2019-12-01 00:00:00 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
2019-12-01 00:30:00 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
2019-12-01 01:00:00 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
2019-12-01 01:30:00 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
2019-12-01 02:00:00 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
5 rows × 79 columns
# Reshape train_df from wide (one column per area) to long format: stack() turns the
# area columns into rows, one (timestamp, area, value) row each. dropna=False keeps
# slots with missing counts so the frame stays aligned with the test template.
stacked = train_df.stack(dropna=False).reset_index()
train_df = stacked.rename(columns={0: 'data', 'level_1': 'area'})
train_df.head()
tpep_pickup_datetime | area | data | |
---|---|---|---|
0 | 2017-01-01 | 0 | 53.0 |
1 | 2017-01-01 | 1 | 16.0 |
2 | 2017-01-01 | 2 | 45.0 |
3 | 2017-01-01 | 3 | 38.0 |
4 | 2017-01-01 | 4 | 12.0 |
# Reshape test_df into the same long format as the training frame:
# one (timestamp, area, NaN) row per 30-minute slot and area.
stacked = test_df.stack(dropna=False).reset_index()
test_df = stacked.rename(columns={0: 'data', 'level_1': 'area'})
test_df.head()
tpep_pickup_datetime | area | data | |
---|---|---|---|
0 | 2019-12-01 | 0 | NaN |
1 | 2019-12-01 | 1 | NaN |
2 | 2019-12-01 | 2 | NaN |
3 | 2019-12-01 | 3 | NaN |
4 | 2019-12-01 | 4 | NaN |
# Stack the train rows on top of the test rows (fresh RangeIndex) so the
# feature engineering below is applied to both in a single pass.
all_df = pd.concat([train_df, test_df], ignore_index=True)
all_df
tpep_pickup_datetime | area | data | |
---|---|---|---|
0 | 2017-01-01 00:00:00 | 0 | 53.0 |
1 | 2017-01-01 00:00:00 | 1 | 16.0 |
2 | 2017-01-01 00:00:00 | 2 | 45.0 |
3 | 2017-01-01 00:00:00 | 3 | 38.0 |
4 | 2017-01-01 00:00:00 | 4 | 12.0 |
... | ... | ... | ... |
4061227 | 2019-12-07 23:30:00 | 74 | NaN |
4061228 | 2019-12-07 23:30:00 | 75 | NaN |
4061229 | 2019-12-07 23:30:00 | 76 | NaN |
4061230 | 2019-12-07 23:30:00 | 77 | NaN |
4061231 | 2019-12-07 23:30:00 | 78 | NaN |
4061232 rows × 3 columns
# Sanity check: row counts of the long-format train/test frames and their concatenation
train_df.shape, test_df.shape, all_df.shape
((4034688, 3), (26544, 3), (4061232, 3))
# Derive calendar/time feature columns from the pickup timestamp,
# one column per datetime component, in this fixed order.
for part in ('year', 'month', 'weekday', 'day', 'hour', 'minute'):
    all_df[part] = getattr(all_df.tpep_pickup_datetime.dt, part)
all_df.head()
tpep_pickup_datetime | area | data | year | month | weekday | day | hour | minute | |
---|---|---|---|---|---|---|---|---|---|
0 | 2017-01-01 | 0 | 53.0 | 2017 | 1 | 6 | 1 | 0 | 0 |
1 | 2017-01-01 | 1 | 16.0 | 2017 | 1 | 6 | 1 | 0 | 0 |
2 | 2017-01-01 | 2 | 45.0 | 2017 | 1 | 6 | 1 | 0 | 0 |
3 | 2017-01-01 | 3 | 38.0 | 2017 | 1 | 6 | 1 | 0 | 0 |
4 | 2017-01-01 | 4 | 12.0 | 2017 | 1 | 6 | 1 | 0 | 0 |
# Append per-area mean/min/max of the demand for each grouping key:
#   month   -> monthly_{average,min_value,max_value}_per_area
#   hour    -> hourly_{...}_per_area
#   weekday -> dayofweek_{...}_per_area
# NaN targets (test rows) are skipped by mean/min/max, so the stats come from train data.
for key, prefix in (('month', 'monthly'), ('hour', 'hourly'), ('weekday', 'dayofweek')):
    stats_df = all_df.groupby([key, 'area']).agg(**{
        f'{prefix}_average_per_area': ('data', 'mean'),
        f'{prefix}_min_value_per_area': ('data', 'min'),
        f'{prefix}_max_value_per_area': ('data', 'max'),
    }).reset_index()
    all_df = pd.merge(all_df, stats_df, on=[key, 'area'], how='left')
# Cast the discrete features to pandas 'category' dtype so LightGBM treats
# them as categorical splits rather than ordered numeric values.
for col in ('area', 'year', 'month', 'weekday', 'day', 'hour', 'minute'):
    all_df[col] = all_df[col].astype('category')
# Show the full feature table
all_df
tpep_pickup_datetime | area | data | year | month | weekday | day | hour | minute | monthly_average_per_area | monthly_min_value_per_area | monthly_max_value_per_area | hourly_average_per_area | hourly_min_value_per_area | hourly_max_value_per_area | dayofweek_average_per_area | dayofweek_min_value_per_area | dayofweek_max_value_per_area | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2017-01-01 00:00:00 | 0 | 53.0 | 2017 | 1 | 6 | 1 | 0 | 0 | 13.712142 | 0.0 | 108.0 | 19.604323 | 0.0 | 105.0 | 16.595532 | 0.0 | 160.0 |
1 | 2017-01-01 00:00:00 | 1 | 16.0 | 2017 | 1 | 6 | 1 | 0 | 0 | 9.730287 | 0.0 | 113.0 | 11.660244 | 1.0 | 62.0 | 12.023849 | 0.0 | 113.0 |
2 | 2017-01-01 00:00:00 | 2 | 45.0 | 2017 | 1 | 6 | 1 | 0 | 0 | 50.358647 | 0.0 | 189.0 | 18.289944 | 0.0 | 61.0 | 37.474644 | 0.0 | 134.0 |
3 | 2017-01-01 00:00:00 | 3 | 38.0 | 2017 | 1 | 6 | 1 | 0 | 0 | 17.027554 | 0.0 | 58.0 | 9.224624 | 0.0 | 56.0 | 16.848684 | 0.0 | 65.0 |
4 | 2017-01-01 00:00:00 | 4 | 12.0 | 2017 | 1 | 6 | 1 | 0 | 0 | 5.450493 | 0.0 | 38.0 | 8.089286 | 0.0 | 53.0 | 5.900630 | 0.0 | 54.0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
4061227 | 2019-12-07 23:30:00 | 74 | NaN | 2019 | 12 | 5 | 7 | 23 | 30 | 4.454301 | 0.0 | 59.0 | 12.611842 | 0.0 | 56.0 | 8.400082 | 0.0 | 71.0 |
4061228 | 2019-12-07 23:30:00 | 75 | NaN | 2019 | 12 | 5 | 7 | 23 | 30 | 3.347110 | 0.0 | 37.0 | 2.109492 | 0.0 | 11.0 | 4.022889 | 0.0 | 25.0 |
4061229 | 2019-12-07 23:30:00 | 76 | NaN | 2019 | 12 | 5 | 7 | 23 | 30 | 37.886089 | 0.0 | 142.0 | 26.794173 | 1.0 | 88.0 | 40.242188 | 0.0 | 134.0 |
4061230 | 2019-12-07 23:30:00 | 77 | NaN | 2019 | 12 | 5 | 7 | 23 | 30 | 72.863911 | 0.0 | 252.0 | 21.752350 | 2.0 | 100.0 | 63.004386 | 0.0 | 221.0 |
4061231 | 2019-12-07 23:30:00 | 78 | NaN | 2019 | 12 | 5 | 7 | 23 | 30 | 115.580309 | 3.0 | 400.0 | 79.685150 | 16.0 | 274.0 | 118.393914 | 5.0 | 378.0 |
4061232 rows × 18 columns
# List every column's dtype as a one-column frame named 'dtype'
all_df.dtypes.to_frame('dtype')
dtype | |
---|---|
tpep_pickup_datetime | datetime64[ns] |
area | category |
data | float64 |
year | category |
month | category |
weekday | category |
day | category |
hour | category |
minute | category |
monthly_average_per_area | float64 |
monthly_min_value_per_area | float64 |
monthly_max_value_per_area | float64 |
hourly_average_per_area | float64 |
hourly_min_value_per_area | float64 |
hourly_max_value_per_area | float64 |
dayofweek_average_per_area | float64 |
dayofweek_min_value_per_area | float64 |
dayofweek_max_value_per_area | float64 |
# Summary statistics (count/mean/std/quantiles) for the numeric feature columns
all_df.describe()
data | monthly_average_per_area | monthly_min_value_per_area | monthly_max_value_per_area | hourly_average_per_area | hourly_min_value_per_area | hourly_max_value_per_area | dayofweek_average_per_area | dayofweek_min_value_per_area | dayofweek_max_value_per_area | |
---|---|---|---|---|---|---|---|---|---|---|
count | 4.034688e+06 | 4.061232e+06 | 4.061232e+06 | 4.061232e+06 | 4.061232e+06 | 4.061232e+06 | 4.061232e+06 | 4.061232e+06 | 4.061232e+06 | 4.061232e+06 |
mean | 7.110392e+01 | 7.111683e+01 | 1.020104e+00 | 2.488821e+02 | 7.110392e+01 | 1.134546e+01 | 1.828101e+02 | 7.110392e+01 | 1.009042e+00 | 2.463635e+02 |
std | 9.300114e+01 | 6.828550e+01 | 2.699027e+00 | 2.234952e+02 | 8.435561e+01 | 1.831785e+01 | 1.780070e+02 | 6.941478e+01 | 2.687778e+00 | 2.171023e+02 |
min | 0.000000e+00 | 2.244848e+00 | 0.000000e+00 | 1.600000e+01 | 2.509398e-01 | 0.000000e+00 | 4.000000e+00 | 2.393777e+00 | 0.000000e+00 | 1.700000e+01 |
25% | 6.000000e+00 | 8.141353e+00 | 0.000000e+00 | 5.900000e+01 | 7.249648e+00 | 0.000000e+00 | 4.100000e+01 | 8.011787e+00 | 0.000000e+00 | 6.400000e+01 |
50% | 2.700000e+01 | 5.099126e+01 | 0.000000e+00 | 1.770000e+02 | 3.150681e+01 | 2.000000e+00 | 1.190000e+02 | 4.940433e+01 | 0.000000e+00 | 1.870000e+02 |
75% | 1.080000e+02 | 1.186290e+02 | 1.000000e+00 | 3.790000e+02 | 1.133950e+02 | 1.600000e+01 | 2.822500e+02 | 1.188420e+02 | 1.000000e+00 | 3.730000e+02 |
max | 1.721000e+03 | 2.634612e+02 | 2.400000e+01 | 1.721000e+03 | 4.157904e+02 | 1.050000e+02 | 1.721000e+03 | 2.776369e+02 | 2.800000e+01 | 1.721000e+03 |
# Split all_df back into train/test frames: rows whose target 'data' is NaN are
# the test-template rows built earlier, everything else is training data.
has_target = all_df.data.notna()
train_df = all_df[has_target]
test_df = all_df[~has_target]
train_df.shape, test_df.shape, all_df.shape
((4034688, 18), (26544, 18), (4061232, 18))
# Split the training rows into train/validation sets (70/30, shuffled).
# NOTE(review): a random split on time-series data lets training see time slots
# adjacent to validation rows; kept as-is to preserve the original behavior.
features = train_df.drop(['tpep_pickup_datetime', 'data'], axis=1)
target = train_df['data']
train_x, valid_x, train_y, valid_y = train_test_split(
    features,
    target,
    shuffle=True,
    train_size=0.7,
    random_state=seed)
# Wrap the splits in LightGBM Dataset objects
lgb_train = lgb.Dataset(train_x, train_y)
lgb_valid = lgb.Dataset(valid_x, valid_y)
# Feature matrix for the prediction period (timestamp and empty target dropped)
test_x = test_df.drop(['tpep_pickup_datetime', 'data'], axis=1)
# Hyperparameters for LightGBM training
lgb_params = {
    'task': 'train',
    'objective': 'regression',  # squared-error regression
    'metric': 'rmse',           # validation metric reported during training
    'learning_rate': 0.1,
    'random_state': seed,
    'verbose': -1,              # suppress LightGBM's per-iteration logging
}
# Train for num_estimators (1000) boosting rounds, logging validation RMSE every
# 100 rounds. No early-stopping callback, so all rounds always run.
model = lgb.train(lgb_params,
    lgb_train,
    valid_sets=[lgb_valid],
    num_boost_round=num_estimators,
    callbacks=[lgb.log_evaluation(100),])
# Predict demand for every (timestamp, area) row of the test period
result = model.predict(test_x)
# Store predictions indexed by the (tpep_pickup_datetime, area) MultiIndex
result_df = pd.DataFrame({'result': result},
    index=test_df.set_index(['tpep_pickup_datetime', 'area']).index)
[100] valid_0's rmse: 23.5798 [200] valid_0's rmse: 21.4718 [300] valid_0's rmse: 20.3439 [400] valid_0's rmse: 19.6296 [500] valid_0's rmse: 19.1288 [600] valid_0's rmse: 18.7623 [700] valid_0's rmse: 18.4089 [800] valid_0's rmse: 18.1754 [900] valid_0's rmse: 17.9937 [1000] valid_0's rmse: 17.8213
# Show the first few predictions
result_df.head()
result | ||
---|---|---|
tpep_pickup_datetime | area | |
2019-12-01 | 0 | 30.381623 |
1 | 14.226875 | |
2 | 16.777371 | |
3 | 11.478026 | |
4 | 8.742821 |
# Display the final validation RMSE (without early stopping, best_score holds
# the score of the last boosting round)
score = model.best_score['valid_0']['rmse']
f'RMSE : {score:0.6f}'
'RMSE : 17.821345'
# Plot the trained model's feature importance (split counts by default)
lgb.plot_importance(model)
<Axes: title={'center': 'Feature importance'}, xlabel='Feature importance', ylabel='Features'>
# Reshape predictions into the submission layout: one row per timestamp,
# one column per area (0-78).
flat_df = result_df.reset_index('area')
# Cast area labels from category-of-strings to int so pivot orders columns 0..78
# numerically instead of lexicographically.
flat_df['area'] = flat_df['area'].astype(int)
result_df = flat_df.pivot(columns='area', values='result')
result_df.head()
area | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
tpep_pickup_datetime | |||||||||||||||||||||
2019-12-01 00:00:00 | 30.381623 | 14.226875 | 16.777371 | 11.478026 | 8.742821 | 5.154717 | 3.418391 | 21.173734 | 10.173627 | 41.067771 | ... | 99.884228 | 6.951456 | 80.057519 | 245.432535 | 24.356545 | 18.865365 | 4.283819 | 21.201495 | 24.517502 | 95.133767 |
2019-12-01 00:30:00 | 25.631803 | 11.665351 | 11.051193 | 9.645737 | 7.130247 | 3.931056 | 2.548995 | 18.657961 | 8.478401 | 32.042728 | ... | 82.517022 | 5.604167 | 73.386051 | 231.312328 | 19.861427 | 16.233227 | 3.384932 | 17.924183 | 19.482027 | 86.477271 |
2019-12-01 01:00:00 | 30.971865 | 15.091661 | 9.600159 | 9.606890 | 6.206393 | 3.984010 | 2.625970 | 16.478715 | 8.767537 | 24.569169 | ... | 69.362340 | 5.296543 | 79.165217 | 206.271113 | 20.064091 | 18.107689 | 4.780220 | 14.125106 | 15.708135 | 76.388142 |
2019-12-01 01:30:00 | 27.791511 | 13.532622 | 5.718441 | 8.740958 | 5.846688 | 3.539878 | 2.181839 | 15.157938 | 8.325181 | 18.990956 | ... | 56.451068 | 4.936838 | 73.875655 | 187.656475 | 17.715436 | 16.463136 | 4.306597 | 12.005302 | 12.635795 | 70.187652 |
2019-12-01 02:00:00 | 26.120436 | 14.669071 | 5.444948 | 7.837083 | 4.513673 | 3.060245 | 2.192699 | 12.098949 | 7.454615 | 15.637754 | ... | 40.653569 | 4.786732 | 86.643541 | 187.924685 | 19.566583 | 16.625940 | 4.610008 | 9.722015 | 11.851747 | 56.631752 |
5 rows × 79 columns
# Write the submission CSV (index = timestamp, header = area ids)
result_df.to_csv(output_file)