magichan
タクシー需要予測のLGBMを使用したサンプルです。
データとしては train_data.csv のみを使用しており「地区データ」「気象データ」は使用しておりません。
* Google Colabで動作可、GPUは不要となります
import lightgbm as lgb
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import random
import datetime
import os
# Configuration
# NOTE(review): input_path assumes Google Drive is mounted at /gdrive (Colab) — adjust elsewhere.
input_path = "/gdrive/MyDrive/Colab Notebooks/ProbSpace/taxi_demand/input"
num_estimators = 1000  # number of LightGBM boosting rounds
seed = 1111            # shared seed for random / numpy / train_test_split / LightGBM
output_file = 'output.csv'
# Initialize random number generators for reproducibility
random.seed(seed)
np.random.seed(seed)
# Load the training data, using the pickup-timestamp column as the index.
train_df = pd.read_csv(os.path.join(input_path, 'train_data.csv'), index_col='tpep_pickup_datetime')
# read_csv leaves the index as strings; convert to datetime64 for the .dt accessors used below.
train_df.index = pd.to_datetime(train_df.index)
train_df.head()
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
tpep_pickup_datetime | |||||||||||||||||||||
2017-01-01 00:00:00 | 53.0 | 16.0 | 45.0 | 38.0 | 12.0 | 6.0 | 2.0 | 47.0 | 31.0 | 238.0 | ... | 260.0 | 12.0 | 139.0 | 253.0 | 17.0 | 33.0 | 5.0 | 14.0 | 92.0 | 201.0 |
2017-01-01 00:30:00 | 83.0 | 62.0 | 59.0 | 56.0 | 19.0 | 26.0 | 8.0 | 91.0 | 48.0 | 165.0 | ... | 357.0 | 45.0 | 159.0 | 280.0 | 46.0 | 50.0 | 10.0 | 30.0 | 110.0 | 349.0 |
2017-01-01 01:00:00 | 69.0 | 83.0 | 58.0 | 45.0 | 21.0 | 27.0 | 23.0 | 102.0 | 62.0 | 113.0 | ... | 355.0 | 74.0 | 193.0 | 232.0 | 71.0 | 54.0 | 10.0 | 34.0 | 124.0 | 386.0 |
2017-01-01 01:30:00 | 76.0 | 87.0 | 56.0 | 36.0 | 27.0 | 18.0 | 25.0 | 84.0 | 87.0 | 81.0 | ... | 328.0 | 61.0 | 137.0 | 208.0 | 68.0 | 75.0 | 12.0 | 44.0 | 91.0 | 373.0 |
2017-01-01 02:00:00 | 101.0 | 113.0 | 43.0 | 33.0 | 32.0 | 24.0 | 14.0 | 88.0 | 69.0 | 68.0 | ... | 320.0 | 63.0 | 125.0 | 179.0 | 87.0 | 78.0 | 22.0 | 53.0 | 85.0 | 341.0 |
5 rows × 79 columns
# Build an empty template frame for the prediction (submission) period:
# one row per 30-minute slot over 2019-12-01 .. 2019-12-07, one column per area id (0-78).
# Values start as NaN; that NaN target is later used to tell test rows from train rows.
# Fix: freq='30T' uses the deprecated 'T' offset alias (removed in pandas 3.0) — use '30min'.
test_df = pd.DataFrame(
    None,
    index=pd.date_range('2019-12-01 00:00:00', '2019-12-07 23:30:00', freq='30min'),
    columns=[str(n) for n in range(79)])
test_df.index.name = 'tpep_pickup_datetime'
test_df.head()
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
tpep_pickup_datetime | |||||||||||||||||||||
2019-12-01 00:00:00 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
2019-12-01 00:30:00 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
2019-12-01 01:00:00 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
2019-12-01 01:30:00 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
2019-12-01 02:00:00 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
5 rows × 79 columns
# Reshape train_df from wide (one column per area) to long format: stack() turns the
# area columns into rows, one (timestamp, area, value) row each. dropna=False keeps
# slots with missing counts so the frame stays aligned with the test template.
stacked = train_df.stack(dropna=False).reset_index()
train_df = stacked.rename(columns={0: 'data', 'level_1': 'area'})
train_df.head()
tpep_pickup_datetime | area | data | |
---|---|---|---|
0 | 2017-01-01 | 0 | 53.0 |
1 | 2017-01-01 | 1 | 16.0 |
2 | 2017-01-01 | 2 | 45.0 |
3 | 2017-01-01 | 3 | 38.0 |
4 | 2017-01-01 | 4 | 12.0 |
# Reshape test_df into the same long format as the training frame:
# one (timestamp, area, NaN) row per 30-minute slot and area.
stacked = test_df.stack(dropna=False).reset_index()
test_df = stacked.rename(columns={0: 'data', 'level_1': 'area'})
test_df.head()
tpep_pickup_datetime | area | data | |
---|---|---|---|
0 | 2019-12-01 | 0 | NaN |
1 | 2019-12-01 | 1 | NaN |
2 | 2019-12-01 | 2 | NaN |
3 | 2019-12-01 | 3 | NaN |
4 | 2019-12-01 | 4 | NaN |
# Stack the train rows on top of the test rows (fresh RangeIndex) so the
# feature engineering below is applied to both in a single pass.
all_df = pd.concat([train_df, test_df], ignore_index=True)
all_df
tpep_pickup_datetime | area | data | |
---|---|---|---|
0 | 2017-01-01 00:00:00 | 0 | 53.0 |
1 | 2017-01-01 00:00:00 | 1 | 16.0 |
2 | 2017-01-01 00:00:00 | 2 | 45.0 |
3 | 2017-01-01 00:00:00 | 3 | 38.0 |
4 | 2017-01-01 00:00:00 | 4 | 12.0 |
... | ... | ... | ... |
4061227 | 2019-12-07 23:30:00 | 74 | NaN |
4061228 | 2019-12-07 23:30:00 | 75 | NaN |
4061229 | 2019-12-07 23:30:00 | 76 | NaN |
4061230 | 2019-12-07 23:30:00 | 77 | NaN |
4061231 | 2019-12-07 23:30:00 | 78 | NaN |
4061232 rows × 3 columns
# Sanity check: row counts of the long-format train/test frames and their concatenation
train_df.shape, test_df.shape, all_df.shape
((4034688, 3), (26544, 3), (4061232, 3))
# Derive calendar/time feature columns from the pickup timestamp,
# one column per datetime component, in this fixed order.
for part in ('year', 'month', 'weekday', 'day', 'hour', 'minute'):
    all_df[part] = getattr(all_df.tpep_pickup_datetime.dt, part)
all_df.head()
tpep_pickup_datetime | area | data | year | month | weekday | day | hour | minute | |
---|---|---|---|---|---|---|---|---|---|
0 | 2017-01-01 | 0 | 53.0 | 2017 | 1 | 6 | 1 | 0 | 0 |
1 | 2017-01-01 | 1 | 16.0 | 2017 | 1 | 6 | 1 | 0 | 0 |
2 | 2017-01-01 | 2 | 45.0 | 2017 | 1 | 6 | 1 | 0 | 0 |
3 | 2017-01-01 | 3 | 38.0 | 2017 | 1 | 6 | 1 | 0 | 0 |
4 | 2017-01-01 | 4 | 12.0 | 2017 | 1 | 6 | 1 | 0 | 0 |
# Append per-area mean/min/max of the demand for each grouping key:
#   month   -> monthly_{average,min_value,max_value}_per_area
#   hour    -> hourly_{...}_per_area
#   weekday -> dayofweek_{...}_per_area
# NaN targets (test rows) are skipped by mean/min/max, so the stats come from train data.
for key, prefix in (('month', 'monthly'), ('hour', 'hourly'), ('weekday', 'dayofweek')):
    stats_df = all_df.groupby([key, 'area']).agg(**{
        f'{prefix}_average_per_area': ('data', 'mean'),
        f'{prefix}_min_value_per_area': ('data', 'min'),
        f'{prefix}_max_value_per_area': ('data', 'max'),
    }).reset_index()
    all_df = pd.merge(all_df, stats_df, on=[key, 'area'], how='left')
# Cast the discrete features to pandas 'category' dtype so LightGBM treats
# them as categorical splits rather than ordered numeric values.
for col in ('area', 'year', 'month', 'weekday', 'day', 'hour', 'minute'):
    all_df[col] = all_df[col].astype('category')
# Show the full feature table
all_df
tpep_pickup_datetime | area | data | year | month | weekday | day | hour | minute | monthly_average_per_area | monthly_min_value_per_area | monthly_max_value_per_area | hourly_average_per_area | hourly_min_value_per_area | hourly_max_value_per_area | dayofweek_average_per_area | dayofweek_min_value_per_area | dayofweek_max_value_per_area | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2017-01-01 00:00:00 | 0 | 53.0 | 2017 | 1 | 6 | 1 | 0 | 0 | 13.712142 | 0.0 | 108.0 | 19.604323 | 0.0 | 105.0 | 16.595532 | 0.0 | 160.0 |
1 | 2017-01-01 00:00:00 | 1 | 16.0 | 2017 | 1 | 6 | 1 | 0 | 0 | 9.730287 | 0.0 | 113.0 | 11.660244 | 1.0 | 62.0 | 12.023849 | 0.0 | 113.0 |
2 | 2017-01-01 00:00:00 | 2 | 45.0 | 2017 | 1 | 6 | 1 | 0 | 0 | 50.358647 | 0.0 | 189.0 | 18.289944 | 0.0 | 61.0 | 37.474644 | 0.0 | 134.0 |
3 | 2017-01-01 00:00:00 | 3 | 38.0 | 2017 | 1 | 6 | 1 | 0 | 0 | 17.027554 | 0.0 | 58.0 | 9.224624 | 0.0 | 56.0 | 16.848684 | 0.0 | 65.0 |
4 | 2017-01-01 00:00:00 | 4 | 12.0 | 2017 | 1 | 6 | 1 | 0 | 0 | 5.450493 | 0.0 | 38.0 | 8.089286 | 0.0 | 53.0 | 5.900630 | 0.0 | 54.0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
4061227 | 2019-12-07 23:30:00 | 74 | NaN | 2019 | 12 | 5 | 7 | 23 | 30 | 4.454301 | 0.0 | 59.0 | 12.611842 | 0.0 | 56.0 | 8.400082 | 0.0 | 71.0 |
4061228 | 2019-12-07 23:30:00 | 75 | NaN | 2019 | 12 | 5 | 7 | 23 | 30 | 3.347110 | 0.0 | 37.0 | 2.109492 | 0.0 | 11.0 | 4.022889 | 0.0 | 25.0 |
4061229 | 2019-12-07 23:30:00 | 76 | NaN | 2019 | 12 | 5 | 7 | 23 | 30 | 37.886089 | 0.0 | 142.0 | 26.794173 | 1.0 | 88.0 | 40.242188 | 0.0 | 134.0 |
4061230 | 2019-12-07 23:30:00 | 77 | NaN | 2019 | 12 | 5 | 7 | 23 | 30 | 72.863911 | 0.0 | 252.0 | 21.752350 | 2.0 | 100.0 | 63.004386 | 0.0 | 221.0 |
4061231 | 2019-12-07 23:30:00 | 78 | NaN | 2019 | 12 | 5 | 7 | 23 | 30 | 115.580309 | 3.0 | 400.0 | 79.685150 | 16.0 | 274.0 | 118.393914 | 5.0 | 378.0 |
4061232 rows × 18 columns
# List every column's dtype as a one-column frame named 'dtype'
all_df.dtypes.to_frame('dtype')
dtype | |
---|---|
tpep_pickup_datetime | datetime64[ns] |
area | category |
data | float64 |
year | category |
month | category |
weekday | category |
day | category |
hour | category |
minute | category |
monthly_average_per_area | float64 |
monthly_min_value_per_area | float64 |
monthly_max_value_per_area | float64 |
hourly_average_per_area | float64 |
hourly_min_value_per_area | float64 |
hourly_max_value_per_area | float64 |
dayofweek_average_per_area | float64 |
dayofweek_min_value_per_area | float64 |
dayofweek_max_value_per_area | float64 |
# Summary statistics (count/mean/std/quantiles) for the numeric feature columns
all_df.describe()
data | monthly_average_per_area | monthly_min_value_per_area | monthly_max_value_per_area | hourly_average_per_area | hourly_min_value_per_area | hourly_max_value_per_area | dayofweek_average_per_area | dayofweek_min_value_per_area | dayofweek_max_value_per_area | |
---|---|---|---|---|---|---|---|---|---|---|
count | 4.034688e+06 | 4.061232e+06 | 4.061232e+06 | 4.061232e+06 | 4.061232e+06 | 4.061232e+06 | 4.061232e+06 | 4.061232e+06 | 4.061232e+06 | 4.061232e+06 |
mean | 7.110392e+01 | 7.111683e+01 | 1.020104e+00 | 2.488821e+02 | 7.110392e+01 | 1.134546e+01 | 1.828101e+02 | 7.110392e+01 | 1.009042e+00 | 2.463635e+02 |
std | 9.300114e+01 | 6.828550e+01 | 2.699027e+00 | 2.234952e+02 | 8.435561e+01 | 1.831785e+01 | 1.780070e+02 | 6.941478e+01 | 2.687778e+00 | 2.171023e+02 |
min | 0.000000e+00 | 2.244848e+00 | 0.000000e+00 | 1.600000e+01 | 2.509398e-01 | 0.000000e+00 | 4.000000e+00 | 2.393777e+00 | 0.000000e+00 | 1.700000e+01 |
25% | 6.000000e+00 | 8.141353e+00 | 0.000000e+00 | 5.900000e+01 | 7.249648e+00 | 0.000000e+00 | 4.100000e+01 | 8.011787e+00 | 0.000000e+00 | 6.400000e+01 |
50% | 2.700000e+01 | 5.099126e+01 | 0.000000e+00 | 1.770000e+02 | 3.150681e+01 | 2.000000e+00 | 1.190000e+02 | 4.940433e+01 | 0.000000e+00 | 1.870000e+02 |
75% | 1.080000e+02 | 1.186290e+02 | 1.000000e+00 | 3.790000e+02 | 1.133950e+02 | 1.600000e+01 | 2.822500e+02 | 1.188420e+02 | 1.000000e+00 | 3.730000e+02 |
max | 1.721000e+03 | 2.634612e+02 | 2.400000e+01 | 1.721000e+03 | 4.157904e+02 | 1.050000e+02 | 1.721000e+03 | 2.776369e+02 | 2.800000e+01 | 1.721000e+03 |
# Split all_df back into train/test frames: rows whose target 'data' is NaN are
# the test-template rows built earlier, everything else is training data.
has_target = all_df.data.notna()
train_df = all_df[has_target]
test_df = all_df[~has_target]
train_df.shape, test_df.shape, all_df.shape
((4034688, 18), (26544, 18), (4061232, 18))
# Split the training rows into train/validation sets (70/30, shuffled).
# NOTE(review): a random split on time-series data lets training see time slots
# adjacent to validation rows; kept as-is to preserve the original behavior.
features = train_df.drop(['tpep_pickup_datetime', 'data'], axis=1)
target = train_df['data']
train_x, valid_x, train_y, valid_y = train_test_split(
    features,
    target,
    shuffle=True,
    train_size=0.7,
    random_state=seed)
# Wrap the splits in LightGBM Dataset objects
lgb_train = lgb.Dataset(train_x, train_y)
lgb_valid = lgb.Dataset(valid_x, valid_y)
# Feature matrix for the prediction period (timestamp and empty target dropped)
test_x = test_df.drop(['tpep_pickup_datetime', 'data'], axis=1)
# Hyperparameters for LightGBM training
lgb_params = {
    'task': 'train',
    'objective': 'regression',  # squared-error regression
    'metric': 'rmse',           # validation metric reported during training
    'learning_rate': 0.1,
    'random_state': seed,
    'verbose': -1,              # suppress LightGBM's per-iteration logging
}
# Train for num_estimators (1000) boosting rounds, logging validation RMSE every
# 100 rounds. No early-stopping callback, so all rounds always run.
model = lgb.train(lgb_params,
    lgb_train,
    valid_sets=[lgb_valid],
    num_boost_round=num_estimators,
    callbacks=[lgb.log_evaluation(100),])
# Predict demand for every (timestamp, area) row of the test period
result = model.predict(test_x)
# Store predictions indexed by the (tpep_pickup_datetime, area) MultiIndex
result_df = pd.DataFrame({'result': result},
    index=test_df.set_index(['tpep_pickup_datetime', 'area']).index)
[100] valid_0's rmse: 23.5798 [200] valid_0's rmse: 21.4718 [300] valid_0's rmse: 20.3439 [400] valid_0's rmse: 19.6296 [500] valid_0's rmse: 19.1288 [600] valid_0's rmse: 18.7623 [700] valid_0's rmse: 18.4089 [800] valid_0's rmse: 18.1754 [900] valid_0's rmse: 17.9937 [1000] valid_0's rmse: 17.8213
# Show the first few predictions
result_df.head()
result | ||
---|---|---|
tpep_pickup_datetime | area | |
2019-12-01 | 0 | 30.381623 |
1 | 14.226875 | |
2 | 16.777371 | |
3 | 11.478026 | |
4 | 8.742821 |
# Display the final validation RMSE (without early stopping, best_score holds
# the score of the last boosting round)
score = model.best_score['valid_0']['rmse']
f'RMSE : {score:0.6f}'
'RMSE : 17.821345'
# Plot the trained model's feature importance (split counts by default)
lgb.plot_importance(model)
<Axes: title={'center': 'Feature importance'}, xlabel='Feature importance', ylabel='Features'>
# Reshape predictions into the submission layout: one row per timestamp,
# one column per area (0-78).
flat_df = result_df.reset_index('area')
# Cast area labels from category-of-strings to int so pivot orders columns 0..78
# numerically instead of lexicographically.
flat_df['area'] = flat_df['area'].astype(int)
result_df = flat_df.pivot(columns='area', values='result')
result_df.head()
area | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
tpep_pickup_datetime | |||||||||||||||||||||
2019-12-01 00:00:00 | 30.381623 | 14.226875 | 16.777371 | 11.478026 | 8.742821 | 5.154717 | 3.418391 | 21.173734 | 10.173627 | 41.067771 | ... | 99.884228 | 6.951456 | 80.057519 | 245.432535 | 24.356545 | 18.865365 | 4.283819 | 21.201495 | 24.517502 | 95.133767 |
2019-12-01 00:30:00 | 25.631803 | 11.665351 | 11.051193 | 9.645737 | 7.130247 | 3.931056 | 2.548995 | 18.657961 | 8.478401 | 32.042728 | ... | 82.517022 | 5.604167 | 73.386051 | 231.312328 | 19.861427 | 16.233227 | 3.384932 | 17.924183 | 19.482027 | 86.477271 |
2019-12-01 01:00:00 | 30.971865 | 15.091661 | 9.600159 | 9.606890 | 6.206393 | 3.984010 | 2.625970 | 16.478715 | 8.767537 | 24.569169 | ... | 69.362340 | 5.296543 | 79.165217 | 206.271113 | 20.064091 | 18.107689 | 4.780220 | 14.125106 | 15.708135 | 76.388142 |
2019-12-01 01:30:00 | 27.791511 | 13.532622 | 5.718441 | 8.740958 | 5.846688 | 3.539878 | 2.181839 | 15.157938 | 8.325181 | 18.990956 | ... | 56.451068 | 4.936838 | 73.875655 | 187.656475 | 17.715436 | 16.463136 | 4.306597 | 12.005302 | 12.635795 | 70.187652 |
2019-12-01 02:00:00 | 26.120436 | 14.669071 | 5.444948 | 7.837083 | 4.513673 | 3.060245 | 2.192699 | 12.098949 | 7.454615 | 15.637754 | ... | 40.653569 | 4.786732 | 86.643541 | 187.924685 | 19.566583 | 16.625940 | 4.610008 | 9.722015 | 11.851747 | 56.631752 |
5 rows × 79 columns
# Write the submission CSV (index = timestamp, header = area ids)
result_df.to_csv(output_file)