Let's Forecast Future Taxi Demand!
magichan
This is a sample solution for the taxi demand forecasting task using LightGBM (LGBM).
Only train_data.csv is used as input; the supplementary "district data" and "weather data" are not used.
* Runs on Google Colab; no GPU required.
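Note: since input_path below points under /gdrive, Google Drive presumably has to be mounted first. A minimal sketch, assuming the notebook runs in Colab:

# Mount Google Drive so that input_path resolves (Colab only)
from google.colab import drive
drive.mount('/gdrive')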
import lightgbm as lgb
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import random
import datetime
import os
# Configuration
input_path = "/gdrive/MyDrive/Colab Notebooks/ProbSpace/taxi_demand/input"
num_estimators = 1000
seed = 1111
output_file = 'output.csv'
# Initialize the random number generators
random.seed(seed)
np.random.seed(seed)
# Load the training data
train_df = pd.read_csv(os.path.join(input_path, 'train_data.csv'), index_col='tpep_pickup_datetime')
train_df.index = pd.to_datetime(train_df.index)
train_df.head()
5 rows × 79 columns
# Build the skeleton of the test (submission) data
test_df = pd.DataFrame(None,
                       index=pd.date_range('2019-12-01 00:00:00', '2019-12-07 23:30:00', freq='30T'),
                       columns=[f'{n}' for n in range(0, 79)])
test_df.index.name = 'tpep_pickup_datetime'
test_df.head()
# Unpivot the training data: each 'area' column becomes a row
train_df = train_df.stack(dropna=False).reset_index().rename(columns={0: 'data', 'level_1': 'area'})
train_df.head()
# Unpivot the test data the same way
test_df = test_df.stack(dropna=False).reset_index().rename(columns={0: 'data', 'level_1': 'area'})
test_df.head()
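To make the reshape concrete, here is a toy example with hypothetical values, mirroring the wide datetime-by-area layout of train_data.csv:

# Toy wide frame: 2 timestamps x 2 area columns (hypothetical values)
toy = pd.DataFrame({'0': [5, 7], '1': [2, 3]},
                   index=pd.to_datetime(['2019-01-01 00:00', '2019-01-01 00:30']))
toy.index.name = 'tpep_pickup_datetime'
# stack() moves the column labels into a new index level ('level_1' -> 'area'),
# producing one row per (datetime, area) pair
print(toy.stack(dropna=False).reset_index().rename(columns={0: 'data', 'level_1': 'area'}))
#   tpep_pickup_datetime area  data
# 0  2019-01-01 00:00:00    0     5
# 1  2019-01-01 00:00:00    1     2
# 2  2019-01-01 00:30:00    0     7
# 3  2019-01-01 00:30:00    1     3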
# Concatenate the training and test data
all_df = pd.concat([train_df, test_df], axis=0, ignore_index=True)
all_df
4061232 rows × 3 columns
train_df.shape, test_df.shape, all_df.shape
((4034688, 3), (26544, 3), (4061232, 3))
# Add date/time feature columns
all_df['year'] = all_df.tpep_pickup_datetime.dt.year
all_df['month'] = all_df.tpep_pickup_datetime.dt.month
all_df['weekday'] = all_df.tpep_pickup_datetime.dt.weekday
all_df['day'] = all_df.tpep_pickup_datetime.dt.day
all_df['hour'] = all_df.tpep_pickup_datetime.dt.hour
all_df['minute'] = all_df.tpep_pickup_datetime.dt.minute
all_df.head()
# Add the monthly mean / min / max per area
tmp_df = all_df.groupby(['month', 'area']).agg(
    monthly_average_per_area=('data', 'mean'),
    monthly_min_value_per_area=('data', 'min'),
    monthly_max_value_per_area=('data', 'max')).reset_index()
all_df = pd.merge(all_df, tmp_df, on=['month', 'area'], how='left')
# Add the hourly mean / min / max per area
tmp_df = all_df.groupby(['hour', 'area']).agg(
    hourly_average_per_area=('data', 'mean'),
    hourly_min_value_per_area=('data', 'min'),
    hourly_max_value_per_area=('data', 'max')).reset_index()
all_df = pd.merge(all_df, tmp_df, on=['hour', 'area'], how='left')
# Add the day-of-week mean / min / max per area
tmp_df = all_df.groupby(['weekday', 'area']).agg(
    dayofweek_average_per_area=('data', 'mean'),
    dayofweek_min_value_per_area=('data', 'min'),
    dayofweek_max_value_per_area=('data', 'max')).reset_index()
all_df = pd.merge(all_df, tmp_df, on=['weekday', 'area'], how='left')
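As an aside, the same row-aligned statistics can be computed without the intermediate agg/merge by using groupby(...).transform(...). A minimal sketch for the monthly features; the hourly and day-of-week features follow the same pattern:

# Equivalent alternative: transform keeps row alignment, so no merge is needed
grp = all_df.groupby(['month', 'area'])['data']
all_df['monthly_average_per_area'] = grp.transform('mean')  # mean/min/max ignore the NaN test rows
all_df['monthly_min_value_per_area'] = grp.transform('min')
all_df['monthly_max_value_per_area'] = grp.transform('max')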
# Convert the area and date/time columns to the category dtype
# (LightGBM picks up pandas 'category' columns as categorical features)
all_df['area'] = all_df['area'].astype('category')
all_df['year'] = all_df['year'].astype('category')
all_df['month'] = all_df['month'].astype('category')
all_df['weekday'] = all_df['weekday'].astype('category')
all_df['day'] = all_df['day'].astype('category')
all_df['hour'] = all_df['hour'].astype('category')
all_df['minute'] = all_df['minute'].astype('category')
# Display the full dataframe
all_df
4061232 rows × 18 columns
# Display the dtype of every feature
all_df.dtypes.rename('dtype').to_frame()
# Display summary statistics for each feature
all_df.describe()
# Re-split into train_df and test_df (simply by whether 'data' is NaN)
train_df = all_df[all_df.data.notna()]
test_df = all_df[all_df.data.isna()]
train_df.shape, test_df.shape, all_df.shape
((4034688, 18), (26544, 18), (4061232, 18))
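A quick consistency check on the split; the test skeleton spans 7 days of half-hour slots across 79 areas, which matches the 26544 rows above:

# 7 days x 48 half-hour slots x 79 areas = 26544 test rows
assert len(test_df) == 7 * 48 * 79
assert len(train_df) + len(test_df) == len(all_df)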
# Split train_df into training and validation sets
train_x, valid_x, train_y, valid_y = train_test_split(
    train_df.drop(['tpep_pickup_datetime', 'data'], axis=1),
    train_df['data'],
    shuffle=True, train_size=0.7, random_state=seed)

# Create the LightGBM datasets
lgb_train = lgb.Dataset(train_x, train_y)
lgb_valid = lgb.Dataset(valid_x, valid_y)

# Build the test features from test_df
test_x = test_df.drop(['tpep_pickup_datetime', 'data'], axis=1)

# Set the hyperparameters
lgb_params = {
    'task': 'train',
    'objective': 'regression',
    'metric': 'rmse',
    'learning_rate': 0.1,
    'random_state': seed,
    'verbose': -1,
}

# Train
model = lgb.train(lgb_params, lgb_train,
                  valid_sets=[lgb_valid],
                  num_boost_round=num_estimators,
                  callbacks=[lgb.log_evaluation(100)])

# Predict on the test data
result = model.predict(test_x)

# Store the predictions in a dataframe
result_df = pd.DataFrame({'result': result},
                         index=test_df.set_index(['tpep_pickup_datetime', 'area']).index)
[100]	valid_0's rmse: 23.5798
[200]	valid_0's rmse: 21.4718
[300]	valid_0's rmse: 20.3439
[400]	valid_0's rmse: 19.6296
[500]	valid_0's rmse: 19.1288
[600]	valid_0's rmse: 18.7623
[700]	valid_0's rmse: 18.4089
[800]	valid_0's rmse: 18.1754
[900]	valid_0's rmse: 17.9937
[1000]	valid_0's rmse: 17.8213
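The validation RMSE is still falling at round 1000, so more rounds would likely help. A minimal sketch using LightGBM's early_stopping callback (assumes a reasonably recent LightGBM; 10000 is an arbitrary upper bound, not a tuned value):

# Sketch: let early stopping pick the number of rounds instead of fixing it at 1000
model = lgb.train(lgb_params, lgb_train,
                  valid_sets=[lgb_valid],
                  num_boost_round=10000,  # upper bound; early stopping decides when to stop
                  callbacks=[lgb.log_evaluation(100),
                             lgb.early_stopping(stopping_rounds=100)])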
# Display the predictions
result_df.head()
# Display the score
score = model.best_score['valid_0']['rmse']
f'RMSE : {score:0.6f}'
'RMSE : 17.821345'
# Plot the feature importance
lgb.plot_importance(model)
<Axes: title={'center': 'Feature importance'}, xlabel='Feature importance', ylabel='Features'>
# Reshape the predictions into the submission format
result_df = result_df.reset_index('area')
result_df['area'] = result_df['area'].astype(int)
result_df = result_df.pivot(columns='area', values='result')
result_df.head()
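Before writing the file, a quick shape check: per the skeleton built earlier, the submission should have one row per half-hour slot and one column per area.

# 7 days x 48 half-hour slots = 336 rows, 79 area columns
assert result_df.shape == (336, 79)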
# Write the predictions to a CSV file
result_df.to_csv(output_file)