Let's Forecast Future Taxi Demand!
T.T
Thank you to all the participants for your hard work, and thank you to the organizers for hosting this competition. Here is my 3rd place solution.
In this competition I tried an approach of building the model while watching the LB score, and I suspect that is what led to the gap between the Public LB and the Private LB. Honestly, I could not get a better score than this, so this is the limit of my current ability. Everything from model building onward is mostly copy-pasted from my previous competition, so I think there is still room to improve.

Until around my 130th submission, nothing I tried could break through the Public LB 20 barrier. The score began to improve once I reframed the problem as predicting the week after Thanksgiving, a holiday around which many people customarily take a four-day weekend. In addition to the solution, I also share the Thanksgiving patterns I found through EDA and some online posts about taxi demand.

Note: refactoring changed the score slightly (Private LB 17.54004 -> 17.58470). Please bear that in mind when reading this 3rd place solution.
from google.colab import drive
drive.mount('/content/drive')
Mounted at /content/drive
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
class CFG:
    SEED = 42
    IMP_TH = 90

class PATHS:
    MAIN_DIR = '/content/drive/MyDrive/share/competition/タクシー需要予測'
    TRAIN_DATA = MAIN_DIR + '/data/train_data.csv'
    WEATHER_DATA = MAIN_DIR + '/data/nyc_weather_2017_2019.csv'
    ZONE_DATA = MAIN_DIR + '/data/taxi_zones.csv'
    SUBMIT_DATA = MAIN_DIR + '/output/3rd_place_solution+EDA.csv'
from scipy import signal

def signal_low_pass_filter(input_signal):
    input_signal = np.nan_to_num(input_signal, nan=0.0)
    b, a = signal.iirfilter(4, 0.1, btype='lowpass', analog=False)
    # Compute the initial filter state so the filter starts from the first sample's level
    zi = signal.lfilter_zi(b, a)
    filtered_signal, zf = signal.lfilter(b, a, input_signal, zi=zi * input_signal[0])
    return filtered_signal
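As a quick sanity check (not part of the original pipeline), the filter can be applied to a synthetic two-tone signal; with the normalized cutoff of 0.1, the high-frequency component should be largely suppressed. The sample signal below is made up for illustration.

# Minimal sketch: apply the filter to a hypothetical two-tone signal
t = np.linspace(0, 1, 500)  # 500 samples over 1 s
noisy = np.sin(2 * np.pi * 2 * t) + 0.3 * np.sin(2 * np.pi * 60 * t)
smooth = signal_low_pass_filter(noisy)
print(noisy.std(), smooth.std())  # the filtered signal should vary less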
# Load the training data
train_data = pd.read_csv(PATHS.TRAIN_DATA, index_col='tpep_pickup_datetime', low_memory=False)
area_cols_name = train_data.columns

# Generate test_data
date_index = pd.date_range(start='2019/12/01 00:00:00', end='2019/12/07 23:30:00', freq='30T')
test_data = pd.DataFrame(columns=train_data.columns, index=date_index)
train_data = pd.concat([train_data, test_data], axis=0)

# Decompose the datetime index
train_data['Year'] = pd.to_datetime(train_data.index).year
train_data['Month'] = pd.to_datetime(train_data.index).month
train_data['Day'] = pd.to_datetime(train_data.index).day
train_data['hour'] = pd.to_datetime(train_data.index).hour
train_data['min'] = pd.to_datetime(train_data.index).minute
train_data['weekday'] = pd.to_datetime(train_data.index).weekday
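As a quick check (illustrative, not in the original notebook): the generated test index covers 2019-12-01 through 2019-12-07 at 30-minute intervals, i.e. 2 * 24 * 7 = 336 rows, which matches the slice lengths used later in the modeling step.

# Sanity check: one week at 30-minute resolution is 336 rows
assert len(date_index) == 2 * 24 * 7
print(date_index[0], date_index[-1])  # 2019-12-01 00:00:00 2019-12-07 23:30:00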
# Applying a strong low-pass filter shows that taxi usage tends to drop on Thanksgiving Day
# (2017-11-23, 2018-11-22, 2019-11-28)
# 2018-11-15 also trends downward, but it differs in that it rises at the end and its absolute level is larger
# On Thanksgiving Day many people reportedly celebrate at home, and many take a four-day weekend
# https://www.tripadvisor.co.uk/ShowTopic-g60763-i5-k5899614-Taxi_from_JFK_on_black_Friday-New_York_City_New_York.html
# Even without discovering it via EDA, it is one of the legal holidays permitted for use
# https://www.cs.ny.gov/attendance_leave/2019_legal_holidays.cfm
plt_df = train_data[train_data['Month'] == 11]
for index in range(7):
    # Get the data for the specified weekday
    weekday_data = plt_df[plt_df['weekday'] == index]
    # Overlay the days of each weekday in one plot (zone 33, which has many pickups, is used to observe the pattern)
    plt.figure(figsize=(8, 3))
    for _, plot_data in weekday_data.groupby(weekday_data['Day']):
        plot_data['filtered_33'] = signal_low_pass_filter(plot_data['33'].values)
        plot_data['filtered_33'] = (plot_data['filtered_33'] - plot_data['filtered_33'].min()) / (plot_data['filtered_33'].max() - plot_data['filtered_33'].min())
        plt.plot(plot_data['filtered_33'].values, label=plot_data.index[0])
    # Show the legend outside the figure
    plt.legend(loc='upper left', bbox_to_anchor=(1, 1), bbox_transform=plt.gcf().transFigure)
    plt.show()
import re

# Custom replacement function; with .replace, some rows ended up as NaN
def replace_text_re(text):
    if isinstance(text, str):  # only process string elements
        # Regex matching runs of ASCII letters
        pattern = re.compile(r'[a-zA-Z]+')
        # Strip the matching letters from the string
        replaced_text = pattern.sub('', text)
        return replaced_text
    else:
        return text  # non-strings are returned unchanged

# Load the weather data
weather_data_org = pd.read_csv(PATHS.WEATHER_DATA, index_col='DATE', low_memory=False)
weather_data_org.index = pd.to_datetime(weather_data_org.index)

# Fill the missing record with the immediately preceding value
lack_weather_info = pd.DataFrame(weather_data_org.loc['2019-05-09 22:51:00']).T.rename_axis('DATE')
lack_weather_info.index = ['2019-05-09 23:00:00']
weather_data_org = pd.concat([weather_data_org, lack_weather_info], axis=0)
weather_data_org.fillna(method='ffill', inplace=True)

# Decompose the datetime index
weather_data_org['Year'] = pd.to_datetime(weather_data_org.index).year
weather_data_org['Month'] = pd.to_datetime(weather_data_org.index).month
weather_data_org['Day'] = pd.to_datetime(weather_data_org.index).day
weather_data_org['hour'] = pd.to_datetime(weather_data_org.index).hour

# Data cleansing
weather_data_org['HourlyPrecipitation'] = weather_data_org['HourlyPrecipitation'].str.replace('s', '')
weather_data_org['HourlyPrecipitation'] = weather_data_org['HourlyPrecipitation'].str.replace('T', '0')
weather_data_org['HourlyPrecipitation'] = pd.to_numeric(weather_data_org['HourlyPrecipitation'])

# .replace did not work for these columns (reason unknown), so the custom function is used instead
weather_data_org['HourlyDryBulbTemperature'] = weather_data_org['HourlyDryBulbTemperature'].apply(replace_text_re)
weather_data_org['HourlyDryBulbTemperature'] = weather_data_org['HourlyDryBulbTemperature'].astype(np.float16)
weather_data_org['HourlyDewPointTemperature'] = weather_data_org['HourlyDewPointTemperature'].apply(replace_text_re)
weather_data_org['HourlyDewPointTemperature'] = weather_data_org['HourlyDewPointTemperature'].astype(np.float16)
weather_data_org['HourlyWetBulbTemperature'] = weather_data_org['HourlyWetBulbTemperature'].astype(np.float16)
weather_data_org['HourlyAltimeterSetting'] = weather_data_org['HourlyAltimeterSetting'].apply(replace_text_re)
weather_data_org['HourlyAltimeterSetting'] = weather_data_org['HourlyAltimeterSetting'].astype(np.float16)
weather_data_org['HourlySeaLevelPressure'] = weather_data_org['HourlySeaLevelPressure'].apply(replace_text_re)
weather_data_org['HourlySeaLevelPressure'] = weather_data_org['HourlySeaLevelPressure'].astype(np.float16)
weather_data_org['HourlyStationPressure'] = weather_data_org['HourlyStationPressure'].apply(replace_text_re)
weather_data_org['HourlyStationPressure'] = weather_data_org['HourlyStationPressure'].astype(np.float16)
weather_data_org['HourlyVisibility'] = weather_data_org['HourlyVisibility'].apply(replace_text_re)
weather_data_org['HourlyVisibility'] = weather_data_org['HourlyVisibility'].astype(np.float16)
weather_data_org['HourlyWindSpeed'] = weather_data_org['HourlyWindSpeed'].astype(np.int16)
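To make the cleaning concrete: NOAA hourly fields mix numeric strings with letter suffixes (e.g. suspect-value flags like 's') and 'T' for trace precipitation. The sample values below are hypothetical, but they show how the two replacement paths behave.

# Hypothetical raw values illustrating the two cleaning paths
sample_precip = pd.Series(['0.02s', 'T', '0.10'])
print(pd.to_numeric(sample_precip.str.replace('s', '').str.replace('T', '0')).tolist())  # [0.02, 0.0, 0.1]
print(replace_text_re('72s'))   # '72' -- letters stripped by the regex
print(replace_text_re(np.nan))  # nan -- non-strings pass through unchanged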
def predict_weather(row):
    if row['HourlyPrecipitation'] < 0.1:
        return 0  # little precipitation: predict clear weather
    elif row['HourlyPressureChange'] > 0:
        return 1  # rising pressure: predict stable weather
    else:
        return 2  # falling pressure: predict unstable weather

# Add weather feature columns
# Predicted weather category
weather_data_org['predict_weather'] = weather_data_org.apply(predict_weather, axis=1)
# Wind chill index
weather_data_org['Wind Chill Index'] = 13.12 + 0.6215 * weather_data_org['HourlyWetBulbTemperature'] - 11.37 * weather_data_org['HourlyWindSpeed'] + 0.3965 * weather_data_org['HourlyWetBulbTemperature'] * weather_data_org['HourlyWindSpeed']
# Discomfort index
T_w = weather_data_org['HourlyDewPointTemperature'] - (100 - weather_data_org['HourlyRelativeHumidity']) / 5
weather_data_org['discomfort index'] = T_w - 0.55 * (1 - weather_data_org['HourlyRelativeHumidity'] / 100) * (T_w - 58)
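A toy row (hypothetical values) shows how the derived features come out; note that this wind chill variant is linear in wind speed and takes its temperature from the wet-bulb column.

# Toy row with hypothetical values
row = pd.Series({'HourlyPrecipitation': 0.0, 'HourlyPressureChange': -0.02})
print(predict_weather(row))  # 0: little precipitation is treated as clear
T, V = 40.0, 10.0  # hypothetical wet-bulb temperature and wind speed
print(13.12 + 0.6215 * T - 11.37 * V + 0.3965 * T * V)  # 82.88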
# Compute hourly means (only the features that improved the score)
feature_cols = ['HourlyAltimeterSetting', 'HourlyPressureTendency', 'HourlyVisibility', 'HourlyStationPressure',
                'predict_weather', 'Wind Chill Index', 'discomfort index',
                'Year', 'Month', 'Day', 'hour']
weather_data = weather_data_org[feature_cols]
group_cols = ['Year', 'Month', 'Day', 'hour']
weather_feat_df = weather_data.groupby(group_cols).mean().reset_index(drop=True)

# Align the time resolution with train_data (hourly -> 30-minute)
weather_feat_df = weather_feat_df.loc[weather_feat_df.index.repeat(2)].reset_index(drop=True)
weather_feat_df = weather_feat_df[:len(train_data)]
weather_feat_df = weather_feat_df.fillna(0)
weather_cols = weather_feat_df.columns
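The hourly aggregates are stretched onto the 30-minute grid simply by repeating each hourly row twice. A toy frame (hypothetical values) makes the alignment visible:

# Toy frame: repeating each hourly row twice yields a 30-minute grid
hourly = pd.DataFrame({'temp': [50, 52]})
half_hourly = hourly.loc[hourly.index.repeat(2)].reset_index(drop=True)
print(half_hourly['temp'].tolist())  # [50, 50, 52, 52]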
def add_feats_function(df):
    # Normalize each year's level to the 2019 level (using November means)
    for col in area_cols_name:
        Nov2017_mean = df[(df['Year'] == 2017) & (df['Month'] == 11)][col].mean()
        Nov2018_mean = df[(df['Year'] == 2018) & (df['Month'] == 11)][col].mean()
        Nov2019_mean = df[(df['Year'] == 2019) & (df['Month'] == 11)][col].mean()
        df[col] = pd.concat([
            df[df['Year'] == 2017][col] / Nov2017_mean * Nov2019_mean,
            df[df['Year'] == 2018][col] / Nov2018_mean * Nov2019_mean,
            df[df['Year'] == 2019][col] / Nov2019_mean * Nov2019_mean
        ], axis=0)

    # Waveforms after a strong low-pass filter
    iir_col_list = []
    for col in area_cols_name:
        df[f'{col}_iir'] = signal_low_pass_filter(df[col].values)
        iir_col_list.append(f'{col}_iir')

    df = df.drop(['Year', 'Month', 'Day'], axis=1)

    # Per-weekday features that improved the score
    group_cols = ['hour', 'min', 'weekday']
    median_df = df.groupby(group_cols).transform('median').add_suffix('_weekday_median')
    std_df = df.groupby(group_cols).transform('std').add_suffix('_weekday_std')
    df = pd.concat([df, median_df, std_df], axis=1)

    # Weather-related features that improved the score
    for col in weather_cols:
        df = pd.concat([df,
                        df[col].diff(1).fillna(method='bfill').rename(f'{col}_diff'),
                        df[col].shift(1).fillna(method='bfill').rename(f'{col}_shift'),
                        df[col].ewm(span=2 * 24).mean().fillna(method='bfill').rename(f'{col}_ewm_mean24'),
                        ], axis=1)

    df = df.drop(iir_col_list, axis=1)
    return df

df_org = pd.concat([train_data.reset_index(), weather_feat_df], axis=1)
# The date range that gave the best score
df_org = df_org[(df_org['Month'] == 11) | ((df_org['Month'] == 12) & (df_org['Day'] <= 7))]
df_org = df_org.set_index('index')
df_org = add_feats_function(df_org)
pd.set_option('display.max_columns', 1000)
display(df_org)
5328 rows × 440 columns
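The weekday features rely on groupby(...).transform, which returns a result aligned to the original rows, so the per-(hour, min, weekday) median and std can simply be concatenated as new columns. A minimal sketch on hypothetical data:

# transform('median') keeps the original row alignment
toy = pd.DataFrame({'weekday': [0, 0, 1, 1], 'rides': [10, 20, 30, 50]})
toy['rides_weekday_median'] = toy.groupby('weekday')['rides'].transform('median')
print(toy['rides_weekday_median'].tolist())  # [15.0, 15.0, 40.0, 40.0]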
! pip install -q catboost
import lightgbm as lgb
from catboost import Pool
from catboost import CatBoostRegressor
from sklearn.ensemble import RandomForestRegressor
from tqdm import tqdm

min_band = '0'
max_band = '78'
TARGET_MIN = 0
TARGET_MAX = 78 + 1

# Use only the week following Thanksgiving as training data
train = pd.concat([df_org['2017-11-26 00:00:00':'2017-12-02 23:30:00'],
                   df_org['2018-11-25 00:00:00':'2018-12-01 23:30:00']], axis=0)
test = df_org[-2 * 24 * 7:]
df = pd.concat([train, test], axis=0)

predict_list = []
for target in tqdm(range(TARGET_MIN, TARGET_MAX)):
    band_l = target - 1
    band_h = target + 1

    # Extract the target variable: keep only the current target among the zone columns
    if target == 0:
        target_df = df.drop(columns=df.loc[:, str(band_h):max_band].columns)
    elif target == 78:
        target_df = df.drop(columns=df.loc[:, min_band:str(band_l)].columns)
    else:
        if target == 1:
            target_df = df.drop([min_band], axis=1)
            target_df = target_df.drop(columns=df.loc[:, str(band_h):max_band].columns)
        elif target == 77:
            target_df = df.drop(columns=df.loc[:, min_band:str(band_l)].columns)
            target_df = target_df.drop([max_band], axis=1)
        else:
            target_df = df.drop(columns=df.loc[:, min_band:str(band_l)].columns)
            target_df = target_df.drop(columns=df.loc[:, str(band_h):max_band].columns)

    target_train = target_df[:-2 * 24 * 7]
    target_test = target_df[-2 * 24 * 7:]
    target_test = target_test.drop(str(target), axis=1)

    X_train = target_train.drop(str(target), axis=1).copy()  # explanatory variables for training
    y_train = target_train[str(target)].copy()  # target variable for training

    # LightGBM (max_depth=10)
    lgb_model_10 = lgb.LGBMRegressor(verbosity=-1, seed=CFG.SEED, max_depth=10)
    lgb_model_10 = lgb_model_10.fit(X_train, y_train)
    lgb_predict_10 = lgb_model_10.predict(target_test)

    # LightGBM (max_depth=50)
    lgb_model_50 = lgb.LGBMRegressor(verbosity=-1, seed=CFG.SEED, max_depth=50)
    lgb_model_50 = lgb_model_50.fit(X_train, y_train)
    lgb_predict_50 = lgb_model_50.predict(target_test)

    # CatBoost
    train_pool = Pool(X_train, y_train)
    cat_model = CatBoostRegressor(logging_level='Silent', random_seed=CFG.SEED)
    cat_model = cat_model.fit(train_pool)
    cat_predict = cat_model.predict(target_test)

    # Random forest
    rg_model = RandomForestRegressor(n_jobs=-1, random_state=CFG.SEED)
    rg_model = rg_model.fit(X_train, y_train)
    rg_predict = rg_model.predict(target_test)

    # Simple average of the four models
    predict = (lgb_predict_10 + lgb_predict_50 + cat_predict + rg_predict) / 4
    # Pickup counts cannot be negative, so clip in post-processing
    predict = [elem if elem > 0 else 0 for elem in predict]
    predict_list.append(predict)
100%|██████████| 79/79 [2:18:56<00:00, 105.53s/it]
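The per-target column dropping above relies on pandas label-based slicing over the ordered zone columns '0'..'78': for each target, every other zone's raw column is removed (those counts are unknown for the test week), leaving the target zone column plus the engineered features. A minimal sketch on a hypothetical frame:

# Label-based column slicing keeps only the target zone column plus features
toy = pd.DataFrame(columns=['0', '1', '2', '3', 'feat'], dtype=float)
target = 2
kept = toy.drop(columns=toy.loc[:, '0':str(target - 1)].columns)
kept = kept.drop(columns=toy.loc[:, str(target + 1):'3'].columns)
print(kept.columns.tolist())  # ['2', 'feat']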
# Create the submission
submit_df = pd.DataFrame(np.transpose(predict_list), columns=area_cols_name)
submit_df = submit_df.astype(int)
submit_df.columns = area_cols_name
submit_df.index = pd.date_range(start='2019/12/01 00:00:00', end='2019/12/07 23:30:00', freq='30T')
submit_df.index.name = 'tpep_pickup_datetime'
submit_df.to_csv(PATHS.SUBMIT_DATA)
display(submit_df.describe())