mini_forest
import pandas as pd
pd.options.display.max_columns = 100
pd.options.display.max_rows = 999
pd.options.display.float_format = '{:.6f}'.format
import numpy as np
import datetime
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
# %matplotlib notebook
import matplotlib
import matplotlib.pyplot as plt
from matplotlib import cm
from google.colab import drive
drive.mount('/content/drive')
Mounted at /content/drive
path = "/content/drive/MyDrive/tmp/7_probspace/3_taxi/"
weather = pd.read_csv(path+'data/nyc_weather_2017_2019.csv', parse_dates=["DATE"]).rename(columns={'DATE':'ds'})
train = pd.read_csv(path+'data/train_data.csv', parse_dates=["tpep_pickup_datetime"]).rename(columns={'tpep_pickup_datetime':'ds'})
train[train["ds"]>="2019-11-30"].head(5)
ds | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
51024 | 2019-11-30 00:00:00 | 5.000000 | 6.000000 | 6.000000 | 3.000000 | 7.000000 | 4.000000 | 0.000000 | 18.000000 | 10.000000 | 13.000000 | 11.000000 | 157.000000 | 37.000000 | 0.000000 | 2.000000 | 1.000000 | 86.000000 | 14.000000 | 23.000000 | 161.000000 | 17.000000 | 10.000000 | 62.000000 | 6.000000 | 86.000000 | 89.000000 | 51.000000 | 105.000000 | 7.000000 | 13.000000 | 1.000000 | 87.000000 | 39.000000 | 46.000000 | 18.000000 | 58.000000 | 83.000000 | 14.000000 | 58.000000 | 4.000000 | 4.000000 | 115.000000 | 13.000000 | 7.000000 | 48.000000 | 101.000000 | 52.000000 | 101.000000 | 110.000000 | 18.000000 | 52.000000 | 0.000000 | 4.000000 | 83.000000 | 0.000000 | 9.000000 | 41.000000 | 2.000000 | 1.000000 | 3.000000 | 55.000000 | 187.000000 | 33.000000 | 10.000000 | 26.000000 | 81.000000 | 30.000000 | 46.000000 | 33.000000 | 44.000000 | 3.000000 | 27.000000 | 140.000000 | 1.000000 | 0.000000 | 1.000000 | 4.000000 | 10.000000 | 43.000000 |
51025 | 2019-11-30 00:30:00 | 11.000000 | 5.000000 | 5.000000 | 5.000000 | 3.000000 | 1.000000 | 0.000000 | 13.000000 | 6.000000 | 10.000000 | 6.000000 | 138.000000 | 18.000000 | 0.000000 | 2.000000 | 0.000000 | 66.000000 | 13.000000 | 20.000000 | 176.000000 | 9.000000 | 13.000000 | 66.000000 | 4.000000 | 52.000000 | 61.000000 | 38.000000 | 84.000000 | 2.000000 | 15.000000 | 0.000000 | 106.000000 | 22.000000 | 4.000000 | 20.000000 | 37.000000 | 67.000000 | 10.000000 | 66.000000 | 2.000000 | 5.000000 | 123.000000 | 5.000000 | 3.000000 | 65.000000 | 51.000000 | 32.000000 | 66.000000 | 104.000000 | 10.000000 | 47.000000 | 1.000000 | 3.000000 | 45.000000 | 7.000000 | 7.000000 | 30.000000 | 1.000000 | 4.000000 | 5.000000 | 36.000000 | 141.000000 | 38.000000 | 7.000000 | 22.000000 | 79.000000 | 22.000000 | 39.000000 | 27.000000 | 56.000000 | 3.000000 | 31.000000 | 124.000000 | 5.000000 | 0.000000 | 2.000000 | 7.000000 | 6.000000 | 48.000000 |
51026 | 2019-11-30 01:00:00 | 9.000000 | 7.000000 | 6.000000 | 6.000000 | 5.000000 | 1.000000 | 0.000000 | 12.000000 | 4.000000 | 10.000000 | 11.000000 | 118.000000 | 25.000000 | 0.000000 | 2.000000 | 0.000000 | 62.000000 | 7.000000 | 21.000000 | 168.000000 | 8.000000 | 5.000000 | 61.000000 | 1.000000 | 31.000000 | 48.000000 | 27.000000 | 89.000000 | 6.000000 | 23.000000 | 4.000000 | 50.000000 | 31.000000 | 0.000000 | 12.000000 | 37.000000 | 49.000000 | 3.000000 | 42.000000 | 3.000000 | 2.000000 | 141.000000 | 14.000000 | 3.000000 | 87.000000 | 35.000000 | 27.000000 | 45.000000 | 60.000000 | 8.000000 | 45.000000 | 4.000000 | 4.000000 | 76.000000 | 0.000000 | 11.000000 | 31.000000 | 0.000000 | 2.000000 | 3.000000 | 39.000000 | 107.000000 | 42.000000 | 7.000000 | 15.000000 | 56.000000 | 20.000000 | 36.000000 | 24.000000 | 27.000000 | 3.000000 | 31.000000 | 107.000000 | 5.000000 | 2.000000 | 1.000000 | 4.000000 | 5.000000 | 38.000000 |
51027 | 2019-11-30 01:30:00 | 10.000000 | 5.000000 | 4.000000 | 6.000000 | 1.000000 | 1.000000 | 0.000000 | 8.000000 | 3.000000 | 7.000000 | 9.000000 | 101.000000 | 13.000000 | 0.000000 | 2.000000 | 0.000000 | 79.000000 | 5.000000 | 9.000000 | 172.000000 | 6.000000 | 3.000000 | 56.000000 | 2.000000 | 44.000000 | 44.000000 | 29.000000 | 114.000000 | 3.000000 | 16.000000 | 5.000000 | 20.000000 | 20.000000 | 0.000000 | 7.000000 | 39.000000 | 29.000000 | 4.000000 | 59.000000 | 2.000000 | 2.000000 | 136.000000 | 7.000000 | 2.000000 | 89.000000 | 28.000000 | 31.000000 | 42.000000 | 66.000000 | 6.000000 | 25.000000 | 2.000000 | 2.000000 | 70.000000 | 4.000000 | 4.000000 | 33.000000 | 1.000000 | 3.000000 | 7.000000 | 31.000000 | 74.000000 | 37.000000 | 10.000000 | 15.000000 | 49.000000 | 21.000000 | 29.000000 | 21.000000 | 26.000000 | 1.000000 | 43.000000 | 95.000000 | 4.000000 | 7.000000 | 1.000000 | 4.000000 | 2.000000 | 31.000000 |
51028 | 2019-11-30 02:00:00 | 13.000000 | 9.000000 | 0.000000 | 4.000000 | 3.000000 | 1.000000 | 0.000000 | 6.000000 | 7.000000 | 7.000000 | 5.000000 | 83.000000 | 15.000000 | 0.000000 | 3.000000 | 0.000000 | 68.000000 | 5.000000 | 5.000000 | 131.000000 | 8.000000 | 3.000000 | 49.000000 | 0.000000 | 32.000000 | 44.000000 | 26.000000 | 103.000000 | 2.000000 | 15.000000 | 2.000000 | 21.000000 | 12.000000 | 0.000000 | 6.000000 | 35.000000 | 17.000000 | 4.000000 | 69.000000 | 3.000000 | 3.000000 | 161.000000 | 6.000000 | 0.000000 | 67.000000 | 14.000000 | 20.000000 | 34.000000 | 80.000000 | 5.000000 | 27.000000 | 2.000000 | 3.000000 | 44.000000 | 0.000000 | 7.000000 | 18.000000 | 1.000000 | 1.000000 | 9.000000 | 25.000000 | 83.000000 | 30.000000 | 9.000000 | 13.000000 | 39.000000 | 14.000000 | 9.000000 | 18.000000 | 9.000000 | 3.000000 | 67.000000 | 85.000000 | 7.000000 | 4.000000 | 3.000000 | 5.000000 | 3.000000 | 28.000000 |
weather[weather["ds"]<="2019-11-30 02:00:00"].tail(5)
ds | REPORT_TYPE | SOURCE | HourlyAltimeterSetting | HourlyDewPointTemperature | HourlyDryBulbTemperature | HourlyPrecipitation | HourlyPresentWeatherType | HourlyPressureChange | HourlyPressureTendency | HourlyRelativeHumidity | HourlySkyConditions | HourlySeaLevelPressure | HourlyStationPressure | HourlyVisibility | HourlyWetBulbTemperature | HourlyWindDirection | HourlyWindGustSpeed | HourlyWindSpeed | REM | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
39951 | 2019-11-29 22:51:00 | FM-15 | 7 | 30.11 | 23 | 37 | 0.00 | NaN | NaN | NaN | 57.000000 | BKN:07 140 OVC:08 250 | 30.11 | 30.08 | 10.00 | 32.000000 | 340 | NaN | 11.000000 | MET10111/29/19 22:51:01 METAR KLGA 300351Z 340... |
39952 | 2019-11-29 23:51:00 | FM-15 | 7 | 30.09 | 22 | 36 | 0.00 | NaN | NaN | NaN | 57.000000 | OVC:08 150 | 30.09 | 30.06 | 10.00 | 31.000000 | 350 | NaN | 10.000000 | MET10411/29/19 23:51:02 METAR KLGA 300451Z 350... |
39953 | 2019-11-30 00:51:00 | FM-15 | 7 | 30.08 | 21 | 36 | 0.00 | NaN | 0.040000 | 6.000000 | 55.000000 | OVC:08 150 | 30.08 | 30.05 | 10.00 | 31.000000 | 360 | NaN | 11.000000 | MET11211/30/19 00:51:02 METAR KLGA 300551Z 360... |
39954 | 2019-11-30 01:00:00 | FM-12 | 4 | NaN | 21 | 36 | NaN | NaN | 0.040000 | 6.000000 | 55.000000 | NaN | 30.08 | 30.04 | 9.94 | 31.000000 | 360 | NaN | 11.000000 | SYN08072503 32966 83610 10022 21061 30172 4018... |
39955 | 2019-11-30 01:51:00 | FM-15 | 7 | 30.08 | 20 | 36 | 0.00 | NaN | NaN | NaN | 52.000000 | OVC:08 150 | 30.08 | 30.05 | 10.00 | 30.000000 | 350 | NaN | 15.000000 | MET09911/30/19 01:51:02 METAR KLGA 300651Z 350... |
上記データを例として、2019-11-30 01:00:00の予測をする時
〜〜質問はここまで〜〜
以下、直前のデータまで使用できると仮定した場合の集計と可視化サンプル置いておきます。
非常に雑な集計で申し訳ありません。ご参考までに。
# 縦持ち
data_df = train.copy()
data_df = pd.melt(
data_df, id_vars=["ds"], value_vars=list(map(str, range(79))),
var_name="id", value_name="y"
)
data_df.loc[:,"id"] = data_df["id"].astype("int64")
# お天気情報merge用の列追加
data_df.loc[:,"ds_h"] = pd.to_datetime(data_df["ds"].dt.strftime('%Y-%m-%d %H')+":00:00")
data_df
ds | id | y | ds_h | |
---|---|---|---|---|
0 | 2017-01-01 00:00:00 | 0 | 53.000000 | 2017-01-01 00:00:00 |
1 | 2017-01-01 00:30:00 | 0 | 83.000000 | 2017-01-01 00:00:00 |
2 | 2017-01-01 01:00:00 | 0 | 69.000000 | 2017-01-01 01:00:00 |
3 | 2017-01-01 01:30:00 | 0 | 76.000000 | 2017-01-01 01:00:00 |
4 | 2017-01-01 02:00:00 | 0 | 101.000000 | 2017-01-01 02:00:00 |
... | ... | ... | ... | ... |
4034683 | 2019-11-30 21:30:00 | 78 | 75.000000 | 2019-11-30 21:00:00 |
4034684 | 2019-11-30 22:00:00 | 78 | 85.000000 | 2019-11-30 22:00:00 |
4034685 | 2019-11-30 22:30:00 | 78 | 72.000000 | 2019-11-30 22:00:00 |
4034686 | 2019-11-30 23:00:00 | 78 | 88.000000 | 2019-11-30 23:00:00 |
4034687 | 2019-11-30 23:30:00 | 78 | 78.000000 | 2019-11-30 23:00:00 |
4034688 rows × 4 columns
weather.loc[:,"year"] = weather["ds"].dt.year
weather.loc[:,"month"] = weather["ds"].dt.month
weather.loc[:,"day"] = weather["ds"].dt.day
weather.loc[:,"hour"] = weather["ds"].dt.hour
weather.loc[:,"minute"] = weather["ds"].dt.minute
weather.loc[:,"ds_h"] = pd.to_datetime(
weather["year"].astype(str) +"-"+ weather["month"].astype(str).str.zfill(2) +"-"+ weather["day"].astype(str).str.zfill(2) +
" "+ weather["hour"].astype(str).str.zfill(2) + ":00:00"
)
# 全部一時間後に参照できるようにする
weather.loc[:,"ds_h"] = weather["ds_h"] + datetime.timedelta(hours=+1)
weather[["ds","year","month","day","hour","minute","ds_h"]].head(10)
ds | year | month | day | hour | minute | ds_h | |
---|---|---|---|---|---|---|---|
0 | 2017-01-01 00:51:00 | 2017 | 1 | 1 | 0 | 51 | 2017-01-01 01:00:00 |
1 | 2017-01-01 01:00:00 | 2017 | 1 | 1 | 1 | 0 | 2017-01-01 02:00:00 |
2 | 2017-01-01 01:51:00 | 2017 | 1 | 1 | 1 | 51 | 2017-01-01 02:00:00 |
3 | 2017-01-01 02:51:00 | 2017 | 1 | 1 | 2 | 51 | 2017-01-01 03:00:00 |
4 | 2017-01-01 03:51:00 | 2017 | 1 | 1 | 3 | 51 | 2017-01-01 04:00:00 |
5 | 2017-01-01 04:00:00 | 2017 | 1 | 1 | 4 | 0 | 2017-01-01 05:00:00 |
6 | 2017-01-01 04:51:00 | 2017 | 1 | 1 | 4 | 51 | 2017-01-01 05:00:00 |
7 | 2017-01-01 05:51:00 | 2017 | 1 | 1 | 5 | 51 | 2017-01-01 06:00:00 |
8 | 2017-01-01 06:51:00 | 2017 | 1 | 1 | 6 | 51 | 2017-01-01 07:00:00 |
9 | 2017-01-01 07:00:00 | 2017 | 1 | 1 | 7 | 0 | 2017-01-01 08:00:00 |
col_weather = [
"HourlyDewPointTemperature","HourlyDryBulbTemperature","HourlyPrecipitation","HourlyPressureChange","HourlyPressureTendency","HourlyRelativeHumidity",
"HourlySkyConditions","HourlySeaLevelPressure","HourlyStationPressure","HourlyVisibility","HourlyWetBulbTemperature","HourlyWindDirection","HourlyWindGustSpeed","HourlyWindSpeed",
]
# 強引にfloat変換(変換できないところはNA)
for c in col_weather:
weather.loc[:,c] = pd.to_numeric(weather[c], errors="coerce")
# 時間ごとに平均
weather_ = pd.DataFrame({
"ds_h":pd.date_range(start='2017-01-01 00:00:00', end='2019-12-07 23:00:00', freq='60T')
})
weather_ = pd.merge(
weather_,
weather[["ds_h"]+col_weather].groupby(["ds_h"]).mean().fillna(0).reset_index(),
how="left", on="ds_h"
).set_index("ds_h")
# NAは線形補完
weather_ = weather_.interpolate(method='linear', axis=0)
weather_ = weather_.reset_index()
weather_.head(5)
# 一番最初のデータがなくなったけど、まぁいいか
ds_h | HourlyDewPointTemperature | HourlyDryBulbTemperature | HourlyPrecipitation | HourlyPressureChange | HourlyPressureTendency | HourlyRelativeHumidity | HourlySkyConditions | HourlySeaLevelPressure | HourlyStationPressure | HourlyVisibility | HourlyWetBulbTemperature | HourlyWindDirection | HourlyWindGustSpeed | HourlyWindSpeed | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2017-01-01 00:00:00 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
1 | 2017-01-01 01:00:00 | 25.000000 | 46.000000 | 0.000000 | 0.020000 | 6.000000 | 44.000000 | 0.000000 | 29.890000 | 29.860000 | 10.000000 | 38.000000 | 230.000000 | 0.000000 | 10.000000 |
2 | 2017-01-01 02:00:00 | 26.500000 | 45.500000 | 0.000000 | 0.020000 | 6.000000 | 48.000000 | 41.000000 | 29.895000 | 29.860000 | 9.970000 | 38.000000 | 230.000000 | 0.000000 | 9.000000 |
3 | 2017-01-01 03:00:00 | 26.000000 | 46.000000 | 0.000000 | 0.000000 | 0.000000 | 46.000000 | 0.000000 | 29.910000 | 29.880000 | 10.000000 | 38.000000 | 240.000000 | 0.000000 | 13.000000 |
4 | 2017-01-01 04:00:00 | 25.000000 | 46.000000 | 0.000000 | -0.040000 | 3.000000 | 44.000000 | 0.000000 | 29.930000 | 29.900000 | 10.000000 | 38.000000 | 240.000000 | 25.000000 | 10.000000 |
data_df_w = pd.merge(data_df, weather_, how="left", on="ds_h")
data_df_w.head(5)
ds | id | y | ds_h | HourlyDewPointTemperature | HourlyDryBulbTemperature | HourlyPrecipitation | HourlyPressureChange | HourlyPressureTendency | HourlyRelativeHumidity | HourlySkyConditions | HourlySeaLevelPressure | HourlyStationPressure | HourlyVisibility | HourlyWetBulbTemperature | HourlyWindDirection | HourlyWindGustSpeed | HourlyWindSpeed | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2017-01-01 00:00:00 | 0 | 53.000000 | 2017-01-01 00:00:00 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
1 | 2017-01-01 00:30:00 | 0 | 83.000000 | 2017-01-01 00:00:00 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
2 | 2017-01-01 01:00:00 | 0 | 69.000000 | 2017-01-01 01:00:00 | 25.000000 | 46.000000 | 0.000000 | 0.020000 | 6.000000 | 44.000000 | 0.000000 | 29.890000 | 29.860000 | 10.000000 | 38.000000 | 230.000000 | 0.000000 | 10.000000 |
3 | 2017-01-01 01:30:00 | 0 | 76.000000 | 2017-01-01 01:00:00 | 25.000000 | 46.000000 | 0.000000 | 0.020000 | 6.000000 | 44.000000 | 0.000000 | 29.890000 | 29.860000 | 10.000000 | 38.000000 | 230.000000 | 0.000000 | 10.000000 |
4 | 2017-01-01 02:00:00 | 0 | 101.000000 | 2017-01-01 02:00:00 | 26.500000 | 45.500000 | 0.000000 | 0.020000 | 6.000000 | 48.000000 | 41.000000 | 29.895000 | 29.860000 | 9.970000 | 38.000000 | 230.000000 | 0.000000 | 9.000000 |
# なんとなくこの期間を可視化
plot_d = data_df_w[(data_df_w["ds"]>="2018-11-08")&(data_df_w["ds"]<="2018-11-20")]
fig, ax = plt.subplots(len(col_weather), 1, figsize=(20, len(col_weather)*4))
i = 31 # なんとなく31地点を可視化(空港?)
for j, col in enumerate(col_weather):
ax2 = ax[j].twinx()
ax[j].plot(plot_d[plot_d["id"]==i]['ds'], plot_d[plot_d["id"]==i]["y"], color=cm.tab20.colors[0])
ax2.plot(plot_d[plot_d["id"]==i]['ds'], plot_d[plot_d["id"]==i][col], color=cm.tab20.colors[j+1])
ax[j].set_title(str(i)+"_"+col)
# 色変更
ax2.spines['left'].set_color(cm.tab20.colors[0])
ax2.spines['right'].set_color(cm.tab20.colors[j+1])
ax[j].tick_params(axis='y', colors=cm.tab20.colors[0])
ax2.tick_params(axis='y', colors=cm.tab20.colors[j+1])
plt.show()
plt.close()
plt.clf()
Output hidden; open in https://colab.research.google.com to view.
ProbSpace_official
mini_forest様
ご質問いただき、ありがとうございます。ProbSpace運営事務局です。
天気データに関してですが、予測の際に、未来情報をご使用されることは問題ございません。
タクシーの利用判断におきましては、直近の天気予報情報が参考として用いられる可能性がございます。
提供されているデータは実績値であり、予報情報とは異なりますが、近似的な意味での参考データとしてご使用いただけます。
ご確認のほど、よろしくお願い申し上げます。