タクシー需要予測

天気データの使用方法

予測対象の時間に対して、何時間前のデータまで使えるのかをご教授ください
該当時刻のギリギリ直前までの使用が可能でしょうか？

import pandas as pd
pd.options.display.max_columns = 100
pd.options.display.max_rows = 999
pd.options.display.float_format = '{:.6f}'.format
import numpy as np

import datetime

import warnings
warnings.filterwarnings('ignore')

%matplotlib inline
# %matplotlib notebook
import matplotlib
import matplotlib.pyplot as plt
from matplotlib import cm

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive

path = "/content/drive/MyDrive/tmp/7_probspace/3_taxi/"

weather = pd.read_csv(path+'data/nyc_weather_2017_2019.csv', parse_dates=["DATE"]).rename(columns={'DATE':'ds'})
train = pd.read_csv(path+'data/train_data.csv', parse_dates=["tpep_pickup_datetime"]).rename(columns={'tpep_pickup_datetime':'ds'})

train[train["ds"]>="2019-11-30"].head(5)

	ds	0	1	2	3	4	5	7	8	9	10	11	12	14	15	16	17	18	19	20	21	22	23	24	25	26	27	28	29	30	31	32	33	34	35	36	37	38	39	40	41	42	43	44	45	46	47	48	49	50	51	52	53	54	55	56	57	58	59	60	61	62	63	64	65	66	67	68	69	70	71	72	73	74	75	76	77	78
51024	2019-11-30 00:00:00	5.000000	6.000000	6.000000	3.000000	7.000000	4.000000	18.000000	10.000000	13.000000	11.000000	157.000000	37.000000	2.000000	1.000000	86.000000	14.000000	23.000000	161.000000	17.000000	10.000000	62.000000	6.000000	86.000000	89.000000	51.000000	105.000000	7.000000	13.000000	1.000000	87.000000	39.000000	46.000000	18.000000	58.000000	83.000000	14.000000	58.000000	4.000000	4.000000	115.000000	13.000000	7.000000	48.000000	101.000000	52.000000	101.000000	110.000000	18.000000	52.000000	0.000000	4.000000	83.000000	0.000000	9.000000	41.000000	2.000000	1.000000	3.000000	55.000000	187.000000	33.000000	10.000000	26.000000	81.000000	30.000000	46.000000	33.000000	44.000000	3.000000	27.000000	140.000000	1.000000	0.000000	1.000000	4.000000	10.000000	43.000000
51025	2019-11-30 00:30:00	11.000000	5.000000	5.000000	5.000000	3.000000	1.000000	13.000000	6.000000	10.000000	6.000000	138.000000	18.000000	2.000000	0.000000	66.000000	13.000000	20.000000	176.000000	9.000000	13.000000	66.000000	4.000000	52.000000	61.000000	38.000000	84.000000	2.000000	15.000000	0.000000	106.000000	22.000000	4.000000	20.000000	37.000000	67.000000	10.000000	66.000000	2.000000	5.000000	123.000000	5.000000	3.000000	65.000000	51.000000	32.000000	66.000000	104.000000	10.000000	47.000000	1.000000	3.000000	45.000000	7.000000	7.000000	30.000000	1.000000	4.000000	5.000000	36.000000	141.000000	38.000000	7.000000	22.000000	79.000000	22.000000	39.000000	27.000000	56.000000	3.000000	31.000000	124.000000	5.000000	0.000000	2.000000	7.000000	6.000000	48.000000
51026	2019-11-30 01:00:00	9.000000	7.000000	6.000000	6.000000	5.000000	1.000000	12.000000	4.000000	10.000000	11.000000	118.000000	25.000000	2.000000	0.000000	62.000000	7.000000	21.000000	168.000000	8.000000	5.000000	61.000000	1.000000	31.000000	48.000000	27.000000	89.000000	6.000000	23.000000	4.000000	50.000000	31.000000	0.000000	12.000000	37.000000	49.000000	3.000000	42.000000	3.000000	2.000000	141.000000	14.000000	3.000000	87.000000	35.000000	27.000000	45.000000	60.000000	8.000000	45.000000	4.000000	4.000000	76.000000	0.000000	11.000000	31.000000	0.000000	2.000000	3.000000	39.000000	107.000000	42.000000	7.000000	15.000000	56.000000	20.000000	36.000000	24.000000	27.000000	3.000000	31.000000	107.000000	5.000000	2.000000	1.000000	4.000000	5.000000	38.000000
51027	2019-11-30 01:30:00	10.000000	5.000000	4.000000	6.000000	1.000000	1.000000	8.000000	3.000000	7.000000	9.000000	101.000000	13.000000	2.000000	0.000000	79.000000	5.000000	9.000000	172.000000	6.000000	3.000000	56.000000	2.000000	44.000000	44.000000	29.000000	114.000000	3.000000	16.000000	5.000000	20.000000	20.000000	0.000000	7.000000	39.000000	29.000000	4.000000	59.000000	2.000000	2.000000	136.000000	7.000000	2.000000	89.000000	28.000000	31.000000	42.000000	66.000000	6.000000	25.000000	2.000000	2.000000	70.000000	4.000000	4.000000	33.000000	1.000000	3.000000	7.000000	31.000000	74.000000	37.000000	10.000000	15.000000	49.000000	21.000000	29.000000	21.000000	26.000000	1.000000	43.000000	95.000000	4.000000	7.000000	1.000000	4.000000	2.000000	31.000000
51028	2019-11-30 02:00:00	13.000000	9.000000	0.000000	4.000000	3.000000	1.000000	6.000000	7.000000	7.000000	5.000000	83.000000	15.000000	3.000000	0.000000	68.000000	5.000000	5.000000	131.000000	8.000000	3.000000	49.000000	0.000000	32.000000	44.000000	26.000000	103.000000	2.000000	15.000000	2.000000	21.000000	12.000000	0.000000	6.000000	35.000000	17.000000	4.000000	69.000000	3.000000	3.000000	161.000000	6.000000	0.000000	67.000000	14.000000	20.000000	34.000000	80.000000	5.000000	27.000000	2.000000	3.000000	44.000000	0.000000	7.000000	18.000000	1.000000	1.000000	9.000000	25.000000	83.000000	30.000000	9.000000	13.000000	39.000000	14.000000	9.000000	18.000000	9.000000	3.000000	67.000000	85.000000	7.000000	4.000000	3.000000	5.000000	3.000000	28.000000

weather[weather["ds"]<="2019-11-30 02:00:00"].tail(5)

	ds	REPORT_TYPE	SOURCE	HourlyAltimeterSetting	HourlyDewPointTemperature	HourlyDryBulbTemperature	HourlyPrecipitation	HourlyPresentWeatherType	HourlyPressureChange	HourlyPressureTendency	HourlyRelativeHumidity	HourlySkyConditions	HourlySeaLevelPressure	HourlyStationPressure	HourlyVisibility	HourlyWetBulbTemperature	HourlyWindDirection	HourlyWindGustSpeed	HourlyWindSpeed	REM
39951	2019-11-29 22:51:00	FM-15	7	30.11	23	37	0.00	NaN	NaN	NaN	57.000000	BKN:07 140 OVC:08 250	30.11	30.08	10.00	32.000000	340	NaN	11.000000	MET10111/29/19 22:51:01 METAR KLGA 300351Z 340...
39952	2019-11-29 23:51:00	FM-15	7	30.09	22	36	0.00	NaN	NaN	NaN	57.000000	OVC:08 150	30.09	30.06	10.00	31.000000	350	NaN	10.000000	MET10411/29/19 23:51:02 METAR KLGA 300451Z 350...
39953	2019-11-30 00:51:00	FM-15	7	30.08	21	36	0.00	NaN	0.040000	6.000000	55.000000	OVC:08 150	30.08	30.05	10.00	31.000000	360	NaN	11.000000	MET11211/30/19 00:51:02 METAR KLGA 300551Z 360...
39954	2019-11-30 01:00:00	FM-12	4	NaN	21	36	NaN	NaN	0.040000	6.000000	55.000000	NaN	30.08	30.04	9.94	31.000000	360	NaN	11.000000	SYN08072503 32966 83610 10022 21061 30172 4018...
39955	2019-11-30 01:51:00	FM-15	7	30.08	20	36	0.00	NaN	NaN	NaN	52.000000	OVC:08 150	30.08	30.05	10.00	30.000000	350	NaN	15.000000	MET09911/30/19 01:51:02 METAR KLGA 300651Z 350...

上記データを例として、2019-11-30 01:00:00の予測をする時

天気情報の39,953行目の「2019-11-30 00:51:00」までのデータを使用するのはOKでしょうか。
天気情報の39,954行目の「2019-11-30 01:00:00」以降のデータを使用するのはNGでしょうか。
ご回答よろしくお願いいたします。

〜〜質問はここまで〜〜

以下、直前のデータまで使用できると仮定した場合の集計と可視化サンプル置いておきます。
非常に雑な集計で申し訳ありません。ご参考までに。

# 縦持ち
data_df = train.copy()
data_df = pd.melt(
    data_df, id_vars=["ds"], value_vars=list(map(str, range(79))),
    var_name="id", value_name="y"
)
data_df.loc[:,"id"] = data_df["id"].astype("int64")

# お天気情報merge用の列追加
data_df.loc[:,"ds_h"] = pd.to_datetime(data_df["ds"].dt.strftime('%Y-%m-%d %H')+":00:00")
data_df

	ds	id	y	ds_h
0	2017-01-01 00:00:00	0	53.000000	2017-01-01 00:00:00
1	2017-01-01 00:30:00	0	83.000000	2017-01-01 00:00:00
2	2017-01-01 01:00:00	0	69.000000	2017-01-01 01:00:00
3	2017-01-01 01:30:00	0	76.000000	2017-01-01 01:00:00
4	2017-01-01 02:00:00	0	101.000000	2017-01-01 02:00:00
...	...	...	...	...
4034683	2019-11-30 21:30:00	78	75.000000	2019-11-30 21:00:00
4034684	2019-11-30 22:00:00	78	85.000000	2019-11-30 22:00:00
4034685	2019-11-30 22:30:00	78	72.000000	2019-11-30 22:00:00
4034686	2019-11-30 23:00:00	78	88.000000	2019-11-30 23:00:00
4034687	2019-11-30 23:30:00	78	78.000000	2019-11-30 23:00:00

4034688 rows × 4 columns

weather.loc[:,"year"] = weather["ds"].dt.year
weather.loc[:,"month"] = weather["ds"].dt.month
weather.loc[:,"day"] = weather["ds"].dt.day
weather.loc[:,"hour"] = weather["ds"].dt.hour
weather.loc[:,"minute"] = weather["ds"].dt.minute

weather.loc[:,"ds_h"] = pd.to_datetime(
    weather["year"].astype(str) +"-"+ weather["month"].astype(str).str.zfill(2) +"-"+ weather["day"].astype(str).str.zfill(2) +
    " "+ weather["hour"].astype(str).str.zfill(2) + ":00:00"
)
# 全部一時間後に参照できるようにする
weather.loc[:,"ds_h"] = weather["ds_h"] + datetime.timedelta(hours=+1)
weather[["ds","year","month","day","hour","minute","ds_h"]].head(10)

	ds	year	month	day	hour	minute	ds_h
0	2017-01-01 00:51:00	2017	1	1	0	51	2017-01-01 01:00:00
1	2017-01-01 01:00:00	2017	1	1	1	0	2017-01-01 02:00:00
2	2017-01-01 01:51:00	2017	1	1	1	51	2017-01-01 02:00:00
3	2017-01-01 02:51:00	2017	1	1	2	51	2017-01-01 03:00:00
4	2017-01-01 03:51:00	2017	1	1	3	51	2017-01-01 04:00:00
5	2017-01-01 04:00:00	2017	1	1	4	0	2017-01-01 05:00:00
6	2017-01-01 04:51:00	2017	1	1	4	51	2017-01-01 05:00:00
7	2017-01-01 05:51:00	2017	1	1	5	51	2017-01-01 06:00:00
8	2017-01-01 06:51:00	2017	1	1	6	51	2017-01-01 07:00:00
9	2017-01-01 07:00:00	2017	1	1	7	0	2017-01-01 08:00:00

col_weather = [
  "HourlyDewPointTemperature","HourlyDryBulbTemperature","HourlyPrecipitation","HourlyPressureChange","HourlyPressureTendency","HourlyRelativeHumidity",
  "HourlySkyConditions","HourlySeaLevelPressure","HourlyStationPressure","HourlyVisibility","HourlyWetBulbTemperature","HourlyWindDirection","HourlyWindGustSpeed","HourlyWindSpeed",
]
# 強引にfloat変換（変換できないところはNA）
for c in col_weather:
  weather.loc[:,c] = pd.to_numeric(weather[c], errors="coerce")

# 時間ごとに平均
weather_ = pd.DataFrame({
  "ds_h":pd.date_range(start='2017-01-01 00:00:00', end='2019-12-07 23:00:00', freq='60T')
})
weather_ = pd.merge(
    weather_,
    weather[["ds_h"]+col_weather].groupby(["ds_h"]).mean().fillna(0).reset_index(),
    how="left", on="ds_h"
).set_index("ds_h")
# NAは線形補完
weather_ = weather_.interpolate(method='linear', axis=0)
weather_ = weather_.reset_index()
weather_.head(5)
# 一番最初のデータがなくなったけど、まぁいいか

	ds_h	HourlyDewPointTemperature	HourlyDryBulbTemperature	HourlyPrecipitation	HourlyPressureChange	HourlyPressureTendency	HourlyRelativeHumidity	HourlySkyConditions	HourlySeaLevelPressure	HourlyStationPressure	HourlyVisibility	HourlyWetBulbTemperature	HourlyWindDirection	HourlyWindGustSpeed	HourlyWindSpeed
0	2017-01-01 00:00:00	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
1	2017-01-01 01:00:00	25.000000	46.000000	0.000000	0.020000	6.000000	44.000000	0.000000	29.890000	29.860000	10.000000	38.000000	230.000000	0.000000	10.000000
2	2017-01-01 02:00:00	26.500000	45.500000	0.000000	0.020000	6.000000	48.000000	41.000000	29.895000	29.860000	9.970000	38.000000	230.000000	0.000000	9.000000
3	2017-01-01 03:00:00	26.000000	46.000000	0.000000	0.000000	0.000000	46.000000	0.000000	29.910000	29.880000	10.000000	38.000000	240.000000	0.000000	13.000000
4	2017-01-01 04:00:00	25.000000	46.000000	0.000000	-0.040000	3.000000	44.000000	0.000000	29.930000	29.900000	10.000000	38.000000	240.000000	25.000000	10.000000

data_df_w = pd.merge(data_df, weather_, how="left", on="ds_h")
data_df_w.head(5)

	ds	y	ds_h	HourlyDewPointTemperature	HourlyDryBulbTemperature	HourlyPrecipitation	HourlyPressureChange	HourlyPressureTendency	HourlyRelativeHumidity	HourlySkyConditions	HourlySeaLevelPressure	HourlyStationPressure	HourlyVisibility	HourlyWetBulbTemperature	HourlyWindDirection	HourlyWindGustSpeed	HourlyWindSpeed
0	2017-01-01 00:00:00	53.000000	2017-01-01 00:00:00	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
1	2017-01-01 00:30:00	83.000000	2017-01-01 00:00:00	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
2	2017-01-01 01:00:00	69.000000	2017-01-01 01:00:00	25.000000	46.000000	0.000000	0.020000	6.000000	44.000000	0.000000	29.890000	29.860000	10.000000	38.000000	230.000000	0.000000	10.000000
3	2017-01-01 01:30:00	76.000000	2017-01-01 01:00:00	25.000000	46.000000	0.000000	0.020000	6.000000	44.000000	0.000000	29.890000	29.860000	10.000000	38.000000	230.000000	0.000000	10.000000
4	2017-01-01 02:00:00	101.000000	2017-01-01 02:00:00	26.500000	45.500000	0.000000	0.020000	6.000000	48.000000	41.000000	29.895000	29.860000	9.970000	38.000000	230.000000	0.000000	9.000000

# なんとなくこの期間を可視化
plot_d = data_df_w[(data_df_w["ds"]>="2018-11-08")&(data_df_w["ds"]<="2018-11-20")]

fig, ax = plt.subplots(len(col_weather), 1, figsize=(20, len(col_weather)*4))

i = 31 # なんとなく31地点を可視化（空港？）
for j, col in enumerate(col_weather):
  ax2 = ax[j].twinx()
  ax[j].plot(plot_d[plot_d["id"]==i]['ds'], plot_d[plot_d["id"]==i]["y"], color=cm.tab20.colors[0])
  ax2.plot(plot_d[plot_d["id"]==i]['ds'], plot_d[plot_d["id"]==i][col], color=cm.tab20.colors[j+1])
  ax[j].set_title(str(i)+"_"+col)

  # 色変更
  ax2.spines['left'].set_color(cm.tab20.colors[0])
  ax2.spines['right'].set_color(cm.tab20.colors[j+1])
  ax[j].tick_params(axis='y', colors=cm.tab20.colors[0])
  ax2.tick_params(axis='y', colors=cm.tab20.colors[j+1])

plt.show()
plt.close()
plt.clf()

Output hidden; open in https://colab.research.google.com to view.

タクシー需要予測

【運営担当者様】気象データ(nyc_weather_2017_2019)の使用方法についてのご質問

天気データの使用方法

添付データ

ProbSpace_official

mini_forest

ProbSpace_official

new user