# Target variable.
## Difference between "y" and the value 7*48 rows earlier within the same id.
## The original note calls this "the same time one week before", which
## presumes 48 rows per day and gap-free, time-ordered rows per id -- TODO confirm.
rows_per_week = 48 * 7
y_one_week_back = data_df.groupby("id")["y"].shift(rows_per_week)
data_df.loc[:, "y_diff"] = data_df["y"] - y_one_week_back
# Additional features.
# "area" is simply the id column cast to float.
data_df.loc[:, "area"] = data_df["id"].astype("float")

## Time-related features, all derived from the "ds" timestamp column.
timestamps = data_df["ds"]
data_df.loc[:, "hour_minute"] = timestamps.dt.strftime("%H%M")

# "ds" truncated to the hour: format down to the hour, pad the rest with
# zeros, then parse the text back into a datetime.
hour_text = timestamps.dt.strftime("%Y-%m-%d %H") + ":00:00"
data_df.loc[:, "ds_h"] = pd.to_datetime(hour_text)

# "ds" truncated to midnight of the same calendar day, built the same way.
day_text = timestamps.dt.strftime("%Y-%m-%d ") + "00:00:00"
data_df.loc[:, "ds_ymd"] = pd.to_datetime(day_text)

# Plain calendar components. "yobi" holds the weekday number
# (Monday=0 ... Sunday=6).
calendar_parts = (
    ("year", "year"),
    ("month", "month"),
    ("day", "day"),
    ("hour", "hour"),
    ("yobi", "weekday"),
)
for feature_name, dt_attribute in calendar_parts:
    data_df.loc[:, feature_name] = getattr(timestamps.dt, dt_attribute)

# Week number within the year, as produced by the "%U" format code.
data_df.loc[:, "week_number_year"] = timestamps.dt.strftime("%U").astype(int)
# Holiday flag.
# Dates are listed one year per row, in this order:
# New Year's, Dr Martin, Lincoln's, Washington, Memorial, Independence,
# Labor Day, Columbus, Election, Veteran's, Thanksgiving, Christmas.
list_Legal_Holidays = [
    "2017-01-01", "2017-01-16", "2017-02-13", "2017-02-20", "2017-05-29", "2017-07-04",
    "2017-09-04", "2017-10-09", "2017-11-07", "2017-11-11", "2017-11-23", "2017-12-25",
    "2018-01-01", "2018-01-15", "2018-02-12", "2018-02-19", "2018-05-28", "2018-07-04",
    "2018-09-03", "2018-10-08", "2018-11-06", "2018-11-11", "2018-11-22", "2018-12-25",
    "2019-01-01", "2019-01-21", "2019-02-12", "2019-02-18", "2019-05-27", "2019-07-04",
    "2019-09-02", "2019-10-14", "2019-11-05", "2019-11-11", "2019-11-28", "2019-12-25",
]
# "ds_ymd" is a datetime column, so compare it against real datetimes.
# Passing the raw strings to isin() relies on an implicit string->datetime
# cast that pandas 2.2 deprecates; once that cast is removed the strings would
# stop matching and no public holiday would be flagged.
legal_holiday_dates = pd.to_datetime(list_Legal_Holidays)

data_df.loc[:, "holidays"] = 0
data_df.loc[data_df["yobi"] >= 5, "holidays"] = 1  # Saturday/Sunday (weekday 5, 6)
data_df.loc[data_df["ds_ymd"].isin(legal_holiday_dates), "holidays"] = 1  # public holidays

# Flag: was the row 7*48 positions earlier (same id) a holiday?
# The first 7*48 rows of every id have no such row and therefore get NaN.
data_df.loc[:, "lag_7_holidays"] = data_df.groupby("id")["holidays"].shift(48 * 7)
## Add past observations as features.
# Lag sizes handed to create_lags (defined elsewhere). Because the grouping
# keys are id + hour_minute, each lag presumably counts days at the same time
# of day (7 = one week, 7*52 = one year, ...) -- TODO confirm against create_lags.
col_kako = [7, 8, 14, 15, 21, 28, 35, 42, 49, 56, 7 * 51, 7 * 52, 7 * 53, 7 * 26]

### Lags of the differenced series ("y_diff") first, then of the raw series ("y").
for source_col in ("y_diff", "y"):
    same_time_lag_cols = [f"lag_{lag}_{source_col}" for lag in col_kako]
    data_df.loc[:, same_time_lag_cols] = create_lags(
        data_df, ["id", "hour_minute"], source_col, col_kako
    )
    # Grouped by id only with a lag of 7*48+1; per the original note this is
    # "30 minutes before the same time 7 days ago". It is stored under the
    # "lag_343" label.
    data_df.loc[:, f"lag_343_{source_col}"] = create_lags(
        data_df, ["id"], source_col, [7 * 48 + 1]
    )

### Ratio features: lagged difference divided by the lagged raw value.
for lag in col_kako + [343]:
    diff_col = f"lag_{lag}_y_diff"
    level_col = f"lag_{lag}_y"
    ratio_col = f"lag_{lag}_y_ratio"
    data_df.loc[:, ratio_col] = data_df[diff_col] / data_df[level_col]
    # The ratio is forced to 0 wherever it is missing or the denominator is 0.
    ratio_undefined = data_df[ratio_col].isna() | (data_df[level_col] == 0)
    data_df.loc[ratio_undefined, ratio_col] = 0
### Rolling statistics.
# Window lengths 48*1 ... 48*14.
col_rolls = [48 * n_days for n_days in range(1, 15)]

# Per the original note: moving mean, standard deviation, maximum and minimum
# over the past 48*n points counted from one week ago. The create_rolls_*
# helpers are defined elsewhere; their exact window semantics are not visible
# here -- TODO confirm.
rolling_builders = (
    ("ma", create_rolls_ma),
    ("std", create_rolls_std),
    ("max", create_rolls_max),
    ("min", create_rolls_min),
)
for stat_prefix, build_rolls in rolling_builders:
    stat_cols = [f"{stat_prefix}_1_{window}_lag_7_y_diff" for window in col_rolls]
    data_df.loc[:, stat_cols] = build_rolls(
        data_df, ["id"], "lag_7_y_diff", [1], col_rolls
    )
### Cumulative features across the weekly lags.
col_cumsum = [7, 14, 21, 28, 35, 42, 49, 56]
# For every block below, the first cumulative column equals the plain lag-7
# column and is dropped, so results exist for col_cumsum[1:] only.

#### Differenced series: running sum over lag 7, 14, ... up to each lag.
temp_cumsum = data_df[[f"lag_{i}_y_diff" for i in col_cumsum]].cumsum(axis=1).iloc[:, 1:]
temp_cumsum.columns = [f"lag_{i}_y_diff_cumsum" for i in col_cumsum[1:]]
data_df.loc[:, [f"lag_{i}_y_diff_cumsum" for i in col_cumsum[1:]]] = temp_cumsum
del temp_cumsum

#### Raw series: running sum divided by 2, 3, ..., len(col_cumsum).
#### The result keeps the "_cumsum" suffix although the original note calls
#### it a moving average.
temp_cumsum = (
    data_df[[f"lag_{i}_y" for i in col_cumsum]].cumsum(axis=1).iloc[:, 1:]
    / np.arange(2, len(col_cumsum) + 1)
)
temp_cumsum.columns = [f"lag_{i}_y_cumsum" for i in col_cumsum[1:]]
data_df.loc[:, [f"lag_{i}_y_cumsum" for i in col_cumsum[1:]]] = temp_cumsum
del temp_cumsum

#### Ratio columns: running sum.
temp_cumsum = data_df[[f"lag_{i}_y_ratio" for i in col_cumsum]].cumsum(axis=1).iloc[:, 1:]
temp_cumsum.columns = [f"lag_{i}_y_ratio_cumsum" for i in col_cumsum[1:]]
data_df.loc[:, [f"lag_{i}_y_ratio_cumsum" for i in col_cumsum[1:]]] = temp_cumsum
del temp_cumsum

#### Ratio of the cumulative difference to the cumulative raw value.
for i in col_cumsum[1:]:
    numerator_col = f"lag_{i}_y_diff_cumsum"
    denominator_col = f"lag_{i}_y_cumsum"
    ratio_col = f"lag_{i}_y_diff_cumsum_ratio"
    data_df.loc[:, ratio_col] = data_df[numerator_col] / data_df[denominator_col]
    # Fill the ratio column that was just created. The previous code wrote the
    # 0 into "lag_<i>_y_ratio" instead, which overwrote valid values of that
    # feature and left the NaN in this column untouched.
    data_df.loc[data_df[ratio_col].isnull(), ratio_col] = 0
    # A zero denominator also yields 0, matching the other ratio features in
    # this file (otherwise the division leaves +/-inf behind).
    data_df.loc[data_df[denominator_col] == 0, ratio_col] = 0
## Weather-related features.
# Attach the weather table to every row via the hour-truncated timestamp.
data_df = data_df.merge(weather_, how="left", on="ds_h")

for weather_col in col_weather:
    ## Mean over the coming 48 points according to the original note
    ## (create_rolls_ma is called with lag -48 and window 48; the helper is
    ## defined elsewhere -- TODO confirm its window semantics).
    future_ma_col = "ma_-48_48_" + weather_col
    data_df.loc[:, future_ma_col] = create_rolls_ma(
        data_df, ["id"], weather_col, [-48], [48 * 1]
    )

    # For the raw weather column and then its forward mean, build the change
    # versus 48 rows earlier and versus 7*48 rows earlier (previous day /
    # previous week per the original notes), each with a relative version.
    for base_col in (weather_col, future_ma_col):
        for lag_days in (1, 7):
            diff_col = f"lag_{lag_days}_{base_col}_diff"
            ratio_col = f"lag_{lag_days}_{base_col}_ratio"

            earlier_values = data_df.groupby("id")[base_col].shift(48 * lag_days)
            data_df.loc[:, diff_col] = data_df[base_col] - earlier_values

            # The ratio is taken against the current value, not the lagged one.
            data_df.loc[:, ratio_col] = data_df[diff_col] / data_df[base_col]
            # The ratio is forced to 0 wherever it is missing or the
            # denominator is 0.
            ratio_undefined = data_df[ratio_col].isna() | (data_df[base_col] == 0)
            data_df.loc[ratio_undefined, ratio_col] = 0
print(data_df.shape)
# Sanity check: show only the 00:00 and 00:30 rows of id == 0.
is_id_zero = data_df["id"] == 0
is_checked_slot = data_df["hour_minute"].isin(["0000", "0030"])
data_df[is_id_zero & is_checked_slot].head(100)