Oregin
民泊サービスの宿泊料金予測のサンプルコードです。ご参考までにご活用ください。
※Google Colabで実行可能です。
CV=0.78974 LB=0.89032 でした。
ディレクトリ構成
# カレントディレクトリをnotebook,result,dataディレクトリが格納されているディレクトリに移動
%cd /xxxx/xxxx
!pip install git+https://github.com/pfnet-research/xfeat.git
# Environment check: report the Python, pandas, NumPy and matplotlib versions.
import pandas as pd
import numpy as np
# IPython shell escape: runs `python3 --version` in a subshell (notebook-only syntax).
!python3 --version
print(pd.__version__)
print(np.__version__)
import matplotlib
print(matplotlib.__version__)
import pandas as pd
import numpy as np
import random
import matplotlib.pylab as plt
import scipy.stats as stats
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GroupKFold,KFold
from sklearn.metrics import mean_absolute_error,mean_squared_error,mean_squared_log_error
from sklearn.model_selection import train_test_split
import lightgbm as lgb
import warnings
warnings.filterwarnings('ignore')
from xfeat import SelectCategorical, LabelEncoder, Pipeline, ConcatCombination, SelectNumerical, \
ArithmeticCombinations, TargetEncoder, aggregation, GBDTFeatureSelector, GBDTFeatureExplorer
Python 3.7.12 1.3.5 1.21.5 3.2.2
# Load the training set, the test set and the submission template.
train_df, test_df, submit_df = map(
    pd.read_csv,
    ("./data/train_data.csv", "./data/test_data.csv", "./data/submission.csv"),
)
# Report (rows, columns) of the train and test frames, one per line.
print(train_df.shape, test_df.shape, sep="\n")
(9990, 13) (4996, 12)
# Inspect the data: display the first rows of the training set.
train_df.head()
id | name | host_id | neighbourhood | latitude | longitude | room_type | minimum_nights | number_of_reviews | last_review | reviews_per_month | availability_365 | y | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | KiyosumiShirakawa 3min|★SkyTree★|WIFI|Max4|Tre... | 242899459 | Koto Ku | 35.68185 | 139.80310 | Entire home/apt | 1 | 55 | 2020-04-25 | 2.21 | 173 | 12008 |
1 | 2 | Downtown Tokyo Iriya next to Ueno | 308879948 | Taito Ku | 35.72063 | 139.78536 | Entire home/apt | 6 | 72 | 2020-03-25 | 2.11 | 9 | 6667 |
2 | 3 | Japan Style,Private,Affordable,4min to Sta. | 300877823 | Katsushika Ku | 35.74723 | 139.82349 | Entire home/apt | 1 | 18 | 2020-03-23 | 3.46 | 288 | 9923 |
3 | 4 | 4 min to Shinjuku Sta. by train / 2 ppl / Wi-fi | 236935461 | Shibuya Ku | 35.68456 | 139.68077 | Entire home/apt | 1 | 2 | 2020-04-02 | 1.76 | 87 | 8109 |
4 | 5 | LICENSED SHINJUKU HOUSE: Heart of the action! | 243408889 | Shinjuku Ku | 35.69840 | 139.70467 | Entire home/apt | 1 | 86 | 2020-01-30 | 2.00 | 156 | 100390 |
# Display the column names of the training set.
train_df.columns
Index(['id', 'name', 'host_id', 'neighbourhood', 'latitude', 'longitude', 'room_type', 'minimum_nights', 'number_of_reviews', 'last_review', 'reviews_per_month', 'availability_365', 'y'], dtype='object')
# Print the number of distinct values of every column and collect the
# columns (other than the objective 'y') that have fewer than 1000 of them.
collist = []
for colname in train_df.columns:
    lencol = len(train_df[colname].unique())
    print(colname, lencol)
    if colname == 'y' or lencol >= 1000:
        continue
    collist.append(colname)
print(collist)
id 9990 name 9114 host_id 2325 neighbourhood 23 latitude 6239 longitude 6867 room_type 4 minimum_nights 30 number_of_reviews 261 last_review 547 reviews_per_month 595 availability_365 366 y 7520 ['neighbourhood', 'room_type', 'minimum_nights', 'number_of_reviews', 'last_review', 'reviews_per_month', 'availability_365']
# Name of the objective-variable column.
target = 'y'
# Columns that will be target-encoded.
train_columns = [
    'neighbourhood',
    'room_type',
    'minimum_nights',
    'number_of_reviews',
    'last_review',
    'reviews_per_month',
    'availability_365',
]
# Explanatory variables of the train and test sets, and the objective variable.
X, test_X = train_df[train_columns], test_df[train_columns]
y = train_df[target]
# Explanatory variables and objective side by side in a single frame.
train_target = pd.concat([X, y], axis="columns")
# Target encoding: append a "<col>_re" column for every entry of train_columns.
fold = KFold(n_splits=5, shuffle=False)
train, test = X.copy(), test_X.copy()
for col in train_columns:
    encoded_name = f'{col}_re'
    encoder = TargetEncoder(
        input_cols=[col], target_col=target, fold=fold, output_suffix="_re"
    )
    # Fit on the training frame (features + objective) and keep the new column.
    encoded_df = encoder.fit_transform(train_target)
    train = pd.concat([train, encoded_df[encoded_name]], axis=1)
    # Apply the fitted encoder to the test frame and keep the new column.
    encoded_df = encoder.transform(test)
    test = pd.concat([test, encoded_df[encoded_name]], axis=1)
# Keep only the encoded "*_re" columns.
train = train.drop(columns=train_columns)
test = test.drop(columns=train_columns)
# Confirm there are no missing values: per-column null counts for train and test.
train.isnull().sum(),test.isnull().sum()
(neighbourhood_re 0 room_type_re 0 minimum_nights_re 0 number_of_reviews_re 0 last_review_re 0 reviews_per_month_re 0 availability_365_re 0 dtype: int64, neighbourhood_re 0 room_type_re 0 minimum_nights_re 0 number_of_reviews_re 0 last_review_re 0 reviews_per_month_re 0 availability_365_re 0 dtype: int64)
# Function that adds engineered features.
def make_feat(df):
    """Add interaction features built from the target-encoded columns.

    Thirteen derived columns are appended to ``df`` in place, in a fixed
    order, each one named after the expression that produces it.

    Args:
        df: DataFrame containing the seven encoded columns
            ``neighbourhood_re``, ``room_type_re``, ``minimum_nights_re``,
            ``number_of_reviews_re``, ``last_review_re``,
            ``reviews_per_month_re`` and ``availability_365_re``.

    Returns:
        The same ``df`` object (mutated in place), so calls can be chained.
        Callers that ignore the return value keep working unchanged.

    Raises:
        KeyError: If one of the required ``*_re`` columns is missing.
    """
    # Read the base columns once; this also fails fast on a missing column.
    neighbourhood = df["neighbourhood_re"]
    room_type = df["room_type_re"]
    minimum_nights = df["minimum_nights_re"]
    number_of_reviews = df["number_of_reviews_re"]
    last_review = df["last_review_re"]
    reviews_per_month = df["reviews_per_month_re"]
    availability = df["availability_365_re"]

    # The insertion order below fixes the column order of the resulting frame.
    df["minimum_nights_re*neighbourhood_re"] = minimum_nights * neighbourhood
    df["availability_365_re*neighbourhood_re"] = availability * neighbourhood
    df["neighbourhood_re**2*reviews_per_month_re"] = neighbourhood**2 * reviews_per_month
    df["availability_365_re**2*neighbourhood_re**2"] = availability**2 * neighbourhood**2
    df["availability_365_re/room_type_re"] = availability / room_type
    df["neighbourhood_re**2/room_type_re"] = neighbourhood**2 / room_type
    df["availability_365_re**2/room_type_re"] = availability**2 / room_type
    df["neighbourhood_re*room_type_re**2"] = neighbourhood * room_type**2
    df["availability_365_re**2*minimum_nights_re**2"] = availability**2 * minimum_nights**2
    df["availability_365_re**2/reviews_per_month_re"] = availability**2 / reviews_per_month
    df["availability_365_re**2*number_of_reviews_re**2"] = availability**2 * number_of_reviews**2
    df["sqrt(availability_365_re)*minimum_nights_re"] = np.sqrt(availability) * minimum_nights
    df["log(last_review_re)*log(reviews_per_month_re)"] = np.log(last_review) * np.log(reviews_per_month)
    return df
# Add the engineered features to both frames, then display the resulting shapes.
for frame in (train, test):
    make_feat(frame)
train.shape, test.shape
((9990, 20), (4996, 20))
# Log-transform the objective variable with log1p.
# NOTE: this rebinds `target` from the column name 'y' to the transformed values.
target = np.log1p(y)
# Hyper-parameter values handed to the LightGBM training function.
all_param = dict(
    colsample_bytree=0.32714832683589756,
    learning_rate=0.006840905564844016,
    max_bin=166,
    min_child_samples=5,
    n_estimators=573,
    num_leaves=114,
    subsample=0.956238192643021,
    subsample_freq=3,
)
# Function that trains LightGBM and predicts the test set.
def objective(all_param):
    """Fit a LightGBM regressor on a 90/10 hold-out split and predict the test set.

    Works on copies of the module-level ``train`` (features), ``target``
    (objective values) and ``test`` (test features).

    Args:
        all_param: Mapping providing the hyper-parameters ``n_estimators``,
            ``learning_rate``, ``subsample``, ``subsample_freq``,
            ``colsample_bytree``, ``num_leaves``, ``min_child_samples`` and
            ``max_bin``.

    Returns:
        Tuple ``(score, pred_data)``: the RMSE measured on the hold-out split
        (in the same scale as ``target``) and the predictions for ``test``.

    Raises:
        KeyError: If ``all_param`` lacks one of the keys listed above.
    """
    x_train = train.copy()
    y_train = target.copy()
    x_test = test.copy()

    # --------------------------------------
    # Parameter set
    # --------------------------------------
    lgb_params = {
        # Fixed settings.
        'objective': 'regression',
        'importance_type': 'gain',
        'metric': 'rmse',
        'seed': 42,
        'n_jobs': -1,
        'verbose': 1,
        'boosting_type': 'gbdt',
        # Settings taken from the caller.
        'n_estimators': all_param['n_estimators'],
        'learning_rate': all_param['learning_rate'],
        'subsample': all_param['subsample'],
        'subsample_freq': all_param['subsample_freq'],
        'colsample_bytree': all_param['colsample_bytree'],
        'num_leaves': all_param['num_leaves'],
        'min_child_samples': all_param['min_child_samples'],
        'max_bin': all_param['max_bin'],
    }

    # --------------------------------------
    # Training
    # --------------------------------------
    # Single hold-out split: 10% of the rows are used for early stopping and scoring.
    x_tr_fold, x_vl_fold, y_tr_fold, y_vl_fold = train_test_split(
        x_train, y_train, test_size=0.1, random_state=42
    )

    model = lgb.LGBMRegressor(**lgb_params)
    # NOTE(review): the `verbose` and `early_stopping_rounds` fit keywords were
    # removed in LightGBM 4.0; on newer versions use
    # `callbacks=[lgb.early_stopping(100)]` instead.
    model.fit(
        x_tr_fold, y_tr_fold,
        eval_set=(x_vl_fold, y_vl_fold),
        eval_metric='rmse',
        verbose=False,
        early_stopping_rounds=100,
    )

    # RMSE on the hold-out rows.
    y_vl_pred = model.predict(x_vl_fold)
    score = np.sqrt(mean_squared_error(y_vl_fold, y_vl_pred))
    print(
        'oof score:',
        score
    )

    # --------------------------------------
    # Prediction
    # --------------------------------------
    pred_data = model.predict(x_test)
    return score, pred_data
# Train the model; keep the hold-out score and the test-set predictions.
score,pred_data = objective(all_param)
oof score: 0.7897490783498501
# Convert the log-scale predictions back to the original scale.
pred_data = np.expm1(pred_data)
# Put the predictions into the submission template and write it out.
submit_df = pd.read_csv("./data/submission.csv")
submit_df['y'] = pd.Series(pred_data.ravel())
submit_df.to_csv('./result/submission.csv', index=False)