一泊の適正価格はいくら?
Oregin
民泊サービスの宿泊料金予測のサンプルコードです。ご参考までにご活用ください。
※Google Colabで実行可能です。
CV=0.78974、LB=0.89032 でした（CVは学習データの10%をホールドアウトした検証データに対するRMSLEです）。
ディレクトリ構成
# カレントディレクトリをnotebook,result,dataディレクトリが格納されているディレクトリに移動 %cd /xxxx/xxxx
!pip install git+https://github.com/pfnet-research/xfeat.git
#環境確認 import pandas as pd import numpy as np !python3 --version print(pd.__version__) print(np.__version__) import matplotlib print(matplotlib.__version__) import pandas as pd import numpy as np import random import matplotlib.pylab as plt import scipy.stats as stats from sklearn.preprocessing import LabelEncoder from sklearn.model_selection import GroupKFold,KFold from sklearn.metrics import mean_absolute_error,mean_squared_error,mean_squared_log_error from sklearn.model_selection import train_test_split import lightgbm as lgb import warnings warnings.filterwarnings('ignore') from xfeat import SelectCategorical, LabelEncoder, Pipeline, ConcatCombination, SelectNumerical, \ ArithmeticCombinations, TargetEncoder, aggregation, GBDTFeatureSelector, GBDTFeatureExplorer
Python 3.7.12 1.3.5 1.21.5 3.2.2
# Read the training set, the test set and the sample submission file.
train_df = pd.read_csv("./data/train_data.csv")
test_df = pd.read_csv("./data/test_data.csv")
submit_df = pd.read_csv("./data/submission.csv")

# Report the (rows, columns) shape of the training and test data.
print(train_df.shape, test_df.shape, sep="\n")
(9990, 13) (4996, 12)
# Preview the first five rows of the training data.
train_df.head(n=5)

# Show the column names of the training data.
train_df.columns
Index(['id', 'name', 'host_id', 'neighbourhood', 'latitude', 'longitude', 'room_type', 'minimum_nights', 'number_of_reviews', 'last_review', 'reviews_per_month', 'availability_365', 'y'], dtype='object')
# Print the number of distinct values of every column and collect the names of
# the columns with fewer than 1000 distinct values (the target 'y' is excluded).
collist = []
for colname in train_df.columns:
    # dropna=False counts a missing value as one distinct value, like unique().
    lencol = train_df[colname].nunique(dropna=False)
    print(colname, lencol)
    if colname == 'y' or lencol >= 1000:
        continue
    collist.append(colname)
print(collist)
id 9990 name 9114 host_id 2325 neighbourhood 23 latitude 6239 longitude 6867 room_type 4 minimum_nights 30 number_of_reviews 261 last_review 547 reviews_per_month 595 availability_365 366 y 7520 ['neighbourhood', 'room_type', 'minimum_nights', 'number_of_reviews', 'last_review', 'reviews_per_month', 'availability_365']
# Columns that will be encoded.
train_columns = [
    'neighbourhood',
    'room_type',
    'minimum_nights',
    'number_of_reviews',
    'last_review',
    'reviews_per_month',
    'availability_365',
]

# Column holding the objective variable.
target = 'y'

# Explanatory variables and objective variable of the training data.
X = train_df[train_columns]
y = train_df[target]

# Explanatory variables of the test data.
test_X = test_df[train_columns]

# Training features and objective variable side by side in one frame.
train_target = pd.concat([X, y], axis="columns")
# Target encoding.
# Each column in train_columns gets its own xfeat TargetEncoder (5-fold,
# unshuffled KFold). The encoder is fitted on the training frame and the
# fitted encoder is then applied to the test frame. Encoded columns carry
# the "_re" suffix.
fold = KFold(n_splits=5, shuffle=False)

train = X.copy()
test = test_X.copy()

for col in train_columns:
    encoder = TargetEncoder(
        input_cols=[col],
        target_col=target,
        fold=fold,
        output_suffix="_re",
    )

    # Training data: fit and encode, keep only the new encoded column.
    encoded_df = encoder.fit_transform(train_target)
    train = pd.concat([train, encoded_df[f'{col}_re']], axis=1)

    # Test data: encode with the encoder fitted above.
    encoded_df = encoder.transform(test)
    test = pd.concat([test, encoded_df[f'{col}_re']], axis=1)

# Only the encoded columns are kept as features.
train = train.drop(columns=train_columns)
test = test.drop(columns=train_columns)
# Confirm that neither encoded frame contains missing values
# (per-column count of missing entries for train and test).
train.isna().sum(), test.isna().sum()
(neighbourhood_re 0 room_type_re 0 minimum_nights_re 0 number_of_reviews_re 0 last_review_re 0 reviews_per_month_re 0 availability_365_re 0 dtype: int64, neighbourhood_re 0 room_type_re 0 minimum_nights_re 0 number_of_reviews_re 0 last_review_re 0 reviews_per_month_re 0 availability_365_re 0 dtype: int64)
# Function that adds the engineered features
def make_feat(df):
    """Add 13 interaction features derived from the target-encoded columns.

    The frame is modified in place; it is also returned so the call can be
    chained. Existing callers that ignore the return value are unaffected.

    Args:
        df: DataFrame containing the columns ``neighbourhood_re``,
            ``room_type_re``, ``minimum_nights_re``, ``number_of_reviews_re``,
            ``last_review_re``, ``reviews_per_month_re`` and
            ``availability_365_re``.

    Returns:
        The same ``df`` object with the new feature columns appended.

    Raises:
        KeyError: If one of the required columns is missing.

    Note:
        The ratio features become ``inf``/``NaN`` where the denominator is 0,
        and the sqrt/log features become ``NaN``/``-inf`` for non-positive
        inputs (NumPy semantics); no clipping is applied here.
    """
    # Look the base columns up once. The original re-assigned each of them to
    # itself, which had no effect and has been removed.
    neighbourhood = df["neighbourhood_re"]
    room_type = df["room_type_re"]
    minimum_nights = df["minimum_nights_re"]
    number_of_reviews = df["number_of_reviews_re"]
    last_review = df["last_review_re"]
    reviews_per_month = df["reviews_per_month_re"]
    availability = df["availability_365_re"]

    # Products with the neighbourhood encoding.
    df["minimum_nights_re*neighbourhood_re"] = minimum_nights * neighbourhood
    df["availability_365_re*neighbourhood_re"] = availability * neighbourhood
    df["neighbourhood_re**2*reviews_per_month_re"] = neighbourhood ** 2 * reviews_per_month
    df["availability_365_re**2*neighbourhood_re**2"] = availability ** 2 * neighbourhood ** 2

    # Ratios and products with the room-type encoding.
    df["availability_365_re/room_type_re"] = availability / room_type
    df["neighbourhood_re**2/room_type_re"] = neighbourhood ** 2 / room_type
    df["availability_365_re**2/room_type_re"] = availability ** 2 / room_type
    df["neighbourhood_re*room_type_re**2"] = neighbourhood * room_type ** 2

    # Squared availability combined with the remaining encodings.
    df["availability_365_re**2*minimum_nights_re**2"] = availability ** 2 * minimum_nights ** 2
    df["availability_365_re**2/reviews_per_month_re"] = availability ** 2 / reviews_per_month
    df["availability_365_re**2*number_of_reviews_re**2"] = availability ** 2 * number_of_reviews ** 2

    # Non-linear transforms.
    df["sqrt(availability_365_re)*minimum_nights_re"] = np.sqrt(availability) * minimum_nights
    df["log(last_review_re)*log(reviews_per_month_re)"] = np.log(last_review) * np.log(reviews_per_month)

    return df
# Add the engineered features to the training and test frames
# (make_feat writes the new columns into the frame it is given).
make_feat(train)
make_feat(test)

# Shapes of both frames after the features were added.
train.shape, test.shape
((9990, 20), (4996, 20))
# Log-transform the objective variable: log(1 + y).
# NOTE: this rebinds `target`, which until now held the column name 'y' used
# by the target-encoding step; re-running that step after this line would
# hand it a Series instead of a column name.
target = np.log1p(y)

# LightGBM hyperparameters consumed by objective().
all_param = dict(
    colsample_bytree=0.32714832683589756,
    learning_rate=0.006840905564844016,
    max_bin=166,
    min_child_samples=5,
    n_estimators=573,
    num_leaves=114,
    subsample=0.956238192643021,
    subsample_freq=3,
)
# Function that trains a LightGBM model and predicts the test data
def objective(all_param):
    """Train a LightGBM regressor on a hold-out split and predict the test set.

    The function reads the module-level objects ``train`` (feature frame),
    ``target`` (objective variable aligned with ``train``) and ``test``
    (feature frame to predict); they must be defined before it is called.

    Args:
        all_param: Mapping with the keys ``n_estimators``, ``learning_rate``,
            ``subsample``, ``subsample_freq``, ``colsample_bytree``,
            ``num_leaves``, ``min_child_samples`` and ``max_bin``.

    Returns:
        A tuple ``(score, pred_data)`` where ``score`` is the RMSE between
        ``target`` and the prediction on the 10% validation split, and
        ``pred_data`` is the model prediction for ``test`` (on the same scale
        as ``target``).

    Raises:
        KeyError: If ``all_param`` lacks one of the keys listed above.
    """
    x_train = train.copy()
    y_train = target.copy()
    x_test = test.copy()

    # --------------------------------------
    # Parameter set
    # --------------------------------------
    lgb_params = {
        'objective': 'regression',
        'importance_type': 'gain',
        'metric': 'rmse',
        'seed': 42,
        'n_jobs': -1,
        'verbose': 1,
        'n_estimators': all_param['n_estimators'],
        'learning_rate': all_param['learning_rate'],
        'boosting_type': 'gbdt',
        'subsample': all_param['subsample'],
        'subsample_freq': all_param['subsample_freq'],
        'colsample_bytree': all_param['colsample_bytree'],
        'num_leaves': all_param['num_leaves'],
        'min_child_samples': all_param['min_child_samples'],
        'max_bin': all_param['max_bin'],
    }

    # --------------------------------------
    # Training
    # --------------------------------------
    # A single 90/10 split with a fixed seed: the reported score is a
    # hold-out score, not a K-fold out-of-fold score.
    x_tr_fold, x_vl_fold, y_tr_fold, y_vl_fold = train_test_split(
        x_train, y_train, test_size=0.1, random_state=42
    )

    model = lgb.LGBMRegressor(**lgb_params)
    # NOTE(review): `verbose` and `early_stopping_rounds` are fit() keywords of
    # the LightGBM release this notebook was written for; newer releases
    # (4.x) are understood to require callbacks instead - confirm before
    # upgrading.
    model.fit(
        x_tr_fold,
        y_tr_fold,
        eval_set=(x_vl_fold, y_vl_fold),
        eval_metric='rmse',
        verbose=False,
        early_stopping_rounds=100,
    )

    # The zero-filled placeholder arrays of the original were never read and
    # have been removed.
    y_oof = model.predict(x_vl_fold)
    score = np.sqrt(mean_squared_error(y_vl_fold, y_oof))
    print('oof score:', score)

    # --------------------------------------
    # Prediction
    # --------------------------------------
    pred_data = model.predict(x_test)

    return score, pred_data
# Train the model; keep the validation score and the test-set predictions.
score, pred_data = objective(all_param)
oof score: 0.7897490783498501
# Undo the log1p transform so the predictions are back on the original scale.
pred_data = np.expm1(pred_data)

# Put the predictions into the 'y' column of the sample submission and save it.
submit_df = pd.read_csv("./data/submission.csv")
submit_df['y'] = pd.Series(pred_data.ravel())
submit_df.to_csv('./result/submission.csv', index=False)