一泊の適正価格はいくら?
shirapon24
初手にLightGBMを使用する理由についてはu++さんの記事が参考になります
簡単なコードが読めることとデータコンペの流れを理解していることを前提としています
# -*- coding: utf-8 -*- import numpy as np import pandas as pd pd.set_option('max_colwidth', 500) pd.set_option('max_columns', 500) pd.set_option('max_rows', 500) %matplotlib inline from matplotlib import pyplot as plt import seaborn as sns import japanize_matplotlib import json, os, gc, math, time import datetime import collections from tqdm import tqdm import glob from statistics import mean from sklearn.preprocessing import MultiLabelBinarizer from sklearn.preprocessing import LabelEncoder from sklearn.model_selection import KFold, GroupKFold, StratifiedKFold from sklearn import metrics import lightgbm as lgb import warnings warnings.filterwarnings("ignore")
class Config:
    """Experiment-wide settings: run name, directories, target column and seeds."""

    # Run name; used as the file-name prefix of the saved plot and submission.
    NAME = 'sample_for_beginner'

    # Directory layout (relative paths).
    INPUT_PATH = '../input/'
    OUTPUT_PATH = '../output/'
    SUBMIT_PATH = '../submission/'

    # Name of the target column (original author's note: "this is fare").
    TARGET = 'y'

    # Cross-validation and reproducibility settings.
    NUM_FOLD = 5
    SEED_FOLD = 71
    SEED_MODEL = 42


CFG = Config()
# Read the three competition tables from CFG.INPUT_PATH, in this order:
# training data, test data, submission template.
train, test, sample_submission = (
    pd.read_csv(os.path.join(CFG.INPUT_PATH, file_name))
    for file_name in ('train_data.csv', 'test_data.csv', 'submission.csv')
)
# Peek at the first rows of the training table.
train.head()
from matplotlib_venn import venn2


def plot_venn_train_test(tra, val, col):
    """Draw a Venn diagram comparing the unique values of ``col`` in two frames.

    Args:
        tra: DataFrame shown as the 'Train' circle.
        val: DataFrame shown as the 'Test' circle.
        col: Name of the column whose unique values are compared; also used
            as the plot title.

    Returns:
        The ``(fig, ax)`` pair created for the plot.
    """
    fig, ax = plt.subplots(figsize=(6, 9))
    ax.set_title(col, fontsize=10)

    train_values = tra[col].unique()
    test_values = val[col].unique()
    shared = set(train_values) & set(test_values)
    n_shared = len(shared)

    # venn2 expects (only-left, only-right, in-both) counts.
    only_train = len(train_values) - n_shared
    only_test = len(test_values) - n_shared
    venn2(subsets=(only_train, only_test, n_shared), set_labels=('Train', 'Test'))
    return fig, ax
# Train/test overlap per column (author's observations):
#   host_id       - host IDs do not overlap, so the column is not needed
#   name          - only a very small overlap
#   neighbourhood - the two sets match completely
for venn_col in ("host_id", "name", "neighbourhood"):
    fig, ax = plot_venn_train_test(train, test, venn_col)
def get_label_encoding(df, cols):
    """Add an integer label-encoded column ``"LE=<col>"`` for each column.

    Missing values in each source column are first replaced by the string
    ``"missing"`` (the source column itself is overwritten), so they receive
    their own label.

    Args:
        df: DataFrame to encode; it is modified and returned.
        cols: Iterable of column names to encode.

    Returns:
        ``df`` with one extra ``"LE=<col>"`` column per entry in ``cols``.
    """
    for col in cols:
        # Assign the filled column back instead of calling
        # ``df[col].fillna(..., inplace=True)``: the in-place call on a
        # column selection is deprecated and does not update ``df`` when
        # pandas copy-on-write is active.
        df[col] = df[col].fillna("missing")
        df["LE=" + col] = LabelEncoder().fit_transform(df[col])
    return df


def get_count_encoding(df, cols):
    """Add a frequency column ``"CE=<col>"`` for each column.

    Each value is replaced by the number of rows in ``df`` that share it.

    Args:
        df: DataFrame to encode; it is modified and returned.
        cols: Iterable of column names to encode.

    Returns:
        ``df`` with one extra ``"CE=<col>"`` column per entry in ``cols``.
    """
    for col in cols:
        # dropna=False keeps missing values as their own category so the
        # lookup below has an entry for every row.
        counts = df[col].value_counts(dropna=False)
        df["CE=" + col] = df[col].map(counts).values
    return df
def get_date_feature(_df, col):
    """Expand a date column into numeric calendar features.

    The column is parsed with ``pd.to_datetime`` and five columns are added:
    ``<col>_year``, ``<col>_month``, ``<col>_day``, ``<col>_week`` (day of
    week) and ``<col>_yymmdd`` (the date formatted as ``%Y%m%d`` and cast to
    int32).

    Args:
        _df: DataFrame holding the date column; it is modified and returned.
        col: Name of the date column.

    Returns:
        ``_df`` with the five feature columns added.
    """
    parsed = pd.to_datetime(_df[col])

    # (column suffix, datetime accessor attribute)
    calendar_parts = (
        ("_year", "year"),
        ("_month", "month"),
        ("_day", "day"),
        ("_week", "dayofweek"),
    )
    for suffix, attribute in calendar_parts:
        _df[col + suffix] = getattr(parsed.dt, attribute)

    as_text = parsed.dt.strftime('%Y%m%d')
    _df[col + "_yymmdd"] = as_text.astype(np.int32)
    return _df
def create_feature(_tra, _val):
    """Build model-ready feature tables for the training and test data.

    Both frames are concatenated so that every encoding is computed over the
    combined data, then split back by row position.

    Args:
        _tra: Training DataFrame; must contain the ``CFG.TARGET`` column.
        _val: Test DataFrame.

    Returns:
        A tuple ``(train_features, test_features, target)`` where ``target``
        is the ``CFG.TARGET`` column of the training rows and both feature
        frames have that column removed.
    """
    # Remember the split point before anything is reassigned.
    n_train = _tra.shape[0]

    # Combine train and test.
    _df = pd.concat([_tra, _val], axis=0).reset_index(drop=True)

    # Date handling. Parse first, then fill missing dates with the Unix epoch.
    # This is the date the previous integer ``0`` fill stood for, but it avoids
    # mixing integers with date strings in one column, which newer pandas
    # versions refuse to parse.
    _df["last_review"] = pd.to_datetime(_df["last_review"]).fillna(pd.Timestamp(0))
    _df = get_date_feature(_df, "last_review")

    # Categorical encodings.
    cat_cols = ["name", "neighbourhood"]
    _df = get_label_encoding(_df, cat_cols)
    _df = get_count_encoding(_df, cat_cols)
    _df = pd.get_dummies(_df, columns=["room_type"])

    # Drop columns that are not used as features.
    _df = _df.drop(["id", "host_id", "last_review", "name", "neighbourhood"], axis=1)

    # Split back into train and test rows.
    train_part = _df.iloc[:n_train, :]
    test_part = _df.iloc[n_train:, :]

    # Separate the target from the features.
    target = train_part[CFG.TARGET]
    train_part = train_part.drop(CFG.TARGET, axis=1)
    test_part = test_part.drop(CFG.TARGET, axis=1)
    return train_part, test_part, target
# Build the feature tables from copies so the raw frames stay untouched.
train_df, test_df, target = create_feature(_tra=train.copy(), _val=test.copy())
# Peek at the first rows of the training features.
train_df.head()
def _stratification_bins(y, n_splits, max_bins=10):
    """Return integer rank-bin labels so StratifiedKFold can stratify a continuous target."""
    values = np.asarray(y)
    n_samples = len(values)
    if n_samples == 0:
        return np.zeros(0, dtype=int)
    # Cap the number of bins so each bin holds at least ``n_splits`` samples.
    n_bins = max(1, min(max_bins, n_samples // n_splits))
    ranks = np.argsort(np.argsort(values, kind="stable"), kind="stable")
    return ranks * n_bins // n_samples


def _lgbm_callbacks(verbose, early_stopping_rounds):
    """Build fresh LightGBM callbacks for evaluation logging and early stopping."""
    callbacks = []
    if early_stopping_rounds:
        callbacks.append(lgb.early_stopping(early_stopping_rounds, verbose=bool(verbose)))
    # ``log_evaluation`` is the current name; older releases only provide
    # ``print_evaluation``.
    make_logger = getattr(lgb, "log_evaluation", None) or getattr(lgb, "print_evaluation", None)
    if make_logger is not None:
        callbacks.append(make_logger(int(verbose)))
    return callbacks


def fit_lgbm(train, test, y, groups=None, params: dict=None, n_splits=5, verbose=100, early_stopping_rounds=100):
    """Train one LightGBM regressor per cross-validation fold.

    Folds come from a shuffled ``StratifiedKFold`` seeded with
    ``CFG.SEED_FOLD`` and stratified on rank bins of ``y``. Predictions are
    clipped to be non-negative.

    Args:
        train: Training feature DataFrame.
        test: Test feature DataFrame with the same columns as ``train``.
        y: Target Series aligned by position with ``train``.
        groups: Accepted for interface compatibility; currently unused.
        params: Keyword arguments for ``lgb.LGBMRegressor``; ``None`` means
            library defaults.
        n_splits: Number of folds.
        verbose: Evaluation logging period in boosting rounds; ``0`` or
            ``False`` disables logging.
        early_stopping_rounds: Patience for early stopping on the validation
            fold; a falsy value disables early stopping.

    Returns:
        A tuple ``(oof_preds, sub_preds, models, scores)``: out-of-fold
        predictions for ``train``, fold-averaged predictions for ``test``,
        the fitted models, and the per-fold validation RMSE values.
    """
    params = {} if params is None else params
    models = []
    scores = []
    oof_preds = np.zeros((train.shape[0],))
    sub_preds = np.zeros((test.shape[0],))

    # Models are fitted on plain arrays, so predict on a plain array as well.
    test_x = test.values

    # Stratify on the ``y`` argument (binned), not on a module-level variable.
    strat_labels = _stratification_bins(y, n_splits)
    folds = StratifiedKFold(n_splits=n_splits, random_state=CFG.SEED_FOLD, shuffle=True)

    for n_fold, (trn_idx, val_idx) in enumerate(folds.split(train, strat_labels)):
        print("Fold is :", n_fold+1)
        trn_x, trn_y = train.iloc[trn_idx].values, y.iloc[trn_idx]
        val_x, val_y = train.iloc[val_idx].values, y.iloc[val_idx]

        clf = lgb.LGBMRegressor(**params)
        # Logging and early stopping go through callbacks; the equivalent
        # ``fit`` keyword arguments were removed in LightGBM 4.
        clf.fit(trn_x, trn_y,
                eval_set=[(trn_x, trn_y), (val_x, val_y)],
                eval_metric="rmse",
                callbacks=_lgbm_callbacks(verbose, early_stopping_rounds))

        fold_pred = clf.predict(val_x, num_iteration=clf.best_iteration_)
        fold_pred = np.clip(fold_pred, 0, np.inf)
        oof_preds[val_idx] = fold_pred
        sub_preds += clf.predict(test_x, num_iteration=clf.best_iteration_) / n_splits
        gc.collect()

        # Score against the positional slice ``val_y``; ``y[val_idx]`` would
        # be label-based and only correct for a default RangeIndex.
        # The printed label says RMSLE; the value is the RMSE against ``y``
        # exactly as it was passed in.
        score = np.sqrt(metrics.mean_squared_error(val_y, fold_pred))
        print("CV:{} RMSLE:{}".format(n_fold+1,score))
        scores.append(score)
        models.append(clf)

    # Clip once after averaging; clipping the running sum after every fold
    # would distort the average whenever a partial sum is negative.
    sub_preds = np.clip(sub_preds, 0, np.inf)
    return oof_preds, sub_preds, models, scores
# LightGBM settings: a small learning rate with a very large tree budget,
# so the number of trees is effectively decided by early stopping.
params = {
    'objective': 'rmse',
    'boosting_type': 'gbdt',
    'learning_rate': 0.01,
    'max_depth': -1,
    'num_leaves': 31,
    'n_estimators': 100000,
    "importance_type": "gain",
    'random_state': CFG.SEED_MODEL,
}

# Train on log1p(target). The fold count comes from the shared config instead
# of a second hard-coded 5.
oof_pred, sub_pred, models, fold_scores = fit_lgbm(
    train_df,
    test_df,
    np.log1p(target),
    params=params,
    n_splits=CFG.NUM_FOLD,
    early_stopping_rounds=100,
)
Fold is : 1 Training until validation scores don't improve for 100 rounds [100] training's rmse: 0.789085 valid_1's rmse: 0.802772 [200] training's rmse: 0.716151 valid_1's rmse: 0.745864 [300] training's rmse: 0.674589 valid_1's rmse: 0.72145 [400] training's rmse: 0.645677 valid_1's rmse: 0.706381 [500] training's rmse: 0.624051 valid_1's rmse: 0.695621 [600] training's rmse: 0.605819 valid_1's rmse: 0.687508 [700] training's rmse: 0.589237 valid_1's rmse: 0.680485 [800] training's rmse: 0.574465 valid_1's rmse: 0.675827 [900] training's rmse: 0.559317 valid_1's rmse: 0.670026 [1000] training's rmse: 0.546112 valid_1's rmse: 0.66458 [1100] training's rmse: 0.532641 valid_1's rmse: 0.6606 [1200] training's rmse: 0.521237 valid_1's rmse: 0.656926 [1300] training's rmse: 0.511559 valid_1's rmse: 0.65418 [1400] training's rmse: 0.500988 valid_1's rmse: 0.650753 [1500] training's rmse: 0.490622 valid_1's rmse: 0.647755 [1600] training's rmse: 0.481013 valid_1's rmse: 0.645375 [1700] training's rmse: 0.472153 valid_1's rmse: 0.643278 [1800] training's rmse: 0.463638 valid_1's rmse: 0.641326 [1900] training's rmse: 0.455072 valid_1's rmse: 0.639423 [2000] training's rmse: 0.447044 valid_1's rmse: 0.637786 [2100] training's rmse: 0.439478 valid_1's rmse: 0.636327 [2200] training's rmse: 0.432123 valid_1's rmse: 0.63539 [2300] training's rmse: 0.425764 valid_1's rmse: 0.634364 [2400] training's rmse: 0.41998 valid_1's rmse: 0.633321 [2500] training's rmse: 0.413686 valid_1's rmse: 0.632166 [2600] training's rmse: 0.407444 valid_1's rmse: 0.631164 [2700] training's rmse: 0.401614 valid_1's rmse: 0.629924 [2800] training's rmse: 0.395729 valid_1's rmse: 0.629011 [2900] training's rmse: 0.389618 valid_1's rmse: 0.628166 [3000] training's rmse: 0.384 valid_1's rmse: 0.627495 [3100] training's rmse: 0.379002 valid_1's rmse: 0.626662 [3200] training's rmse: 0.374175 valid_1's rmse: 0.625922 [3300] training's rmse: 0.369425 valid_1's rmse: 0.625368 [3400] training's rmse: 
0.364725 valid_1's rmse: 0.625202 [3500] training's rmse: 0.3602 valid_1's rmse: 0.624725 [3600] training's rmse: 0.355911 valid_1's rmse: 0.624396 [3700] training's rmse: 0.350962 valid_1's rmse: 0.624025 [3800] training's rmse: 0.34651 valid_1's rmse: 0.623659 [3900] training's rmse: 0.341824 valid_1's rmse: 0.623331 [4000] training's rmse: 0.337538 valid_1's rmse: 0.623044 [4100] training's rmse: 0.333395 valid_1's rmse: 0.622746 [4200] training's rmse: 0.329732 valid_1's rmse: 0.622372 [4300] training's rmse: 0.325499 valid_1's rmse: 0.622099 [4400] training's rmse: 0.321386 valid_1's rmse: 0.621748 [4500] training's rmse: 0.317763 valid_1's rmse: 0.621697 [4600] training's rmse: 0.314315 valid_1's rmse: 0.621193 [4700] training's rmse: 0.310391 valid_1's rmse: 0.621195 Early stopping, best iteration is: [4637] training's rmse: 0.312854 valid_1's rmse: 0.620984 CV:1 RMSLE:0.6209840569002207 Fold is : 2 Training until validation scores don't improve for 100 rounds [100] training's rmse: 0.783872 valid_1's rmse: 0.828208 [200] training's rmse: 0.713494 valid_1's rmse: 0.767349 [300] training's rmse: 0.672645 valid_1's rmse: 0.73795 [400] training's rmse: 0.645067 valid_1's rmse: 0.720696 [500] training's rmse: 0.622988 valid_1's rmse: 0.708938 [600] training's rmse: 0.603118 valid_1's rmse: 0.699293 [700] training's rmse: 0.585443 valid_1's rmse: 0.690751 [800] training's rmse: 0.569988 valid_1's rmse: 0.683753 [900] training's rmse: 0.555947 valid_1's rmse: 0.678555 [1000] training's rmse: 0.543755 valid_1's rmse: 0.675211 [1100] training's rmse: 0.532986 valid_1's rmse: 0.671581 [1200] training's rmse: 0.521339 valid_1's rmse: 0.667332 [1300] training's rmse: 0.510563 valid_1's rmse: 0.663425 [1400] training's rmse: 0.500514 valid_1's rmse: 0.659543 [1500] training's rmse: 0.491226 valid_1's rmse: 0.656081 [1600] training's rmse: 0.48254 valid_1's rmse: 0.653445 [1700] training's rmse: 0.474068 valid_1's rmse: 0.651548 [1800] training's rmse: 0.465859 valid_1's 
rmse: 0.64939 [1900] training's rmse: 0.458085 valid_1's rmse: 0.6469 [2000] training's rmse: 0.449935 valid_1's rmse: 0.644753 [2100] training's rmse: 0.442791 valid_1's rmse: 0.643242 [2200] training's rmse: 0.435491 valid_1's rmse: 0.641088 [2300] training's rmse: 0.427976 valid_1's rmse: 0.63923 [2400] training's rmse: 0.421414 valid_1's rmse: 0.637892 [2500] training's rmse: 0.41499 valid_1's rmse: 0.636368 [2600] training's rmse: 0.408449 valid_1's rmse: 0.634886 [2700] training's rmse: 0.40237 valid_1's rmse: 0.633343 [2800] training's rmse: 0.396734 valid_1's rmse: 0.631861 [2900] training's rmse: 0.390865 valid_1's rmse: 0.630505 [3000] training's rmse: 0.385082 valid_1's rmse: 0.629458 [3100] training's rmse: 0.379436 valid_1's rmse: 0.628069 [3200] training's rmse: 0.374271 valid_1's rmse: 0.627231 [3300] training's rmse: 0.368944 valid_1's rmse: 0.626139 [3400] training's rmse: 0.363675 valid_1's rmse: 0.625123 [3500] training's rmse: 0.358755 valid_1's rmse: 0.624234 [3600] training's rmse: 0.354449 valid_1's rmse: 0.623696 [3700] training's rmse: 0.350146 valid_1's rmse: 0.623421 [3800] training's rmse: 0.345843 valid_1's rmse: 0.623195 [3900] training's rmse: 0.341444 valid_1's rmse: 0.622701 [4000] training's rmse: 0.337261 valid_1's rmse: 0.622132 [4100] training's rmse: 0.333238 valid_1's rmse: 0.621441 [4200] training's rmse: 0.329461 valid_1's rmse: 0.620881 [4300] training's rmse: 0.325267 valid_1's rmse: 0.620612 [4400] training's rmse: 0.321385 valid_1's rmse: 0.620239 [4500] training's rmse: 0.318 valid_1's rmse: 0.620172 [4600] training's rmse: 0.314528 valid_1's rmse: 0.619981 [4700] training's rmse: 0.311251 valid_1's rmse: 0.619665 [4800] training's rmse: 0.30759 valid_1's rmse: 0.619104 [4900] training's rmse: 0.304468 valid_1's rmse: 0.619008 [5000] training's rmse: 0.301197 valid_1's rmse: 0.618731 [5100] training's rmse: 0.298144 valid_1's rmse: 0.618284 [5200] training's rmse: 0.295067 valid_1's rmse: 0.61768 [5300] training's rmse: 
0.291439 valid_1's rmse: 0.617327 [5400] training's rmse: 0.288039 valid_1's rmse: 0.617103 [5500] training's rmse: 0.285077 valid_1's rmse: 0.616997 [5600] training's rmse: 0.282244 valid_1's rmse: 0.616739 [5700] training's rmse: 0.279029 valid_1's rmse: 0.616504 [5800] training's rmse: 0.275853 valid_1's rmse: 0.616322 Early stopping, best iteration is: [5778] training's rmse: 0.276497 valid_1's rmse: 0.61623 CV:2 RMSLE:0.6162296952189872 Fold is : 3 Training until validation scores don't improve for 100 rounds [100] training's rmse: 0.790352 valid_1's rmse: 0.797812 [200] training's rmse: 0.717196 valid_1's rmse: 0.739492 [300] training's rmse: 0.67635 valid_1's rmse: 0.710418 [400] training's rmse: 0.648113 valid_1's rmse: 0.694873 [500] training's rmse: 0.625695 valid_1's rmse: 0.684574 [600] training's rmse: 0.606326 valid_1's rmse: 0.676143 [700] training's rmse: 0.589447 valid_1's rmse: 0.669576 [800] training's rmse: 0.573177 valid_1's rmse: 0.663278 [900] training's rmse: 0.556794 valid_1's rmse: 0.656926 [1000] training's rmse: 0.542608 valid_1's rmse: 0.651551 [1100] training's rmse: 0.530601 valid_1's rmse: 0.64843 [1200] training's rmse: 0.518413 valid_1's rmse: 0.644849 [1300] training's rmse: 0.507341 valid_1's rmse: 0.641505 [1400] training's rmse: 0.497983 valid_1's rmse: 0.63892 [1500] training's rmse: 0.488871 valid_1's rmse: 0.636364 [1600] training's rmse: 0.479129 valid_1's rmse: 0.63335 [1700] training's rmse: 0.470326 valid_1's rmse: 0.63149 [1800] training's rmse: 0.462039 valid_1's rmse: 0.629597 [1900] training's rmse: 0.454314 valid_1's rmse: 0.628031 [2000] training's rmse: 0.446283 valid_1's rmse: 0.627229 [2100] training's rmse: 0.438736 valid_1's rmse: 0.626146 [2200] training's rmse: 0.431104 valid_1's rmse: 0.625039 [2300] training's rmse: 0.423795 valid_1's rmse: 0.624958 Early stopping, best iteration is: [2245] training's rmse: 0.427896 valid_1's rmse: 0.624697 CV:3 RMSLE:0.6246969157999711 Fold is : 4 Training until 
validation scores don't improve for 100 rounds [100] training's rmse: 0.78891 valid_1's rmse: 0.785152 [200] training's rmse: 0.718129 valid_1's rmse: 0.730267 [300] training's rmse: 0.675322 valid_1's rmse: 0.703504 [400] training's rmse: 0.644702 valid_1's rmse: 0.687821 [500] training's rmse: 0.623176 valid_1's rmse: 0.679494 [600] training's rmse: 0.604177 valid_1's rmse: 0.671663 [700] training's rmse: 0.588387 valid_1's rmse: 0.664594 [800] training's rmse: 0.573005 valid_1's rmse: 0.658622 [900] training's rmse: 0.558595 valid_1's rmse: 0.65412 [1000] training's rmse: 0.545672 valid_1's rmse: 0.649986 [1100] training's rmse: 0.533605 valid_1's rmse: 0.646376 [1200] training's rmse: 0.522669 valid_1's rmse: 0.643622 [1300] training's rmse: 0.512197 valid_1's rmse: 0.641161 [1400] training's rmse: 0.501655 valid_1's rmse: 0.637781 [1500] training's rmse: 0.491799 valid_1's rmse: 0.635533 [1600] training's rmse: 0.483138 valid_1's rmse: 0.634113 [1700] training's rmse: 0.475249 valid_1's rmse: 0.632729 [1800] training's rmse: 0.467591 valid_1's rmse: 0.630967 [1900] training's rmse: 0.459909 valid_1's rmse: 0.629078 [2000] training's rmse: 0.452522 valid_1's rmse: 0.627366 [2100] training's rmse: 0.444864 valid_1's rmse: 0.626045 [2200] training's rmse: 0.437101 valid_1's rmse: 0.624389 [2300] training's rmse: 0.428759 valid_1's rmse: 0.622805 [2400] training's rmse: 0.421691 valid_1's rmse: 0.621507 [2500] training's rmse: 0.415013 valid_1's rmse: 0.620529 [2600] training's rmse: 0.407942 valid_1's rmse: 0.619293 [2700] training's rmse: 0.40124 valid_1's rmse: 0.617741 [2800] training's rmse: 0.395035 valid_1's rmse: 0.616626 [2900] training's rmse: 0.388784 valid_1's rmse: 0.615661 [3000] training's rmse: 0.382344 valid_1's rmse: 0.61482 [3100] training's rmse: 0.376878 valid_1's rmse: 0.614296 [3200] training's rmse: 0.371426 valid_1's rmse: 0.614205 [3300] training's rmse: 0.365927 valid_1's rmse: 0.613641 [3400] training's rmse: 0.36085 valid_1's rmse: 
0.613228 [3500] training's rmse: 0.355385 valid_1's rmse: 0.612286 [3600] training's rmse: 0.349678 valid_1's rmse: 0.611869 [3700] training's rmse: 0.344328 valid_1's rmse: 0.611266 [3800] training's rmse: 0.339463 valid_1's rmse: 0.611105 [3900] training's rmse: 0.335176 valid_1's rmse: 0.610702 [4000] training's rmse: 0.330141 valid_1's rmse: 0.610428 [4100] training's rmse: 0.325849 valid_1's rmse: 0.60994 [4200] training's rmse: 0.321711 valid_1's rmse: 0.609504 [4300] training's rmse: 0.317556 valid_1's rmse: 0.609444 [4400] training's rmse: 0.313544 valid_1's rmse: 0.609221 [4500] training's rmse: 0.309808 valid_1's rmse: 0.609034 Early stopping, best iteration is: [4495] training's rmse: 0.309971 valid_1's rmse: 0.609012 CV:4 RMSLE:0.6090120445655745 Fold is : 5 Training until validation scores don't improve for 100 rounds [100] training's rmse: 0.785527 valid_1's rmse: 0.806675 [200] training's rmse: 0.714646 valid_1's rmse: 0.748941 [300] training's rmse: 0.672232 valid_1's rmse: 0.723304 [400] training's rmse: 0.642285 valid_1's rmse: 0.707281 [500] training's rmse: 0.618514 valid_1's rmse: 0.696302 [600] training's rmse: 0.599658 valid_1's rmse: 0.68958 [700] training's rmse: 0.582668 valid_1's rmse: 0.683641 [800] training's rmse: 0.567869 valid_1's rmse: 0.678612 [900] training's rmse: 0.552817 valid_1's rmse: 0.673357 [1000] training's rmse: 0.540353 valid_1's rmse: 0.669522 [1100] training's rmse: 0.527337 valid_1's rmse: 0.665172 [1200] training's rmse: 0.515884 valid_1's rmse: 0.661924 [1300] training's rmse: 0.506013 valid_1's rmse: 0.660081 [1400] training's rmse: 0.495232 valid_1's rmse: 0.657109 [1500] training's rmse: 0.485785 valid_1's rmse: 0.655172 [1600] training's rmse: 0.476823 valid_1's rmse: 0.653265 [1700] training's rmse: 0.467386 valid_1's rmse: 0.651088 [1800] training's rmse: 0.458662 valid_1's rmse: 0.649493 [1900] training's rmse: 0.450426 valid_1's rmse: 0.647798 [2000] training's rmse: 0.442206 valid_1's rmse: 0.646554 [2100] 
training's rmse: 0.434259 valid_1's rmse: 0.644975 [2200] training's rmse: 0.427574 valid_1's rmse: 0.643702 [2300] training's rmse: 0.420428 valid_1's rmse: 0.641861 [2400] training's rmse: 0.413929 valid_1's rmse: 0.640826 [2500] training's rmse: 0.407319 valid_1's rmse: 0.639825 [2600] training's rmse: 0.400933 valid_1's rmse: 0.63867 [2700] training's rmse: 0.394746 valid_1's rmse: 0.638278 [2800] training's rmse: 0.388922 valid_1's rmse: 0.637967 [2900] training's rmse: 0.383226 valid_1's rmse: 0.637335 [3000] training's rmse: 0.377749 valid_1's rmse: 0.636588 [3100] training's rmse: 0.372394 valid_1's rmse: 0.636091 [3200] training's rmse: 0.367216 valid_1's rmse: 0.636029 Early stopping, best iteration is: [3140] training's rmse: 0.370351 valid_1's rmse: 0.635991 CV:5 RMSLE:0.635991426846803
# Show the per-fold validation scores returned by fit_lgbm.
fold_scores
[0.6209840569002207, 0.6162296952189872, 0.6246969157999711, 0.6090120445655745, 0.635991426846803]
# Average the per-fold scores into a single cross-validation score.
mean(fold_scores)
0.6213828278663113
import japanize_matplotlib

dt_now = datetime.datetime.now()
today = dt_now.strftime('%Y-%m-%d %H-%M')

# Collect one importance table per fold and concatenate once at the end,
# rather than growing a DataFrame inside the loop.
fold_frames = []
for i, model in enumerate(models):
    _df = pd.DataFrame()
    _df['feature_importance'] = model.feature_importances_
    _df['column'] = test_df.columns
    _df['fold'] = i + 1
    fold_frames.append(_df)
feature_importance_df = pd.concat(fold_frames, axis=0, ignore_index=True)

# Rank features by importance summed over folds; keep the top 100.
order = (
    feature_importance_df.groupby('column')['feature_importance']
    .sum()
    .sort_values(ascending=False)
    .index[:100]
)

fig, ax = plt.subplots(figsize=(12, 12))
plt.title(CFG.TARGET, fontsize=24)
sns.set_theme(style="whitegrid")
sns.barplot(data=feature_importance_df, x='feature_importance', y='column', order=order, palette='husl')
ax.tick_params(axis='x', rotation=90)
fig.tight_layout()

# Create the output directory first so savefig does not fail when it is missing.
os.makedirs(CFG.OUTPUT_PATH, exist_ok=True)
plt.savefig(os.path.join(CFG.OUTPUT_PATH, CFG.NAME+"_feature_importance.png"))
# The model was trained on log1p(target), so expm1 maps the predictions back
# to the original scale.
sample_submission["y"] = np.expm1(sub_pred)

# Create the submission directory first so to_csv does not fail when it is missing.
os.makedirs(CFG.SUBMIT_PATH, exist_ok=True)
sample_submission.to_csv(os.path.join(CFG.SUBMIT_PATH, CFG.NAME+".csv"), index=False)