Oregin
This is a baseline using LightGBM. Feel free to use it for reference.
It scored CV = 0.438692 and LB = 0.41466.
# Import libraries
import pandas as pd
import numpy as np
import re
import lightgbm as lgb
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')
# Load the data
train = pd.read_csv("./input/train_data.csv")
test = pd.read_csv('./input/test_data.csv')
# Convert the floor-plan (間取り) column into features
def change_to_madori(df):
    df['L'] = df['間取り'].map(lambda x: 1 if 'L' in str(x) else 0)
    df['D'] = df['間取り'].map(lambda x: 1 if 'D' in str(x) else 0)
    df['K'] = df['間取り'].map(lambda x: 1 if 'K' in str(x) else 0)
    df['S'] = df['間取り'].map(lambda x: 1 if 'S' in str(x) else 0)
    df['R'] = df['間取り'].map(lambda x: 1 if 'R' in str(x) else 0)
    df['Maisonette'] = df['間取り'].map(lambda x: 1 if 'メゾネット' in str(x) else 0)
    df['OpenFloor'] = df['間取り'].map(lambda x: 1 if 'オープンフロア' in str(x) else 0)
    df['Studio'] = df['間取り'].map(lambda x: 1 if 'スタジオ' in str(x) else 0)
    # Number of rooms: keep only the digits (empty string -> 0)
    df['RoomNum'] = df['間取り'].map(lambda x: re.sub("\\D", "", str(x)))
    df['RoomNum'] = df['RoomNum'].map(lambda x: int(x) if x != '' else 0)
change_to_madori(train)
change_to_madori(test)
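For illustration, this is what the floor-plan parser produces on a hypothetical value such as '3LDK' (a quick sketch; the sample value below is made up, not taken from the competition data):
# Hypothetical check of change_to_madori on a one-row frame
sample = pd.DataFrame({'間取り': ['3LDK']})
change_to_madori(sample)
print(sample[['L', 'D', 'K', 'S', 'R', 'RoomNum']])  # expected: 1, 1, 1, 0, 0, 3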
# Convert area / period strings to integers
def change_to_number(df, input_column_name, output_column_name):
    # Values like "10m^2未満" (less than 10 m^2) are replaced by 9
    df[output_column_name] = df[input_column_name].map(lambda x: re.sub(r'([0-9]+)m\^2未満', '9', str(x)))
    # Strip non-digit characters, then cast to int (empty string -> -1)
    df[output_column_name] = df[output_column_name].map(lambda x: re.sub("\\D", "", str(x)))
    df[output_column_name] = df[output_column_name].map(lambda x: int(x) if x != '' else -1)
change_to_number(train,'延床面積(㎡)','TotalFloorArea')
change_to_number(test,'延床面積(㎡)','TotalFloorArea')
change_to_number(train,'面積(㎡)','Area')
change_to_number(test,'面積(㎡)','Area')
change_to_number(train,'取引時点',"Period")
change_to_number(test,'取引時点',"Period")
# Convert the time-to-nearest-station strings to minutes
def change_to_minute(df, input_column_name, output_column_name):
    # Ranges such as "30分?60分" are collapsed to a single representative value in minutes
    df[output_column_name] = df[input_column_name].map(lambda x: re.sub("30分\?60分", "30", str(x)))
    df[output_column_name] = df[output_column_name].map(lambda x: re.sub("2H\?", "120", str(x)))
    df[output_column_name] = df[output_column_name].map(lambda x: re.sub("1H30\?2H", "90", str(x)))
    df[output_column_name] = df[output_column_name].map(lambda x: re.sub("1H\?1H30", "60", str(x)))
    # Strip non-digit characters, then cast to int (missing -> -1)
    df[output_column_name] = df[output_column_name].map(lambda x: re.sub("\\D", "", str(x)))
    df[output_column_name] = df[output_column_name].map(lambda x: int(x) if x != '' else -1)
change_to_minute(train,'最寄駅:距離(分)','TimeToNearestStation')
change_to_minute(test,'最寄駅:距離(分)','TimeToNearestStation')
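As a quick sanity check, the conversion above behaves as follows on a few illustrative raw strings (these values are hypothetical and only meant to show the mapping to minutes):
# Hypothetical values -> minutes: '12' -> 12, '30分?60分' -> 30, '2H?' -> 120, NaN -> -1
sample = pd.DataFrame({'最寄駅:距離(分)': ['12', '30分?60分', '2H?', np.nan]})
change_to_minute(sample, '最寄駅:距離(分)', 'TimeToNearestStation')
print(sample['TimeToNearestStation'].tolist())  # [12, 30, 120, -1]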
# Convert numeric-like string columns to floats
def change_to_float(df, input_column_name, output_column_name):
    # Cap "50.0m以上" (50 m or more) at 51.0
    df[output_column_name] = df[input_column_name].map(lambda x: re.sub("50.0m以上", "51.0", str(x)))
    # Cast to float (missing values, i.e. 'nan', become -1)
    df[output_column_name] = df[output_column_name].map(lambda x: float(x) if x != 'nan' else -1)
change_to_float(train,'間口',"Frontage")
change_to_float(test,'間口',"Frontage")
change_to_float(train,'前面道路:幅員(m)',"Breadth")
change_to_float(test,'前面道路:幅員(m)',"Breadth")
change_to_float(train,'建ぺい率(%)',"CoverageRatio")
change_to_float(test,'建ぺい率(%)',"CoverageRatio")
change_to_float(train,'容積率(%)',"FloorAreaRatio")
change_to_float(test,'容積率(%)',"FloorAreaRatio")
# Convert the construction year (Japanese era) to the Gregorian calendar
def change_to_year(df, input_column_name, output_column_name):
    # Replace "戦前" (pre-war) with 昭和15年 (Showa 15)
    df[output_column_name] = df[input_column_name].map(lambda x: re.sub(r'戦前', '昭和15年', str(x)))
    # Convert Showa years: Showa N -> N + 1925
    df[output_column_name] = df[output_column_name].map(lambda x: int(re.sub("\\D", "", str(x))) + 1925 if '昭和' in str(x) else x)
    # Convert Heisei years: Heisei N -> N + 1988
    df[output_column_name] = df[output_column_name].map(lambda x: int(re.sub("\\D", "", str(x))) + 1988 if '平成' in str(x) else x)
    # Map 'nan' to -1
    df[output_column_name] = df[output_column_name].map(lambda x: -1 if x == 'nan' else x)
    # df = df.drop(input_column_name, axis=1)
change_to_year(train,'建築年','BuildingYear')
change_to_year(test,'建築年','BuildingYear')
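To make the era arithmetic concrete: Showa N maps to N + 1925 (e.g. Showa 55 = 1980), Heisei N maps to N + 1988 (e.g. Heisei 3 = 1991), and 戦前 is first rewritten to Showa 15 = 1940. A small check on made-up values:
# Hypothetical values -> Gregorian years
sample = pd.DataFrame({'建築年': ['昭和55年', '平成3年', '戦前', np.nan]})
change_to_year(sample, '建築年', 'BuildingYear')
print(sample['BuildingYear'].tolist())  # [1980, 1991, 1940, -1]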
# Target encoding (out-of-fold for the training data to avoid leakage)
def change_to_target2(train_df, test_df, input_column_name, output_column_name):
    from sklearn.model_selection import KFold
    # Fill missing values
    train_df[input_column_name] = train_df[input_column_name].fillna('-1')
    test_df[input_column_name] = test_df[input_column_name].fillna('-1')
    kf = KFold(n_splits=5, shuffle=True, random_state=71)
    #=========================================================#
    c = input_column_name
    # Mean of the target per category over the whole training data
    data_tmp = pd.DataFrame({c: train_df[c], 'target': train_df['y']})
    target_mean = data_tmp.groupby(c)['target'].mean()
    # Replace the test-data categories with those means
    test_df[output_column_name] = test_df[c].map(target_mean)
    # Array that will hold the encoded training values
    tmp = np.repeat(np.nan, train_df.shape[0])
    for i, (train_index, test_index) in enumerate(kf.split(train_df)):  # one pass per fold
        # Per-category target mean computed on the in-fold training part only
        target_mean = data_tmp.iloc[train_index].groupby(c)['target'].mean()
        # Encode the out-of-fold rows with those means
        tmp[test_index] = train_df[c].iloc[test_index].map(target_mean)
    # Store the encoded values as the new column
    train_df[output_column_name] = tmp
    #========================================================#
change_to_target2(train,test,"種類","Type")
change_to_target2(train,test,"地域","Region")
change_to_target2(train,test,"市区町村コード","MunicipalityCode")
change_to_target2(train,test,"都道府県名","Prefecture")
change_to_target2(train,test,"市区町村名","Municipality")
change_to_target2(train,test,"地区名","DistrictName")
change_to_target2(train,test,"最寄駅:名称","NearestStation")
change_to_target2(train,test,"土地の形状","LandShape")
change_to_target2(train,test,"建物の構造","Structure")
change_to_target2(train,test,"用途","Use")
change_to_target2(train,test,"今後の利用目的","Purpose")
change_to_target2(train,test,"前面道路:方位","Direction")
change_to_target2(train,test,"前面道路:種類","Classification")
change_to_target2(train,test,"都市計画","CityPlanning")
change_to_target2(train,test,"改装", "Renovation")
change_to_target2(train,test,"取引の事情等","Remarks")
# Drop the original (Japanese-named) columns
jap_col = ['id', '種類', '地域', '市区町村コード', '都道府県名', '市区町村名', '地区名', '最寄駅:名称',
'最寄駅:距離(分)', '間取り', '面積(㎡)', '土地の形状', '間口', '延床面積(㎡)', '建築年', '建物の構造',
'用途', '今後の利用目的', '前面道路:方位', '前面道路:種類', '前面道路:幅員(m)', '都市計画', '建ぺい率(%)',
'容積率(%)', '取引時点', '改装', '取引の事情等']
train = train.drop(jap_col,axis=1)
test = test.drop(jap_col,axis=1)
# Check the columns of the training and test data
print(train.columns)
print(test.columns)
Index(['y', 'L', 'D', 'K', 'S', 'R', 'Maisonette', 'OpenFloor', 'Studio', 'RoomNum', 'TotalFloorArea', 'Area', 'Period', 'TimeToNearestStation', 'Frontage', 'Breadth', 'CoverageRatio', 'FloorAreaRatio', 'BuildingYear', 'Type', 'Region', 'MunicipalityCode', 'Prefecture', 'Municipality', 'DistrictName', 'NearestStation', 'LandShape', 'Structure', 'Use', 'Purpose', 'Direction', 'Classification', 'CityPlanning', 'Renovation', 'Remarks'], dtype='object')
Index(['L', 'D', 'K', 'S', 'R', 'Maisonette', 'OpenFloor', 'Studio', 'RoomNum', 'TotalFloorArea', 'Area', 'Period', 'TimeToNearestStation', 'Frontage', 'Breadth', 'CoverageRatio', 'FloorAreaRatio', 'BuildingYear', 'Type', 'Region', 'MunicipalityCode', 'Prefecture', 'Municipality', 'DistrictName', 'NearestStation', 'LandShape', 'Structure', 'Use', 'Purpose', 'Direction', 'Classification', 'CityPlanning', 'Renovation', 'Remarks'], dtype='object')
# Confirm that the training data has no missing values
train.isnull().sum()
(output: every column from y through Remarks reports 0 missing values; dtype: int64)
# Confirm that the test data has no missing values
test.isnull().sum()
(output: every column from L through Remarks reports 0 missing values; dtype: int64)
# Split the training data into features and target
target = np.log(train['y'])  # take the log so that RMSE on the log target approximates RMSLE
train_x = train.drop('y', axis=1)
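Training with an RMSE objective on log(y) effectively optimizes the log-scale error; since RMSLE is the RMSE of log(1 + y), the two are nearly identical when prices are much larger than 1. A small check on synthetic numbers (illustrative only):
# RMSE on log(y) vs. RMSLE for a few synthetic predictions
y_true_demo = np.array([1.0e6, 5.0e5, 2.0e6])
y_pred_demo = np.array([1.1e6, 4.5e5, 1.8e6])
rmse_log = np.sqrt(np.mean((np.log(y_true_demo) - np.log(y_pred_demo)) ** 2))
rmsle = np.sqrt(np.mean((np.log1p(y_true_demo) - np.log1p(y_pred_demo)) ** 2))
print(rmse_log, rmsle)  # nearly identical because y >> 1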
# Create validation data from the training data
X_train, X_test, y_train, y_test = train_test_split(train_x, target, test_size=0.1, random_state=42)
lgb_train = lgb.Dataset(X_train, y_train)
lgb_valid = lgb.Dataset(X_test, y_test, reference=lgb_train)
# Set the LightGBM parameters
params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'rmse',
    'num_leaves': 31,
    'learning_rate': 0.01,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 0,
    'n_estimators': 20000,
    'early_stopping_rounds': 1000,
    'seed': 42
}
# Run training
model = lgb.train(params,
                  lgb_train,
                  valid_names=['train', 'valid'],
                  valid_sets=[lgb_train, lgb_valid],
                  verbose_eval=200)
Training until validation scores don't improve for 1000 rounds
[200]   train's rmse: 0.510617  valid's rmse: 0.512846
[2000]  train's rmse: 0.434292  valid's rmse: 0.445535
[6000]  train's rmse: 0.415472  valid's rmse: 0.440298
[10000] train's rmse: 0.40369   valid's rmse: 0.438976
[13600] train's rmse: 0.395341  valid's rmse: 0.438728
(intermediate evaluations every 200 rounds omitted)
Early stopping, best iteration is:
[12660] train's rmse: 0.397355  valid's rmse: 0.438692
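Note that recent LightGBM releases deprecate passing early_stopping_rounds and verbose_eval directly and may reject them depending on your version; the same behaviour is expressed with callbacks. A sketch of the equivalent call, assuming a newer LightGBM version and 'early_stopping_rounds' removed from params:
# Equivalent training call using callbacks (newer LightGBM versions)
model = lgb.train(params,
                  lgb_train,
                  valid_names=['train', 'valid'],
                  valid_sets=[lgb_train, lgb_valid],
                  callbacks=[lgb.early_stopping(stopping_rounds=1000),
                             lgb.log_evaluation(period=200)])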
# Run prediction
predicts = model.predict(test)
predicts.shape
(34844,)
# The target was log-transformed for training, so transform the predictions back
predicts = np.exp(predicts)
# Create the submission file
pd.DataFrame({"id": range(len(predicts)), "y": predicts}).to_csv("001_submission.csv", index=False)
from matplotlib import pyplot as plt
# Visualize the feature importances learned by the LightGBM model
lgb.plot_importance(model, figsize=(12, 6))
plt.show()