import warnings
import os
from pathlib import Path
import pandas as pd
import lightgbm as lgb
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import StratifiedKFold, StratifiedGroupKFold
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
from sklearn import mixture
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD, PCA
import tensorflow as tf
import tensorflow_text
import tensorflow_hub as hub
from tensorflow import keras
from numba import cuda
import torch
import transformers
from transformers import BertTokenizer
from tqdm import tqdm
# Notebook display limits for wide/long frames.
pd.set_option('display.max_columns', 25)
pd.set_option('display.max_rows', 25)

# Project layout: data files live in ../data relative to this script.
ROOT_DIR = Path('../')
DATA_DIR = ROOT_DIR / 'data'


class Config:
    """Global experiment settings shared by the feature and training steps."""

    # Number of cross-validation folds.
    N_FOLD = 5
    # Random seed read by the SVD, LightGBM and CV-splitter code further down.
    # The attribute name (including its spelling) is kept because those call
    # sites reference it as `Config.RANDOM_SATE`.
    # NOTE(review): the original value is not visible in this file; 42 is a
    # placeholder -- confirm against the original experiment.
    RANDOM_SATE = 42
# Load the four competition tables from DATA_DIR.
train_df = pd.read_csv(DATA_DIR / 'train_data.csv')
test_df = pd.read_csv(DATA_DIR / 'test_data.csv')
station_list_df = pd.read_csv(DATA_DIR / 'station_list.csv')
submission_df = pd.read_csv(DATA_DIR / 'submission.csv')
(9990, 13)
id | name | host_id | neighbourhood | latitude | longitude | room_type | minimum_nights | number_of_reviews | last_review | reviews_per_month | availability_365 | y | |
0 | 1 | KiyosumiShirakawa 3min|★SkyTree★|WIFI|Max4|Tre... | 242899459 | Koto Ku | 35.68185 | 139.80310 | Entire home/apt | 1 | 55 | 2020-04-25 | 2.21 | 173 | 12008 |
1 | 2 | Downtown Tokyo Iriya next to Ueno | 308879948 | Taito Ku | 35.72063 | 139.78536 | Entire home/apt | 6 | 72 | 2020-03-25 | 2.11 | 9 | 6667 |
2 | 3 | Japan Style,Private,Affordable,4min to Sta. | 300877823 | Katsushika Ku | 35.74723 | 139.82349 | Entire home/apt | 1 | 18 | 2020-03-23 | 3.46 | 288 | 9923 |
3 | 4 | 4 min to Shinjuku Sta. by train / 2 ppl / Wi-fi | 236935461 | Shibuya Ku | 35.68456 | 139.68077 | Entire home/apt | 1 | 2 | 2020-04-02 | 1.76 | 87 | 8109 |
4 | 5 | LICENSED SHINJUKU HOUSE: Heart of the action! | 243408889 | Shinjuku Ku | 35.69840 | 139.70467 | Entire home/apt | 1 | 86 | 2020-01-30 | 2.00 | 156 | 100390 |
(4996, 12)
id | name | host_id | neighbourhood | latitude | longitude | room_type | minimum_nights | number_of_reviews | last_review | reviews_per_month | availability_365 | |
0 | 1 | 5-minute walk from Akasaka Sta, Superior double | 184730720 | Minato Ku | 35.67131 | 139.73285 | Private room | 1 | 0 | NaN | NaN | 183 |
1 | 2 | 7 min Sta.-Center of IKEBUKURO Cozy Room#503 | 20993205 | Toshima Ku | 35.73014 | 139.71739 | Entire home/apt | 2 | 21 | 2020-04-16 | 1.94 | 337 |
2 | 3 | Designer'sApt 1min sta☆Shinjuku 7min☆Shibuya 4min | 322521715 | Setagaya Ku | 35.66193 | 139.66540 | Entire home/apt | 1 | 14 | 2020-02-12 | 0.82 | 240 |
3 | 4 | Komagome Station 2 minutes on foot | 234477095 | Toshima Ku | 35.73603 | 139.74794 | Entire home/apt | 1 | 16 | 2020-02-17 | 1.19 | 0 |
4 | 5 | Monthly/Metro1min/JR5min/Ueno,Asakusa,Akihabara | 145453833 | Taito Ku | 35.72126 | 139.78320 | Entire home/apt | 30 | 2 | 2019-07-21 | 0.19 | 164 |
station_name | longitude | latitude | |
0 | 白丸 | 139.114861 | 35.811735 |
1 | 古里 | 139.152102 | 35.816247 |
2 | 川井 | 139.164290 | 35.813697 |
3 | 御嶽 | 139.182589 | 35.801468 |
4 | 沢井 | 139.193324 | 35.805940 |
# Model the target on a log scale; the submission step applies np.exp to undo it.
train_df['y_log'] = np.log(train_df['y'])

# Overlay the log-target distribution of each room type on one figure.
for room_type, room_type_df in train_df.groupby('room_type'):
    sns.distplot(room_type_df['y_log'], label=room_type)
# The per-room-type labels are only visible once a legend is drawn.
plt.legend()
<matplotlib.legend.Legend at 0x1fabd8a8b50>
# List of (station_name, array([latitude, longitude])) pairs, one per station.
station_pos_array = [
    (name, np.array([lat, lon]))
    for name, lat, lon in zip(
        station_list_df['station_name'],
        station_list_df['latitude'],
        station_list_df['longitude'],
    )
]
# Nearest-station features for the training listings.
# Distances are plain Euclidean distances between (latitude, longitude) pairs,
# i.e. measured in raw degrees, matching the 0.01 / 0.05 / 0.1 thresholds below.
station_names = np.array([name for name, _ in station_pos_array], dtype=object)
station_coords = np.vstack([pos for _, pos in station_pos_array])  # columns: lat, lon
listing_lat = train_df['latitude'].to_numpy(dtype=float)
listing_lon = train_df['longitude'].to_numpy(dtype=float)

# (n_listings, n_stations) distance matrix computed in one vectorised step
# instead of building a DataFrame per listing.
station_dist = np.hypot(
    listing_lat[:, None] - station_coords[None, :, 0],
    listing_lon[:, None] - station_coords[None, :, 1],
)
nearest_idx = station_dist.argmin(axis=1)

nearest = pd.DataFrame({
    'id': train_df['id'].to_numpy(),
    'station_name': station_names[nearest_idx],
    'distance': station_dist.min(axis=1),
    # Number of stations within each radius of the listing.
    'dist_under_001_count': (station_dist < 0.01).sum(axis=1),
    'dist_under_005_count': (station_dist < 0.05).sum(axis=1),
    'dist_under_01_count': (station_dist < 0.1).sum(axis=1),
})
train_df = train_df.merge(nearest, how='left', on='id')
100%|█████████████████████████████████████████████████████████████████████████████| 9990/9990 [00:55<00:00, 181.03it/s]
# Nearest-station features for the test listings.
# Distances are plain Euclidean distances between (latitude, longitude) pairs,
# i.e. measured in raw degrees, matching the 0.01 / 0.05 / 0.1 thresholds below.
station_names = np.array([name for name, _ in station_pos_array], dtype=object)
station_coords = np.vstack([pos for _, pos in station_pos_array])  # columns: lat, lon
listing_lat = test_df['latitude'].to_numpy(dtype=float)
listing_lon = test_df['longitude'].to_numpy(dtype=float)

# (n_listings, n_stations) distance matrix computed in one vectorised step
# instead of building a DataFrame per listing.
station_dist = np.hypot(
    listing_lat[:, None] - station_coords[None, :, 0],
    listing_lon[:, None] - station_coords[None, :, 1],
)
nearest_idx = station_dist.argmin(axis=1)

nearest = pd.DataFrame({
    'id': test_df['id'].to_numpy(),
    'station_name': station_names[nearest_idx],
    'distance': station_dist.min(axis=1),
    # Number of stations within each radius of the listing.
    'dist_under_001_count': (station_dist < 0.01).sum(axis=1),
    'dist_under_005_count': (station_dist < 0.05).sum(axis=1),
    'dist_under_01_count': (station_dist < 0.1).sum(axis=1),
})
test_df = test_df.merge(nearest, how='left', on='id')
100%|█████████████████████████████████████████████████████████████████████████████| 4996/4996 [00:27<00:00, 179.75it/s]
# Hand-crafted features from the listing title.
# Words that look meaningful: each flag is True when any of its substrings
# occurs in the lower-cased title.
keyword_flags = {
    'is_wifi': ('wi-fi', 'wifi'),
    'is_free': ('free',),
    'is_min': ('min ',),
    'is_skytree': ('skytree',),
    'is_sale': ('sale',),
    'is_star': ('★', '☆'),
}

for df in (train_df, test_df):
    # Treat '/' and '|' as word separators. Missing titles become empty
    # strings so the string operations below never fail on NaN.
    names = (
        df['name']
        .fillna('')
        .str.replace('/', ' ', regex=False)
        .str.replace('|', ' ', regex=False)
    )
    df['name'] = names
    df['word_count'] = names.str.split().str.len()
    df['name_len'] = names.str.len()

    lowered = names.str.lower()
    for flag, needles in keyword_flags.items():
        hit = pd.Series(False, index=df.index)
        for needle in needles:
            hit |= lowered.str.contains(needle, regex=False)
        df[flag] = hit
# TF-IDF of the listing titles, fitted on train and test together, plus a
# 20-component truncated SVD of the same matrix.
all_df = pd.concat([train_df, test_df])
tv = TfidfVectorizer(min_df=20)
features = tv.fit_transform(all_df["name"])

# Rows [0, n_train) belong to train, the rest to test (concat order above).
n_train = len(train_df)
# Densify once and reuse; the original converted the sparse matrix three times.
tfidf_dense = features.toarray()

# NOTE: the column-wise concat aligns on the index, so this assumes both
# frames carry a default 0..n-1 index at this point.
train_df = pd.concat([train_df, pd.DataFrame(tfidf_dense[:n_train]).add_prefix('TFIDF_')], axis=1)
test_df = pd.concat([test_df, pd.DataFrame(tfidf_dense[n_train:]).add_prefix('TFIDF_')], axis=1)

svd = TruncatedSVD(n_components=20, random_state=Config.RANDOM_SATE)
svd_features = svd.fit_transform(tfidf_dense)
train_df = pd.concat([train_df, pd.DataFrame(svd_features[:n_train]).add_prefix('SVD_')], axis=1)
test_df = pd.concat([test_df, pd.DataFrame(svd_features[n_train:]).add_prefix('SVD_')], axis=1)
from sklearn.decomposition import NMF
# Sentence embeddings of the listing titles from the multilingual
# Universal Sentence Encoder, one title at a time.
all_df = pd.concat([train_df, test_df])
embedder = hub.load("https://tfhub.dev/google/universal-sentence-encoder-multilingual/3")
# `Series.progress_apply` only exists after tqdm has registered its pandas hooks.
tqdm.pandas()
features = np.stack(all_df["name"].fillna("").progress_apply(lambda x: embedder(x).numpy().reshape(-1)).values)
100%|████████████████████████████████████████████████████████████████████████████| 14986/14986 [04:11<00:00, 59.66it/s]
# Compress the sentence embeddings to 50 SVD components and attach them
# to both frames as USE_0 .. USE_49.
svd = TruncatedSVD(n_components=50, random_state=Config.RANDOM_SATE)
svd_features = svd.fit_transform(features)

use_train = pd.DataFrame(svd_features[:len(train_df)]).add_prefix('USE_')
train_df = pd.concat([train_df, use_train], axis=1)

# len(train_df) is unchanged by the column-wise concat above, so it still
# marks where the test rows start.
use_test = pd.DataFrame(svd_features[len(train_df):]).add_prefix('USE_')
test_df = pd.concat([test_df, use_test], axis=1)

# Drop the reference to the loaded encoder; it is not used again.
del embedder
class BertSequenceVectorizer:
    """Turn a sentence into a fixed-size vector with a pretrained BERT model.

    The vector returned by `vectorize` is the last-layer hidden state at
    position 0 of the token sequence.
    """

    def __init__(self, model_name="bert-base-uncased", max_len=128):
        """Load tokenizer and model weights for `model_name`.

        Args:
            model_name: Name passed to the `from_pretrained` loaders.
            max_len: Fixed token length every sentence is truncated/padded to.
        """
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model_name = model_name
        self.tokenizer = BertTokenizer.from_pretrained(self.model_name)
        self.bert_model = transformers.BertModel.from_pretrained(self.model_name)
        self.bert_model = self.bert_model.to(self.device)
        # Inference only: make sure dropout is disabled.
        self.bert_model.eval()
        self.max_len = max_len

    def vectorize(self, sentence: str) -> np.ndarray:
        """Encode `sentence` and return the hidden state of its first token.

        Args:
            sentence: Raw text to embed.

        Returns:
            1-D numpy array on the CPU (hidden-size long).
        """
        inp = self.tokenizer.encode(sentence)
        len_inp = len(inp)

        if len_inp >= self.max_len:
            # Too long: keep the first max_len tokens, all of them real.
            inputs = inp[:self.max_len]
            masks = [1] * self.max_len
        else:
            # Too short: right-pad with id 0 and mask the padding out.
            # NOTE(review): assumes the tokenizer's pad token id is 0 -- confirm.
            pad_len = self.max_len - len_inp
            inputs = inp + [0] * pad_len
            masks = [1] * len_inp + [0] * pad_len

        inputs_tensor = torch.tensor([inputs], dtype=torch.long).to(self.device)
        masks_tensor = torch.tensor([masks], dtype=torch.long).to(self.device)

        # No gradients are needed for feature extraction.
        with torch.no_grad():
            bert_out = self.bert_model(inputs_tensor, masks_tensor)
        seq_out = bert_out['last_hidden_state']

        # Position 0 is the [CLS] token; its hidden state is used as the
        # sentence feature. `.cpu()` is a no-op when already on the CPU.
        return seq_out[0][0].detach().cpu().numpy()
# Embed every listing title (train followed by test) with multilingual BERT.
all_df = pd.concat([train_df, test_df])
BSV = BertSequenceVectorizer(model_name="bert-base-multilingual-uncased", max_len=128)
# Missing titles are embedded as the empty string.
title_vectors = [BSV.vectorize(title).reshape(-1) for title in all_df["name"].fillna("")]
features = np.stack(title_vectors)
Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight'] - This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). - This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). Wall time: 3min 24s
# Reduce the BERT title vectors to 20 principal components (BERT_0 .. BERT_19).
# The seed is fixed, like the SVD steps, so repeated runs give the same
# components when PCA picks a randomized solver.
pca = PCA(n_components=20, random_state=Config.RANDOM_SATE)
pca_features = pca.fit_transform(features)

# Rows [0, n_train) are train, the remainder test (order of the concat that
# produced `features`).
n_train = len(train_df)
train_df = pd.concat([train_df, pd.DataFrame(pca_features[:n_train]).add_prefix('BERT_')], axis=1)
test_df = pd.concat([test_df, pd.DataFrame(pca_features[n_train:]).add_prefix('BERT_')], axis=1)
# Review-based features, computed identically for train and test.
for df in (train_df, test_df):
    # Parse the date column once and reuse it for both parts.
    last_review = pd.to_datetime(df['last_review'])
    df['last_review_year'] = last_review.dt.year
    df['last_review_month'] = last_review.dt.month

    # The original divided number_of_reviews by itself, which yields a
    # constant 1.0 (or NaN) and carries no information.
    # NOTE(review): the intended quantity is presumably the number of months
    # with reviews, i.e. total reviews / reviews per month -- confirm.
    review_month = df['number_of_reviews'] / df['reviews_per_month']
    # A zero rate would produce +/-inf; treat that as missing instead.
    df['review_month'] = review_month.replace([np.inf, -np.inf], np.nan)
# Cluster all listings (train + test) by location into 10 Gaussian components
# and store the component id as the `gmm` feature. The seed is fixed so the
# cluster ids are reproducible between runs.
clf = mixture.GaussianMixture(n_components=10, covariance_type='full', random_state=Config.RANDOM_SATE)
gm = clf.fit_predict(pd.concat([train_df, test_df])[['latitude','longitude']])
train_df['gmm'] = gm[:len(train_df)]
test_df['gmm'] = gm[len(train_df):]
# Disabled experiment kept as a bare string literal (never executed):
# per-component membership probabilities as GMM_PROBA_* columns.
"""gmm_proba = clf.predict_proba(pd.concat([train_df, test_df])[['latitude','longitude']])
train_df = pd.concat([train_df, pd.DataFrame(gmm_proba[:len(train_df)]).add_prefix('GMM_PROBA_')], axis=1)
test_df = pd.concat([test_df, pd.DataFrame(gmm_proba[len(train_df):]).add_prefix('GMM_PROBA_')], axis=1)"""
"gmm_proba = clf.predict_proba(pd.concat([train_df, test_df])[['latitude','longitude']])\ntrain_df = pd.concat([train_df, pd.DataFrame(gmm_proba[:len(train_df)]).add_prefix('GMM_PROBA_')], axis=1)\ntest_df = pd.concat([test_df, pd.DataFrame(gmm_proba[len(train_df):]).add_prefix('GMM_PROBA_')], axis=1)"
# Map of the training listings coloured by log-target.
plt.figure(figsize=(15, 15))
# x / y are passed by keyword: newer seaborn releases no longer accept them
# positionally, while older releases accept the keyword form as well.
sns.scatterplot(x=train_df['latitude'], y=train_df['longitude'], hue=train_df['y_log'])
<AxesSubplot:xlabel='latitude', ylabel='longitude'>
## Distance from the centroid
# The centroid is the mean latitude/longitude over train and test together.
all_df = pd.concat([train_df, test_df])
lat = all_df['latitude'].mean()
long = all_df['longitude'].mean()
centroid = (lat, long)

# Euclidean distance (in raw degrees) of every listing to the centroid,
# computed for the whole frame at once instead of row by row.
for df in (train_df, test_df):
    coords = df[['latitude', 'longitude']].to_numpy(dtype=float)
    df['centroid_dist'] = np.linalg.norm(coords - np.asarray(centroid), axis=1)
100%|███████████████████████████████████████████████████████████████████████████| 9990/9990 [00:00<00:00, 14796.65it/s] 100%|███████████████████████████████████████████████████████████████████████████| 4996/4996 [00:00<00:00, 12676.14it/s]
# Training target (log of y) and the columns that must not be fed to the
# model directly: raw text, identifiers, the target itself, and the
# categorical columns that are handled separately further down.
target = 'y_log'
del_columns = ['name', 'y', 'y_log', 'room_type', 'last_review', 'neighbourhood', 'host_id', 'id', 'station_name', 'gmm']
# Keep the DataFrame's own column order. Building the list from a set
# difference gives an arbitrary order that can change between runs.
features = [col for col in train_df.columns if col not in del_columns]
['USE_4', 'TFIDF_504', 'TFIDF_283', 'TFIDF_12', 'USE_1', 'TFIDF_229', 'SVD_1', 'TFIDF_68', 'BERT_0', 'TFIDF_429', 'TFIDF_316', 'TFIDF_277', 'TFIDF_19', 'TFIDF_280', 'TFIDF_447', 'dist_under_005_count', 'TFIDF_245', 'TFIDF_518', 'TFIDF_392', 'BERT_12', 'TFIDF_246', 'BERT_13', 'TFIDF_200', 'TFIDF_210', 'TFIDF_175', 'TFIDF_465', 'TFIDF_127', 'TFIDF_24', 'TFIDF_349', 'USE_20', 'TFIDF_257', 'TFIDF_160', 'TFIDF_219', 'TFIDF_174', 'TFIDF_406', 'TFIDF_516', 'TFIDF_488', 'latitude', 'TFIDF_265', 'USE_14', 'TFIDF_393', 'USE_27', 'TFIDF_44', 'TFIDF_81', 'USE_8', 'SVD_19', 'BERT_15', 'TFIDF_10', 'TFIDF_284', 'USE_41', 'TFIDF_251', 'TFIDF_49', 'USE_43', 'TFIDF_473', 'TFIDF_240', 'TFIDF_168', 'TFIDF_43', 'TFIDF_181', 'TFIDF_119', 'SVD_12', 'TFIDF_163', 'TFIDF_165', 'TFIDF_35', 'TFIDF_92', 'TFIDF_64', 'TFIDF_0', 'TFIDF_338', 'TFIDF_38', 'TFIDF_102', 'TFIDF_39', 'TFIDF_80', 'TFIDF_421', 'TFIDF_228', 'TFIDF_466', 'TFIDF_501', 'TFIDF_86', 'TFIDF_339', 'TFIDF_218', 'TFIDF_404', 'TFIDF_198', 'TFIDF_118', 'TFIDF_153', 'TFIDF_419', 'TFIDF_356', 'USE_38', 'TFIDF_440', 'TFIDF_343', 'TFIDF_484', 'TFIDF_411', 'TFIDF_32', 'USE_19', 'TFIDF_348', 'TFIDF_30', 'USE_46', 'TFIDF_471', 'USE_42', 'USE_33', 'TFIDF_413', 'TFIDF_409', 'TFIDF_111', 'TFIDF_199', 'TFIDF_370', 'TFIDF_117', 'TFIDF_247', 'TFIDF_164', 'TFIDF_264', 'TFIDF_369', 'TFIDF_205', 'TFIDF_363', 'TFIDF_162', 'TFIDF_482', 'USE_34', 'TFIDF_5', 'TFIDF_121', 'TFIDF_298', 'TFIDF_454', 'TFIDF_242', 'USE_48', 'TFIDF_311', 'TFIDF_282', 'TFIDF_241', 'TFIDF_397', 'TFIDF_486', 'TFIDF_511', 'TFIDF_427', 'TFIDF_360', 'TFIDF_137', 'TFIDF_295', 'TFIDF_91', 'TFIDF_45', 'TFIDF_148', 'TFIDF_498', 'TFIDF_250', 'TFIDF_63', 'TFIDF_291', 'TFIDF_191', 'TFIDF_286', 'TFIDF_54', 'TFIDF_314', 'SVD_7', 'TFIDF_461', 'TFIDF_439', 'TFIDF_326', 'TFIDF_459', 'TFIDF_248', 'TFIDF_176', 'TFIDF_154', 'TFIDF_72', 'TFIDF_288', 'TFIDF_336', 'SVD_11', 'TFIDF_455', 'TFIDF_53', 'TFIDF_236', 'TFIDF_350', 'TFIDF_126', 'TFIDF_494', 'TFIDF_243', 'BERT_8', 'TFIDF_462', 'BERT_6', 
'BERT_11', 'TFIDF_319', 'minimum_nights', 'TFIDF_161', 'TFIDF_217', 'is_skytree', 'USE_12', 'TFIDF_359', 'TFIDF_268', 'TFIDF_129', 'TFIDF_116', 'TFIDF_249', 'TFIDF_307', 'USE_18', 'TFIDF_436', 'TFIDF_479', 'TFIDF_52', 'TFIDF_100', 'TFIDF_365', 'TFIDF_262', 'BERT_4', 'TFIDF_513', 'TFIDF_76', 'USE_11', 'TFIDF_4', 'TFIDF_524', 'TFIDF_366', 'TFIDF_502', 'TFIDF_211', 'TFIDF_169', 'TFIDF_70', 'TFIDF_353', 'TFIDF_46', 'TFIDF_415', 'BERT_2', 'TFIDF_96', 'TFIDF_170', 'TFIDF_27', 'TFIDF_171', 'TFIDF_254', 'TFIDF_7', 'TFIDF_364', 'TFIDF_3', 'TFIDF_224', 'TFIDF_297', 'TFIDF_444', 'TFIDF_235', 'TFIDF_90', 'TFIDF_394', 'TFIDF_327', 'TFIDF_213', 'TFIDF_214', 'TFIDF_95', 'TFIDF_281', 'TFIDF_259', 'TFIDF_390', 'TFIDF_467', 'TFIDF_189', 'TFIDF_523', 'TFIDF_414', 'TFIDF_58', 'TFIDF_113', 'TFIDF_216', 'TFIDF_8', 'USE_9', 'TFIDF_22', 'TFIDF_396', 'TFIDF_352', 'TFIDF_226', 'BERT_17', 'BERT_5', 'TFIDF_238', 'TFIDF_183', 'BERT_19', 'BERT_7', 'TFIDF_474', 'SVD_3', 'TFIDF_489', 'TFIDF_492', 'TFIDF_74', 'TFIDF_458', 'TFIDF_508', 'TFIDF_204', 'TFIDF_418', 'TFIDF_212', 'TFIDF_434', 'TFIDF_500', 'SVD_8', 'TFIDF_362', 'TFIDF_50', 'TFIDF_313', 'TFIDF_227', 'TFIDF_20', 'TFIDF_472', 'TFIDF_422', 'TFIDF_133', 'TFIDF_21', 'TFIDF_437', 'TFIDF_445', 'TFIDF_14', 'TFIDF_167', 'TFIDF_255', 'TFIDF_158', 'TFIDF_412', 'TFIDF_180', 'TFIDF_194', 'TFIDF_342', 'TFIDF_490', 'TFIDF_517', 'TFIDF_293', 'TFIDF_410', 'is_min', 'TFIDF_304', 'TFIDF_185', 'TFIDF_468', 'TFIDF_41', 'TFIDF_231', 'dist_under_001_count', 'TFIDF_425', 'USE_45', 'TFIDF_157', 'TFIDF_144', 'TFIDF_75', 'TFIDF_193', 'TFIDF_503', 'USE_40', 'TFIDF_428', 'TFIDF_256', 'TFIDF_42', 'BERT_1', 'SVD_15', 'USE_3', 'TFIDF_367', 'TFIDF_519', 'TFIDF_108', 'TFIDF_239', 'TFIDF_452', 'TFIDF_62', 'TFIDF_275', 'TFIDF_186', 'TFIDF_206', 'TFIDF_120', 'TFIDF_388', 'TFIDF_376', 'TFIDF_78', 'TFIDF_505', 'TFIDF_375', 'TFIDF_260', 'TFIDF_442', 'TFIDF_83', 'is_sale', 'TFIDF_51', 'SVD_9', 'TFIDF_521', 'TFIDF_136', 'TFIDF_37', 'TFIDF_82', 'TFIDF_207', 'TFIDF_61', 'TFIDF_485', 
'TFIDF_98', 'longitude', 'TFIDF_147', 'TFIDF_292', 'TFIDF_94', 'TFIDF_139', 'USE_22', 'TFIDF_202', 'TFIDF_496', 'BERT_3', 'reviews_per_month', 'TFIDF_497', 'TFIDF_483', 'USE_5', 'TFIDF_187', 'TFIDF_506', 'SVD_16', 'TFIDF_79', 'TFIDF_493', 'TFIDF_177', 'TFIDF_223', 'TFIDF_300', 'TFIDF_478', 'TFIDF_1', 'BERT_16', 'TFIDF_345', 'is_wifi', 'TFIDF_306', 'TFIDF_289', 'review_month', 'TFIDF_317', 'TFIDF_460', 'TFIDF_267', 'TFIDF_273', 'TFIDF_18', 'TFIDF_310', 'TFIDF_522', 'TFIDF_26', 'TFIDF_299', 'USE_25', 'TFIDF_65', 'TFIDF_463', 'TFIDF_77', 'TFIDF_225', 'TFIDF_448', 'centroid_dist', 'SVD_5', 'TFIDF_196', 'TFIDF_303', 'SVD_17', 'TFIDF_232', 'TFIDF_220', 'TFIDF_296', 'TFIDF_449', 'TFIDF_263', 'TFIDF_464', 'TFIDF_272', 'TFIDF_104', 'TFIDF_354', 'TFIDF_182', 'TFIDF_40', 'TFIDF_328', 'BERT_18', 'TFIDF_287', 'TFIDF_309', 'TFIDF_266', 'TFIDF_368', 'TFIDF_374', 'TFIDF_88', 'TFIDF_209', 'TFIDF_337', 'TFIDF_128', 'TFIDF_357', 'TFIDF_384', 'USE_15', 'TFIDF_382', 'TFIDF_457', 'TFIDF_446', 'TFIDF_371', 'TFIDF_112', 'USE_26', 'TFIDF_258', 'TFIDF_308', 'USE_0', 'USE_39', 'TFIDF_134', 'TFIDF_285', 'TFIDF_107', 'SVD_14', 'TFIDF_109', 'TFIDF_203', 'TFIDF_330', 'TFIDF_402', 'TFIDF_244', 'TFIDF_130', 'TFIDF_150', 'TFIDF_201', 'TFIDF_221', 'TFIDF_395', 'TFIDF_387', 'TFIDF_332', 'USE_47', 'TFIDF_344', 'USE_49', 'TFIDF_125', 'TFIDF_347', 'TFIDF_451', 'TFIDF_190', 'TFIDF_234', 'TFIDF_115', 'TFIDF_315', 'TFIDF_443', 'TFIDF_140', 'TFIDF_379', 'TFIDF_340', 'TFIDF_495', 'SVD_2', 'TFIDF_401', 'TFIDF_331', 'TFIDF_416', 'TFIDF_333', 'TFIDF_346', 'TFIDF_33', 'TFIDF_97', 'TFIDF_135', 'TFIDF_290', 'TFIDF_405', 'TFIDF_105', 'TFIDF_253', 'TFIDF_114', 'TFIDF_145', 'TFIDF_301', 'TFIDF_73', 'TFIDF_407', 'TFIDF_93', 'TFIDF_358', 'TFIDF_279', 'TFIDF_514', 'USE_32', 'USE_23', 'TFIDF_274', 'TFIDF_152', 'word_count', 'TFIDF_380', 'USE_31', 'SVD_4', 'TFIDF_23', 'TFIDF_351', 'USE_29', 'TFIDF_433', 'TFIDF_55', 'TFIDF_85', 'TFIDF_391', 'TFIDF_16', 'TFIDF_302', 'TFIDF_420', 'TFIDF_67', 'USE_37', 'TFIDF_323', 
'TFIDF_99', 'TFIDF_509', 'TFIDF_29', 'TFIDF_276', 'TFIDF_512', 'TFIDF_132', 'TFIDF_215', 'TFIDF_324', 'last_review_month', 'TFIDF_438', 'TFIDF_470', 'TFIDF_149', 'TFIDF_173', 'TFIDF_355', 'SVD_0', 'USE_13', 'TFIDF_403', 'TFIDF_34', 'TFIDF_383', 'TFIDF_398', 'BERT_14', 'TFIDF_507', 'TFIDF_208', 'TFIDF_110', 'TFIDF_408', 'availability_365', 'TFIDF_318', 'TFIDF_499', 'TFIDF_469', 'USE_2', 'TFIDF_386', 'TFIDF_15', 'TFIDF_9', 'TFIDF_11', 'USE_10', 'TFIDF_57', 'TFIDF_56', 'USE_21', 'TFIDF_233', 'TFIDF_400', 'is_star', 'TFIDF_60', 'TFIDF_271', 'BERT_9', 'TFIDF_69', 'TFIDF_172', 'TFIDF_278', 'USE_6', 'TFIDF_373', 'TFIDF_389', 'TFIDF_261', 'TFIDF_122', 'TFIDF_321', 'TFIDF_325', 'TFIDF_334', 'TFIDF_195', 'TFIDF_31', 'TFIDF_230', 'TFIDF_377', 'TFIDF_84', 'TFIDF_450', 'TFIDF_361', 'TFIDF_477', 'TFIDF_143', 'TFIDF_491', 'SVD_18', 'TFIDF_156', 'TFIDF_89', 'TFIDF_131', 'USE_30', 'TFIDF_124', 'TFIDF_179', 'dist_under_01_count', 'distance', 'TFIDF_166', 'TFIDF_252', 'USE_7', 'TFIDF_378', 'SVD_10', 'TFIDF_66', 'TFIDF_515', 'TFIDF_476', 'TFIDF_417', 'TFIDF_312', 'name_len', 'TFIDF_17', 'TFIDF_294', 'TFIDF_430', 'USE_16', 'TFIDF_155', 'TFIDF_184', 'TFIDF_141', 'TFIDF_103', 'TFIDF_59', 'TFIDF_424', 'TFIDF_441', 'TFIDF_480', 'TFIDF_372', 'TFIDF_475', 'USE_17', 'TFIDF_192', 'is_free', 'TFIDF_178', 'TFIDF_432', 'TFIDF_25', 'TFIDF_481', 'TFIDF_2', 'USE_35', 'TFIDF_48', 'USE_36', 'TFIDF_87', 'TFIDF_28', 'TFIDF_269', 'TFIDF_520', 'USE_24', 'TFIDF_101', 'TFIDF_335', 'TFIDF_381', 'TFIDF_399', 'TFIDF_142', 'TFIDF_487', 'TFIDF_159', 'number_of_reviews', 'last_review_year', 'TFIDF_453', 'TFIDF_188', 'TFIDF_222', 'TFIDF_237', 'TFIDF_151', 'USE_44', 'TFIDF_197', 'TFIDF_47', 'TFIDF_456', 'TFIDF_13', 'BERT_10', 'USE_28', 'TFIDF_270', 'TFIDF_6', 'TFIDF_146', 'TFIDF_341', 'TFIDF_123', 'TFIDF_106', 'TFIDF_385', 'TFIDF_138', 'SVD_13', 'TFIDF_71', 'TFIDF_435', 'SVD_6', 'TFIDF_431', 'TFIDF_510', 'TFIDF_423', 'TFIDF_320', 'TFIDF_36', 'TFIDF_305', 'TFIDF_329', 'TFIDF_322', 'TFIDF_426']
def evaluation(true, pred):
    """Return the root of the mean squared error between `true` and `pred`."""
    mse = mean_squared_error(true, pred)
    return np.sqrt(mse)
# Label encoding
categoricals = ['room_type', 'neighbourhood', 'station_name', 'gmm']
# Append the categoricals without reshuffling the existing feature order.
features = list(features) + [c for c in categoricals if c not in features]
for c in categoricals:
    le = LabelEncoder()
    # Fit on train and test together so every value seen in either frame has
    # a code; calling transform on an unfitted encoder raises an error.
    le.fit(pd.concat([train_df[c], test_df[c]]).astype(str))
    train_df[c] = le.transform(train_df[c].astype(str))
    test_df[c] = le.transform(test_df[c].astype(str))
# Count encoding: replace each value by how often it occurs in train + test.
count_enc = ['host_id', 'room_type', 'neighbourhood', 'station_name', 'gmm']
count_features = []
for c in count_enc:
    # Only the column being encoded is concatenated; `DataFrame.append` is
    # avoided because it no longer exists in pandas 2.x.
    _count = pd.concat([train_df[c], test_df[c]]).astype(str).value_counts().to_dict()
    train_df[f'{c}_count'] = train_df[c].astype(str).map(_count)
    test_df[f'{c}_count'] = test_df[c].astype(str).map(_count)
    # Register the new column; without this the counts never reach the model.
    count_features.append(f'{c}_count')
# Append without reshuffling the existing feature order.
features = list(features) + [c for c in count_features if c not in features]
# LightGBM training parameters.
# NOTE(review): no iteration limit is set here although the training log shows
# well over 100 boosting rounds per fold -- some entries of the original dict
# may be missing from this file; confirm.
params = {
    'boosting_type': 'gbdt',
    # Reported as "l2" in the training log.
    'metric': 'regression',
    'objective': 'rmse',
    'n_jobs': -1,
    'seed': Config.RANDOM_SATE,
    'learning_rate': 0.01,
}
# Columns to target-encode and the target column(s) used for the encoding.
target_enc = ['neighbourhood', 'station_name', 'gmm']
target_enc_key = ['y_log']

oof_preds = np.zeros(len(train_df))   # out-of-fold predictions for every train row
y_pred = np.zeros(len(test_df))       # fold-averaged test predictions
models = []
cv_scores = {}

# Folds are stratified on the nearest station and grouped by host, so one
# host's listings never appear on both sides of a split.
skf = StratifiedGroupKFold(n_splits=Config.N_FOLD, random_state=Config.RANDOM_SATE, shuffle=True)
for fold, (train_index, test_index) in enumerate(skf.split(train_df[features], train_df['station_name'], train_df['host_id'])):
    print(f'====== fold {fold} ======')

    # Split into the training part and the validation part of this fold.
    fold_train_df = train_df.iloc[train_index]
    fold_val_df = train_df.iloc[test_index]
    # Explicit copies: new columns are added below and must not write into
    # views of train_df / test_df.
    x_train, x_val = fold_train_df[features].copy(), fold_val_df[features].copy()
    y_train, y_val = fold_train_df[target], fold_val_df[target]
    test = test_df[features].copy()

    # Target encoding. The category means come from the training part of the
    # fold only and are applied to train, validation and test alike, so no
    # validation target leaks into any feature. (Previously the training rows
    # were encoded with validation-fold means and the validation rows with
    # means over the full training set.)
    # NOTE: training rows still contribute to their own category mean.
    for t in target_enc_key:
        for c in target_enc:
            fold_means = fold_train_df.groupby(c)[t].mean()
            enc_col = f'{c}_target_enc_by_{t}'
            x_train[enc_col] = fold_train_df[c].map(fold_means)
            x_val[enc_col] = fold_val_df[c].map(fold_means)
            test[enc_col] = test_df[c].map(fold_means)

    train_features = x_train.columns.to_list()

    # Create the LightGBM datasets.
    train_set = lgb.Dataset(x_train, y_train, categorical_feature=categoricals, free_raw_data=False)
    val_set = lgb.Dataset(x_val, y_val, categorical_feature=categoricals, free_raw_data=False)

    # Train with early stopping on the validation set. The round limit is set
    # explicitly because the library default of 100 rounds is far below the
    # best iterations seen in the training log at learning_rate 0.01.
    # NOTE(review): `verbose_eval` / `early_stopping_rounds` are the keyword
    # form used by the LightGBM version this was run with; newer major
    # versions expect callbacks instead -- confirm the installed version.
    model = lgb.train(
        params,
        train_set,
        num_boost_round=10000,
        valid_sets=[train_set, val_set],
        verbose_eval=100,
        early_stopping_rounds=100,
    )
    # Keep the fitted booster so it can be inspected after the loop.
    models.append(model)

    fold_pred = model.predict(x_val)
    score = evaluation(y_val, fold_pred)
    cv_scores[f'cv{fold}'] = score
    oof_preds[test_index] = fold_pred
    y_pred += model.predict(test) / Config.N_FOLD
    print(f'cv score is {score}')

oof_score = evaluation(train_df[target], oof_preds)
print(f'OOF score is {oof_score}')
====== fold 0 ====== [LightGBM] [Warning] Auto-choosing col-wise multi-threading, the overhead of testing was 0.020747 seconds. You can set `force_col_wise=true` to remove the overhead. [LightGBM] [Info] Total Bins 37060 [LightGBM] [Info] Number of data points in the train set: 8084, number of used features: 469 [LightGBM] [Info] Start training from score 9.485144 Training until validation scores don't improve for 100 rounds [100] training's l2: 0.488048 valid_1's l2: 0.698058 [200] training's l2: 0.355449 valid_1's l2: 0.640509 [300] training's l2: 0.286456 valid_1's l2: 0.620083 [400] training's l2: 0.23974 valid_1's l2: 0.607584 [500] training's l2: 0.206211 valid_1's l2: 0.598706 [600] training's l2: 0.180307 valid_1's l2: 0.593431 [700] training's l2: 0.159994 valid_1's l2: 0.589344 [800] training's l2: 0.14347 valid_1's l2: 0.585377 [900] training's l2: 0.129461 valid_1's l2: 0.583878 [1000] training's l2: 0.117605 valid_1's l2: 0.581992 [1100] training's l2: 0.107127 valid_1's l2: 0.579298 [1200] training's l2: 0.098341 valid_1's l2: 0.577533 [1300] training's l2: 0.0905596 valid_1's l2: 0.575875 Early stopping, best iteration is: [1282] training's l2: 0.0918336 valid_1's l2: 0.575756 cv score is 0.7587861164409755 ====== fold 1 ====== [LightGBM] [Warning] Auto-choosing col-wise multi-threading, the overhead of testing was 0.030324 seconds. You can set `force_col_wise=true` to remove the overhead. 
[LightGBM] [Info] Total Bins 37074 [LightGBM] [Info] Number of data points in the train set: 8061, number of used features: 472 [LightGBM] [Info] Start training from score 9.433627 Training until validation scores don't improve for 100 rounds [100] training's l2: 0.490696 valid_1's l2: 0.723275 [200] training's l2: 0.348588 valid_1's l2: 0.677438 [300] training's l2: 0.276958 valid_1's l2: 0.661627 [400] training's l2: 0.23099 valid_1's l2: 0.659778 [500] training's l2: 0.1973 valid_1's l2: 0.657879 [600] training's l2: 0.172069 valid_1's l2: 0.654605 [700] training's l2: 0.152057 valid_1's l2: 0.650254 [800] training's l2: 0.136132 valid_1's l2: 0.647464 [900] training's l2: 0.122567 valid_1's l2: 0.645914 [1000] training's l2: 0.111722 valid_1's l2: 0.645856 Early stopping, best iteration is: [950] training's l2: 0.116976 valid_1's l2: 0.645199 cv score is 0.8032430628385007 ====== fold 2 ====== [LightGBM] [Warning] Auto-choosing col-wise multi-threading, the overhead of testing was 0.020039 seconds. You can set `force_col_wise=true` to remove the overhead. 
[LightGBM] [Info] Total Bins 36694 [LightGBM] [Info] Number of data points in the train set: 7858, number of used features: 458 [LightGBM] [Info] Start training from score 9.480055 Training until validation scores don't improve for 100 rounds [100] training's l2: 0.501974 valid_1's l2: 0.641378 [200] training's l2: 0.358455 valid_1's l2: 0.621378 [300] training's l2: 0.287479 valid_1's l2: 0.609193 [400] training's l2: 0.240488 valid_1's l2: 0.599782 [500] training's l2: 0.206102 valid_1's l2: 0.592426 [600] training's l2: 0.179476 valid_1's l2: 0.586709 [700] training's l2: 0.158619 valid_1's l2: 0.583386 [800] training's l2: 0.141267 valid_1's l2: 0.581166 [900] training's l2: 0.127166 valid_1's l2: 0.579021 [1000] training's l2: 0.115052 valid_1's l2: 0.577815 [1100] training's l2: 0.104588 valid_1's l2: 0.576488 [1200] training's l2: 0.0956801 valid_1's l2: 0.575222 [1300] training's l2: 0.0879042 valid_1's l2: 0.574342 [1400] training's l2: 0.0809795 valid_1's l2: 0.573692 [1500] training's l2: 0.0748301 valid_1's l2: 0.572872 [1600] training's l2: 0.0692713 valid_1's l2: 0.572203 [1700] training's l2: 0.0641214 valid_1's l2: 0.571866 [1800] training's l2: 0.0596875 valid_1's l2: 0.571812 [1900] training's l2: 0.0557035 valid_1's l2: 0.571907 Early stopping, best iteration is: [1843] training's l2: 0.0579282 valid_1's l2: 0.571618 cv score is 0.7560543840213751 ====== fold 3 ====== [LightGBM] [Warning] Auto-choosing col-wise multi-threading, the overhead of testing was 0.029729 seconds. You can set `force_col_wise=true` to remove the overhead. 
[LightGBM] [Info] Total Bins 37088 [LightGBM] [Info] Number of data points in the train set: 8044, number of used features: 463 [LightGBM] [Info] Start training from score 9.460043 Training until validation scores don't improve for 100 rounds [100] training's l2: 0.495351 valid_1's l2: 0.590806 [200] training's l2: 0.357956 valid_1's l2: 0.560963 [300] training's l2: 0.28442 valid_1's l2: 0.557662 [400] training's l2: 0.237224 valid_1's l2: 0.55367 [500] training's l2: 0.20328 valid_1's l2: 0.552406 [600] training's l2: 0.177356 valid_1's l2: 0.548525 [700] training's l2: 0.156841 valid_1's l2: 0.54736 [800] training's l2: 0.140504 valid_1's l2: 0.546186 [900] training's l2: 0.126994 valid_1's l2: 0.54589 [1000] training's l2: 0.115542 valid_1's l2: 0.54398 [1100] training's l2: 0.105512 valid_1's l2: 0.54394 Early stopping, best iteration is: [1073] training's l2: 0.108124 valid_1's l2: 0.54373 cv score is 0.737380513342473 ====== fold 4 ====== [LightGBM] [Warning] Auto-choosing col-wise multi-threading, the overhead of testing was 0.029168 seconds. You can set `force_col_wise=true` to remove the overhead. [LightGBM] [Info] Total Bins 36683 [LightGBM] [Info] Number of data points in the train set: 7913, number of used features: 469 [LightGBM] [Info] Start training from score 9.456076 Training until validation scores don't improve for 100 rounds [100] training's l2: 0.459676 valid_1's l2: 1.13661 [200] training's l2: 0.335324 valid_1's l2: 1.11005 [300] training's l2: 0.269613 valid_1's l2: 1.09511 [400] training's l2: 0.225178 valid_1's l2: 1.08983 [500] training's l2: 0.192335 valid_1's l2: 1.08412 [600] training's l2: 0.167498 valid_1's l2: 1.0799 [700] training's l2: 0.14797 valid_1's l2: 1.07581 [800] training's l2: 0.132492 valid_1's l2: 1.07327 [900] training's l2: 0.119821 valid_1's l2: 1.07109 Early stopping, best iteration is: [894] training's l2: 0.120568 valid_1's l2: 1.07088 cv score is 1.0348309838938978 OOF score is 0.8276368501192828
"""df_importance = None
for i, model in enumerate(models):
if df_importance is None:
_df = pd.DataFrame([model.feature_importance(importance_type='gain'), train_features]).T
_df.columns = [f'model_{i}_gain', 'feature']
df_importance = _df
_df = pd.DataFrame([model.feature_importance(importance_type='gain'), train_features]).T
_df.columns = [f'model_{i}_gain', 'feature']
df_importance = df_importance.merge(_df, how='outer', on='feature')
df_imp = df_importance
df_imp['mean'] = df_imp[[f'model_{i}_gain' for i in range(len(models))]].mean(axis=1)
order = df_imp.sort_values('mean', ascending=False)['feature'].tolist()
df_imp = pd.melt(df_imp, id_vars=['feature'], value_vars=[f'model_{i}_gain' for i in range(len(models))])
df_imp['value'] = df_imp['value'].astype(float)
fig, ax = plt.subplots(figsize=(len(df_imp['feature'].drop_duplicates()) * .4, 5))
sns.boxenplot(x="feature", y="value", data=df_imp, order=order)
ax.tick_params(axis='x', rotation=90)
ax.set_title('feature importance')
'df_importance = None\n\nfor i, model in enumerate(models):\n if df_importance is None:\n _df = pd.DataFrame([model.feature_importance(importance_type=\'gain\'), train_features]).T\n _df.columns = [f\'model_{i}_gain\', \'feature\']\n df_importance = _df\n else:\n _df = pd.DataFrame([model.feature_importance(importance_type=\'gain\'), train_features]).T\n _df.columns = [f\'model_{i}_gain\', \'feature\']\n df_importance = df_importance.merge(_df, how=\'outer\', on=\'feature\')\n\ndf_imp = df_importance\ndf_imp[\'mean\'] = df_imp[[f\'model_{i}_gain\' for i in range(len(models))]].mean(axis=1)\norder = df_imp.sort_values(\'mean\', ascending=False)[\'feature\'].tolist()\n\ndf_imp = pd.melt(df_imp, id_vars=[\'feature\'], value_vars=[f\'model_{i}_gain\' for i in range(len(models))])\ndf_imp[\'value\'] = df_imp[\'value\'].astype(float)\n\nfig, ax = plt.subplots(figsize=(len(df_imp[\'feature\'].drop_duplicates()) * .4, 5))\nsns.boxenplot(x="feature", y="value", data=df_imp, order=order)\nax.tick_params(axis=\'x\', rotation=90)\nax.set_title(\'feature importance\')\nplt.show()'
# Diagnostics for the out-of-fold predictions:
# left  -- true vs. predicted log-target, right -- both distributions overlaid.
fig, axes = plt.subplots(1, 2, figsize=(15, 5))
axes = axes.reshape(-1)
# x / y are passed by keyword: newer seaborn releases no longer accept them
# positionally, while older releases accept the keyword form as well.
sns.scatterplot(x=train_df['y_log'], y=oof_preds, ax=axes[0], alpha=0.5)
sns.distplot(train_df['y_log'], ax=axes[1], label='true')
sns.distplot(oof_preds, ax=axes[1], label='pred')
# The 'true' / 'pred' labels are only visible once a legend is drawn.
axes[1].legend()
<matplotlib.legend.Legend at 0x1fb58f38f40>
# The model predicts log(y); convert back to the original scale.
test_df['y'] = np.exp(y_pred)

# Write the submission, creating the output directory first so the save does
# not fail on a fresh checkout.
submission_path = Path('../output/sub15.csv')
submission_path.parent.mkdir(parents=True, exist_ok=True)
test_df[['id', 'y']].to_csv(submission_path, index=False)