import warnings
import os
from pathlib import Path
import pandas as pd
import lightgbm as lgb
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import StratifiedKFold, StratifiedGroupKFold
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
from sklearn import mixture
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD, PCA
import tensorflow as tf
import tensorflow_text
import tensorflow_hub as hub
from tensorflow import keras
from numba import cuda
import torch
import transformers
from transformers import BertTokenizer
from tqdm import tqdm
warnings.simplefilter('ignore')
pd.set_option('display.max_columns', 25)
pd.set_option('display.max_rows', 25)
ROOT_DIR = Path('../')
DATA_DIR = ROOT_DIR / Path('data')
class Config:
    N_FOLD = 5
    RANDOM_STATE = 42
train_df = pd.read_csv(DATA_DIR / Path('train_data.csv'))
test_df = pd.read_csv(DATA_DIR / Path('test_data.csv'))
station_list_df = pd.read_csv(DATA_DIR / Path('station_list.csv'))
submission_df = pd.read_csv(DATA_DIR / Path('submission.csv'))
display(train_df.shape)
display(train_df.head(5))
display(test_df.shape)
display(test_df.head(5))
display(station_list_df.head(5))
(9990, 13)
|   | id | name | host_id | neighbourhood | latitude | longitude | room_type | minimum_nights | number_of_reviews | last_review | reviews_per_month | availability_365 | y |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | KiyosumiShirakawa 3min\|★SkyTree★\|WIFI\|Max4\|Tre... | 242899459 | Koto Ku | 35.68185 | 139.80310 | Entire home/apt | 1 | 55 | 2020-04-25 | 2.21 | 173 | 12008 |
| 1 | 2 | Downtown Tokyo Iriya next to Ueno | 308879948 | Taito Ku | 35.72063 | 139.78536 | Entire home/apt | 6 | 72 | 2020-03-25 | 2.11 | 9 | 6667 |
| 2 | 3 | Japan Style,Private,Affordable,4min to Sta. | 300877823 | Katsushika Ku | 35.74723 | 139.82349 | Entire home/apt | 1 | 18 | 2020-03-23 | 3.46 | 288 | 9923 |
| 3 | 4 | 4 min to Shinjuku Sta. by train / 2 ppl / Wi-fi | 236935461 | Shibuya Ku | 35.68456 | 139.68077 | Entire home/apt | 1 | 2 | 2020-04-02 | 1.76 | 87 | 8109 |
| 4 | 5 | LICENSED SHINJUKU HOUSE: Heart of the action! | 243408889 | Shinjuku Ku | 35.69840 | 139.70467 | Entire home/apt | 1 | 86 | 2020-01-30 | 2.00 | 156 | 100390 |
(4996, 12)
|   | id | name | host_id | neighbourhood | latitude | longitude | room_type | minimum_nights | number_of_reviews | last_review | reviews_per_month | availability_365 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 5-minute walk from Akasaka Sta, Superior double | 184730720 | Minato Ku | 35.67131 | 139.73285 | Private room | 1 | 0 | NaN | NaN | 183 |
| 1 | 2 | 7 min Sta.-Center of IKEBUKURO Cozy Room#503 | 20993205 | Toshima Ku | 35.73014 | 139.71739 | Entire home/apt | 2 | 21 | 2020-04-16 | 1.94 | 337 |
| 2 | 3 | Designer'sApt 1min sta☆Shinjuku 7min☆Shibuya 4min | 322521715 | Setagaya Ku | 35.66193 | 139.66540 | Entire home/apt | 1 | 14 | 2020-02-12 | 0.82 | 240 |
| 3 | 4 | Komagome Station 2 minutes on foot | 234477095 | Toshima Ku | 35.73603 | 139.74794 | Entire home/apt | 1 | 16 | 2020-02-17 | 1.19 | 0 |
| 4 | 5 | Monthly/Metro1min/JR5min/Ueno,Asakusa,Akihabara | 145453833 | Taito Ku | 35.72126 | 139.78320 | Entire home/apt | 30 | 2 | 2019-07-21 | 0.19 | 164 |
|   | station_name | longitude | latitude |
|---|---|---|---|
| 0 | 白丸 | 139.114861 | 35.811735 |
| 1 | 古里 | 139.152102 | 35.816247 |
| 2 | 川井 | 139.164290 | 35.813697 |
| 3 | 御嶽 | 139.182589 | 35.801468 |
| 4 | 沢井 | 139.193324 | 35.805940 |
train_df['y_log'] = train_df['y'].apply(np.log)
for room_type, room_type_df in train_df.groupby('room_type'):
    sns.distplot(room_type_df['y_log'], label=room_type)
plt.legend()
# Nearest-station features for the training set: for each listing, find the closest
# station (Euclidean distance in lat/lon degrees) and count stations within three radii.
station_pos_array = []
for _, station in station_list_df.iterrows():
    b = np.array([station['latitude'], station['longitude']])
    station_pos_array.append((station['station_name'], b))

nearest = []
for idx, row in tqdm(train_df.iterrows(), total=len(train_df)):
    a = np.array([row['latitude'], row['longitude']])
    dist_list = []
    for station_name, b in station_pos_array:
        dist = np.linalg.norm(a - b)
        dist_list.append({
            'station_name': station_name,
            'distance': dist
        })
    station_dist_all_df = pd.DataFrame(dist_list)
    station_dist_df = station_dist_all_df.sort_values('distance').iloc[0]
    nearest_station_name = station_dist_df['station_name']
    distance = station_dist_df['distance']
    dist_under_001_count = (station_dist_all_df['distance'] < 0.01).sum()
    dist_under_005_count = (station_dist_all_df['distance'] < 0.05).sum()
    dist_under_01_count = (station_dist_all_df['distance'] < 0.1).sum()
    nearest.append({
        'id': row['id'],
        'station_name': nearest_station_name,
        'distance': distance,
        'dist_under_001_count': dist_under_001_count,
        'dist_under_005_count': dist_under_005_count,
        'dist_under_01_count': dist_under_01_count,
    })
train_df = train_df.merge(pd.DataFrame(nearest), how='left', on='id')
100%|█████████████████████████████████████████████████████████████████████████████| 9990/9990 [00:55<00:00, 181.03it/s]
# Same nearest-station features for the test set.
nearest = []
for idx, row in tqdm(test_df.iterrows(), total=len(test_df)):
    a = np.array([row['latitude'], row['longitude']])
    dist_list = []
    for station_name, b in station_pos_array:
        dist = np.linalg.norm(a - b)
        dist_list.append({
            'station_name': station_name,
            'distance': dist
        })
    station_dist_all_df = pd.DataFrame(dist_list)
    station_dist_df = station_dist_all_df.sort_values('distance').iloc[0]
    nearest_station_name = station_dist_df['station_name']
    distance = station_dist_df['distance']
    dist_under_001_count = (station_dist_all_df['distance'] < 0.01).sum()
    dist_under_005_count = (station_dist_all_df['distance'] < 0.05).sum()
    dist_under_01_count = (station_dist_all_df['distance'] < 0.1).sum()
    nearest.append({
        'id': row['id'],
        'station_name': nearest_station_name,
        'distance': distance,
        'dist_under_001_count': dist_under_001_count,
        'dist_under_005_count': dist_under_005_count,
        'dist_under_01_count': dist_under_01_count,
    })
test_df = test_df.merge(pd.DataFrame(nearest), how='left', on='id')
100%|█████████████████████████████████████████████████████████████████████████████| 4996/4996 [00:27<00:00, 179.75it/s]
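The two loops above recompute every listing-to-station distance in pure Python, which is O(listings × stations). A vectorized sketch using SciPy's cKDTree (an assumption that SciPy is available; it works in the same lat/lon degree space, so it should reproduce these features up to a negligible `<` vs `<=` boundary difference, but it is not what was actually run here):

from scipy.spatial import cKDTree

station_xy = station_list_df[['latitude', 'longitude']].to_numpy()
tree = cKDTree(station_xy)

def add_station_features(df):
    # nearest station plus counts of stations within 0.01 / 0.05 / 0.1 degrees
    xy = df[['latitude', 'longitude']].to_numpy()
    dist, idx = tree.query(xy, k=1)
    df['station_name'] = station_list_df['station_name'].to_numpy()[idx]
    df['distance'] = dist
    for r, col in [(0.01, 'dist_under_001_count'), (0.05, 'dist_under_005_count'), (0.1, 'dist_under_01_count')]:
        df[col] = [len(hits) for hits in tree.query_ball_point(xy, r=r)]
    return df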
train_df['name'] = train_df['name'].apply(lambda x : ' '.join(x.split('/')))
test_df['name'] = test_df['name'].apply(lambda x : ' '.join(x.split('/')))
train_df['name'] = train_df['name'].apply(lambda x : ' '.join(x.split('|')))
test_df['name'] = test_df['name'].apply(lambda x : ' '.join(x.split('|')))
train_df['word_count'] = train_df['name'].apply(lambda x : len(x.split()))
test_df['word_count'] = test_df['name'].apply(lambda x : len(x.split()))
train_df['name_len'] = train_df['name'].apply(lambda x : len(x))
test_df['name_len'] = test_df['name'].apply(lambda x : len(x))
# Keywords in the listing name that look likely to carry signal
train_df['is_wifi'] = train_df['name'].apply(lambda x : 'wi-fi' in x.lower() or 'wifi' in x.lower())
test_df['is_wifi'] = test_df['name'].apply(lambda x : 'wi-fi' in x.lower() or 'wifi' in x.lower())
train_df['is_free'] = train_df['name'].apply(lambda x : 'free' in x.lower())
test_df['is_free'] = test_df['name'].apply(lambda x : 'free' in x.lower())
train_df['is_min'] = train_df['name'].apply(lambda x : 'min ' in x.lower())
test_df['is_min'] = test_df['name'].apply(lambda x : 'min ' in x.lower())
train_df['is_skytree'] = train_df['name'].apply(lambda x : 'skytree' in x.lower())
test_df['is_skytree'] = test_df['name'].apply(lambda x : 'skytree' in x.lower())
train_df['is_sale'] = train_df['name'].apply(lambda x : 'sale' in x.lower())
test_df['is_sale'] = test_df['name'].apply(lambda x : 'sale' in x.lower())
train_df['is_star'] = train_df['name'].apply(lambda x : '★' in x.lower() or '☆' in x.lower())
test_df['is_star'] = test_df['name'].apply(lambda x : '★' in x.lower() or '☆' in x.lower())
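The six flag columns above all follow the same pattern; an equivalent, more compact version (a sketch that produces the same boolean columns, not the code that was run) loops over a small keyword map:

keyword_map = {
    'is_wifi': ('wi-fi', 'wifi'),
    'is_free': ('free',),
    'is_min': ('min ',),
    'is_skytree': ('skytree',),
    'is_sale': ('sale',),
    'is_star': ('★', '☆'),
}
for df in (train_df, test_df):
    lowered = df['name'].str.lower()
    for col, words in keyword_map.items():
        # flag is True if any of the keywords appears in the lowercased name
        df[col] = lowered.apply(lambda s: any(w in s for w in words))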
all_df = pd.concat([train_df, test_df])
tv = TfidfVectorizer(min_df=20)
features = tv.fit_transform(all_df["name"])
train_df = pd.concat([train_df, pd.DataFrame(features.toarray()[:len(train_df)]).add_prefix('TFIDF_')], axis=1)
test_df = pd.concat([test_df, pd.DataFrame(features.toarray()[len(train_df):]).add_prefix('TFIDF_')], axis=1)
svd = TruncatedSVD(n_components=20, random_state=Config.RANDOM_STATE)
svd_features = svd.fit_transform(features.toarray())
train_df = pd.concat([train_df, pd.DataFrame(svd_features[:len(train_df)]).add_prefix('SVD_')], axis=1)
test_df = pd.concat([test_df, pd.DataFrame(svd_features[len(train_df):]).add_prefix('SVD_')], axis=1)
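As a quick check (not part of the original run), TruncatedSVD reports how much of the TF-IDF variance those 20 components retain:

print(f'TF-IDF variance kept by 20 SVD components: {svd.explained_variance_ratio_.sum():.3f}')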
tqdm.pandas()
from sklearn.decomposition import NMF
all_df = pd.concat([train_df, test_df])
embedder = hub.load("https://tfhub.dev/google/universal-sentence-encoder-multilingual/3")
features = np.stack(all_df["name"].fillna("").progress_apply(lambda x: embedder(x).numpy().reshape(-1)).values)
100%|████████████████████████████████████████████████████████████████████████████| 14986/14986 [04:11<00:00, 59.66it/s]
svd = TruncatedSVD(n_components=50, random_state=Config.RANDOM_STATE)
svd_features = svd.fit_transform(features)
train_df = pd.concat([train_df, pd.DataFrame(svd_features[:len(train_df)]).add_prefix('USE_')], axis=1)
test_df = pd.concat([test_df, pd.DataFrame(svd_features[len(train_df):]).add_prefix('USE_')], axis=1)
# Free the GPU memory held by the TF-Hub encoder before loading the PyTorch BERT model
del embedder
cuda.select_device(0)
cuda.close()
keras.backend.clear_session()
class BertSequenceVectorizer:
    def __init__(self, model_name="bert-base-uncased", max_len=128):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model_name = model_name
        self.tokenizer = BertTokenizer.from_pretrained(self.model_name)
        self.bert_model = transformers.BertModel.from_pretrained(self.model_name)
        self.bert_model = self.bert_model.to(self.device)
        self.max_len = max_len

    def vectorize(self, sentence: str) -> np.array:
        inp = self.tokenizer.encode(sentence)
        len_inp = len(inp)
        # truncate or pad to max_len and build the matching attention mask
        if len_inp >= self.max_len:
            inputs = inp[:self.max_len]
            masks = [1] * self.max_len
        else:
            inputs = inp + [0] * (self.max_len - len_inp)
            masks = [1] * len_inp + [0] * (self.max_len - len_inp)
        inputs_tensor = torch.tensor([inputs], dtype=torch.long).to(self.device)
        masks_tensor = torch.tensor([masks], dtype=torch.long).to(self.device)
        bert_out = self.bert_model(inputs_tensor, masks_tensor)
        seq_out, pooled_out = bert_out['last_hidden_state'], bert_out['pooler_output']
        if torch.cuda.is_available():
            return seq_out[0][0].cpu().detach().numpy()  # index 0 is the [CLS] token: a 768-dim sentence embedding
        else:
            return seq_out[0][0].detach().numpy()
%%time
all_df = pd.concat([train_df, test_df])
BSV = BertSequenceVectorizer(model_name="bert-base-multilingual-uncased", max_len=128)
features = np.stack(all_df["name"].fillna("").map(lambda x: BSV.vectorize(x).reshape(-1)).values)
Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight'] - This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). - This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). Wall time: 3min 24s
pca = PCA(n_components=20)
pca_features = pca.fit_transform(features)
train_df = pd.concat([train_df, pd.DataFrame(pca_features[:len(train_df)]).add_prefix('BERT_')], axis=1)
test_df = pd.concat([test_df, pd.DataFrame(pca_features[len(train_df):]).add_prefix('BERT_')], axis=1)
train_df['last_review_year'] = pd.to_datetime(train_df['last_review']).dt.year
train_df['last_review_month'] = pd.to_datetime(train_df['last_review']).dt.month
test_df['last_review_year'] = pd.to_datetime(test_df['last_review']).dt.year
test_df['last_review_month'] = pd.to_datetime(test_df['last_review']).dt.month
# Approximate number of months a listing has been collecting reviews
# (the original divided number_of_reviews by itself, which yields a constant)
train_df['review_month'] = train_df['number_of_reviews'] / train_df['reviews_per_month']
test_df['review_month'] = test_df['number_of_reviews'] / test_df['reviews_per_month']
# GMM clustering of listing coordinates
clf = mixture.GaussianMixture(n_components=10, covariance_type='full', random_state=Config.RANDOM_STATE)
gm = clf.fit_predict(pd.concat([train_df, test_df])[['latitude','longitude']])
train_df['gmm'] = gm[:len(train_df)]
test_df['gmm'] = gm[len(train_df):]
"""gmm_proba = clf.predict_proba(pd.concat([train_df, test_df])[['latitude','longitude']])
train_df = pd.concat([train_df, pd.DataFrame(gmm_proba[:len(train_df)]).add_prefix('GMM_PROBA_')], axis=1)
test_df = pd.concat([test_df, pd.DataFrame(gmm_proba[len(train_df):]).add_prefix('GMM_PROBA_')], axis=1)"""
"gmm_proba = clf.predict_proba(pd.concat([train_df, test_df])[['latitude','longitude']])\ntrain_df = pd.concat([train_df, pd.DataFrame(gmm_proba[:len(train_df)]).add_prefix('GMM_PROBA_')], axis=1)\ntest_df = pd.concat([test_df, pd.DataFrame(gmm_proba[len(train_df):]).add_prefix('GMM_PROBA_')], axis=1)"
plt.figure(figsize=(15, 15))
sns.scatterplot(train_df['latitude'], train_df['longitude'], hue=train_df['y_log'])
## Distance from the centroid
all_df = pd.concat([train_df, test_df])
lat = all_df['latitude'].mean()
long = all_df['longitude'].mean()
centroid = (lat, long)
centroid_dist = []
for idx, row in tqdm(train_df.iterrows(), total=len(train_df)):
    a = np.array([row['latitude'], row['longitude']])
    centroid_dist.append(np.linalg.norm(a - centroid))
train_df['centroid_dist'] = centroid_dist

centroid_dist = []
for idx, row in tqdm(test_df.iterrows(), total=len(test_df)):
    a = np.array([row['latitude'], row['longitude']])
    centroid_dist.append(np.linalg.norm(a - centroid))
test_df['centroid_dist'] = centroid_dist
100%|███████████████████████████████████████████████████████████████████████████| 9990/9990 [00:00<00:00, 14796.65it/s]
100%|███████████████████████████████████████████████████████████████████████████| 4996/4996 [00:00<00:00, 12676.14it/s]
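For reference, these row-wise loops can be collapsed into one vectorized expression (a sketch computing the same Euclidean distance in degree space, not what was run above):

for df in (train_df, test_df):
    coords = df[['latitude', 'longitude']].to_numpy()
    df['centroid_dist'] = np.linalg.norm(coords - np.array(centroid), axis=1)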
target = 'y_log'
del_columns = ['name', 'y', 'y_log', 'room_type', 'last_review', 'neighbourhood', 'host_id', 'id', 'station_name', 'gmm']
features = list(set(train_df.columns) - set(del_columns))
print(features)
['USE_4', 'TFIDF_504', 'TFIDF_283', 'TFIDF_12', 'USE_1', 'TFIDF_229', 'SVD_1', 'TFIDF_68', 'BERT_0', 'TFIDF_429', 'TFIDF_316', 'TFIDF_277', 'TFIDF_19', 'TFIDF_280', 'TFIDF_447', 'dist_under_005_count', 'TFIDF_245', 'TFIDF_518', 'TFIDF_392', 'BERT_12', 'TFIDF_246', 'BERT_13', 'TFIDF_200', 'TFIDF_210', 'TFIDF_175', 'TFIDF_465', 'TFIDF_127', 'TFIDF_24', 'TFIDF_349', 'USE_20', 'TFIDF_257', 'TFIDF_160', 'TFIDF_219', 'TFIDF_174', 'TFIDF_406', 'TFIDF_516', 'TFIDF_488', 'latitude', 'TFIDF_265', 'USE_14', 'TFIDF_393', 'USE_27', 'TFIDF_44', 'TFIDF_81', 'USE_8', 'SVD_19', 'BERT_15', 'TFIDF_10', 'TFIDF_284', 'USE_41', 'TFIDF_251', 'TFIDF_49', 'USE_43', 'TFIDF_473', 'TFIDF_240', 'TFIDF_168', 'TFIDF_43', 'TFIDF_181', 'TFIDF_119', 'SVD_12', 'TFIDF_163', 'TFIDF_165', 'TFIDF_35', 'TFIDF_92', 'TFIDF_64', 'TFIDF_0', 'TFIDF_338', 'TFIDF_38', 'TFIDF_102', 'TFIDF_39', 'TFIDF_80', 'TFIDF_421', 'TFIDF_228', 'TFIDF_466', 'TFIDF_501', 'TFIDF_86', 'TFIDF_339', 'TFIDF_218', 'TFIDF_404', 'TFIDF_198', 'TFIDF_118', 'TFIDF_153', 'TFIDF_419', 'TFIDF_356', 'USE_38', 'TFIDF_440', 'TFIDF_343', 'TFIDF_484', 'TFIDF_411', 'TFIDF_32', 'USE_19', 'TFIDF_348', 'TFIDF_30', 'USE_46', 'TFIDF_471', 'USE_42', 'USE_33', 'TFIDF_413', 'TFIDF_409', 'TFIDF_111', 'TFIDF_199', 'TFIDF_370', 'TFIDF_117', 'TFIDF_247', 'TFIDF_164', 'TFIDF_264', 'TFIDF_369', 'TFIDF_205', 'TFIDF_363', 'TFIDF_162', 'TFIDF_482', 'USE_34', 'TFIDF_5', 'TFIDF_121', 'TFIDF_298', 'TFIDF_454', 'TFIDF_242', 'USE_48', 'TFIDF_311', 'TFIDF_282', 'TFIDF_241', 'TFIDF_397', 'TFIDF_486', 'TFIDF_511', 'TFIDF_427', 'TFIDF_360', 'TFIDF_137', 'TFIDF_295', 'TFIDF_91', 'TFIDF_45', 'TFIDF_148', 'TFIDF_498', 'TFIDF_250', 'TFIDF_63', 'TFIDF_291', 'TFIDF_191', 'TFIDF_286', 'TFIDF_54', 'TFIDF_314', 'SVD_7', 'TFIDF_461', 'TFIDF_439', 'TFIDF_326', 'TFIDF_459', 'TFIDF_248', 'TFIDF_176', 'TFIDF_154', 'TFIDF_72', 'TFIDF_288', 'TFIDF_336', 'SVD_11', 'TFIDF_455', 'TFIDF_53', 'TFIDF_236', 'TFIDF_350', 'TFIDF_126', 'TFIDF_494', 'TFIDF_243', 'BERT_8', 'TFIDF_462', 'BERT_6', 'BERT_11', 'TFIDF_319', 'minimum_nights', 'TFIDF_161', 'TFIDF_217', 'is_skytree', 'USE_12', 'TFIDF_359', 'TFIDF_268', 'TFIDF_129', 'TFIDF_116', 'TFIDF_249', 'TFIDF_307', 'USE_18', 'TFIDF_436', 'TFIDF_479', 'TFIDF_52', 'TFIDF_100', 'TFIDF_365', 'TFIDF_262', 'BERT_4', 'TFIDF_513', 'TFIDF_76', 'USE_11', 'TFIDF_4', 'TFIDF_524', 'TFIDF_366', 'TFIDF_502', 'TFIDF_211', 'TFIDF_169', 'TFIDF_70', 'TFIDF_353', 'TFIDF_46', 'TFIDF_415', 'BERT_2', 'TFIDF_96', 'TFIDF_170', 'TFIDF_27', 'TFIDF_171', 'TFIDF_254', 'TFIDF_7', 'TFIDF_364', 'TFIDF_3', 'TFIDF_224', 'TFIDF_297', 'TFIDF_444', 'TFIDF_235', 'TFIDF_90', 'TFIDF_394', 'TFIDF_327', 'TFIDF_213', 'TFIDF_214', 'TFIDF_95', 'TFIDF_281', 'TFIDF_259', 'TFIDF_390', 'TFIDF_467', 'TFIDF_189', 'TFIDF_523', 'TFIDF_414', 'TFIDF_58', 'TFIDF_113', 'TFIDF_216', 'TFIDF_8', 'USE_9', 'TFIDF_22', 'TFIDF_396', 'TFIDF_352', 'TFIDF_226', 'BERT_17', 'BERT_5', 'TFIDF_238', 'TFIDF_183', 'BERT_19', 'BERT_7', 'TFIDF_474', 'SVD_3', 'TFIDF_489', 'TFIDF_492', 'TFIDF_74', 'TFIDF_458', 'TFIDF_508', 'TFIDF_204', 'TFIDF_418', 'TFIDF_212', 'TFIDF_434', 'TFIDF_500', 'SVD_8', 'TFIDF_362', 'TFIDF_50', 'TFIDF_313', 'TFIDF_227', 'TFIDF_20', 'TFIDF_472', 'TFIDF_422', 'TFIDF_133', 'TFIDF_21', 'TFIDF_437', 'TFIDF_445', 'TFIDF_14', 'TFIDF_167', 'TFIDF_255', 'TFIDF_158', 'TFIDF_412', 'TFIDF_180', 'TFIDF_194', 'TFIDF_342', 'TFIDF_490', 'TFIDF_517', 'TFIDF_293', 'TFIDF_410', 'is_min', 'TFIDF_304', 'TFIDF_185', 'TFIDF_468', 'TFIDF_41', 'TFIDF_231', 'dist_under_001_count', 'TFIDF_425', 'USE_45', 'TFIDF_157', 'TFIDF_144', 'TFIDF_75', 'TFIDF_193', 
'TFIDF_503', 'USE_40', 'TFIDF_428', 'TFIDF_256', 'TFIDF_42', 'BERT_1', 'SVD_15', 'USE_3', 'TFIDF_367', 'TFIDF_519', 'TFIDF_108', 'TFIDF_239', 'TFIDF_452', 'TFIDF_62', 'TFIDF_275', 'TFIDF_186', 'TFIDF_206', 'TFIDF_120', 'TFIDF_388', 'TFIDF_376', 'TFIDF_78', 'TFIDF_505', 'TFIDF_375', 'TFIDF_260', 'TFIDF_442', 'TFIDF_83', 'is_sale', 'TFIDF_51', 'SVD_9', 'TFIDF_521', 'TFIDF_136', 'TFIDF_37', 'TFIDF_82', 'TFIDF_207', 'TFIDF_61', 'TFIDF_485', 'TFIDF_98', 'longitude', 'TFIDF_147', 'TFIDF_292', 'TFIDF_94', 'TFIDF_139', 'USE_22', 'TFIDF_202', 'TFIDF_496', 'BERT_3', 'reviews_per_month', 'TFIDF_497', 'TFIDF_483', 'USE_5', 'TFIDF_187', 'TFIDF_506', 'SVD_16', 'TFIDF_79', 'TFIDF_493', 'TFIDF_177', 'TFIDF_223', 'TFIDF_300', 'TFIDF_478', 'TFIDF_1', 'BERT_16', 'TFIDF_345', 'is_wifi', 'TFIDF_306', 'TFIDF_289', 'review_month', 'TFIDF_317', 'TFIDF_460', 'TFIDF_267', 'TFIDF_273', 'TFIDF_18', 'TFIDF_310', 'TFIDF_522', 'TFIDF_26', 'TFIDF_299', 'USE_25', 'TFIDF_65', 'TFIDF_463', 'TFIDF_77', 'TFIDF_225', 'TFIDF_448', 'centroid_dist', 'SVD_5', 'TFIDF_196', 'TFIDF_303', 'SVD_17', 'TFIDF_232', 'TFIDF_220', 'TFIDF_296', 'TFIDF_449', 'TFIDF_263', 'TFIDF_464', 'TFIDF_272', 'TFIDF_104', 'TFIDF_354', 'TFIDF_182', 'TFIDF_40', 'TFIDF_328', 'BERT_18', 'TFIDF_287', 'TFIDF_309', 'TFIDF_266', 'TFIDF_368', 'TFIDF_374', 'TFIDF_88', 'TFIDF_209', 'TFIDF_337', 'TFIDF_128', 'TFIDF_357', 'TFIDF_384', 'USE_15', 'TFIDF_382', 'TFIDF_457', 'TFIDF_446', 'TFIDF_371', 'TFIDF_112', 'USE_26', 'TFIDF_258', 'TFIDF_308', 'USE_0', 'USE_39', 'TFIDF_134', 'TFIDF_285', 'TFIDF_107', 'SVD_14', 'TFIDF_109', 'TFIDF_203', 'TFIDF_330', 'TFIDF_402', 'TFIDF_244', 'TFIDF_130', 'TFIDF_150', 'TFIDF_201', 'TFIDF_221', 'TFIDF_395', 'TFIDF_387', 'TFIDF_332', 'USE_47', 'TFIDF_344', 'USE_49', 'TFIDF_125', 'TFIDF_347', 'TFIDF_451', 'TFIDF_190', 'TFIDF_234', 'TFIDF_115', 'TFIDF_315', 'TFIDF_443', 'TFIDF_140', 'TFIDF_379', 'TFIDF_340', 'TFIDF_495', 'SVD_2', 'TFIDF_401', 'TFIDF_331', 'TFIDF_416', 'TFIDF_333', 'TFIDF_346', 'TFIDF_33', 'TFIDF_97', 'TFIDF_135', 'TFIDF_290', 'TFIDF_405', 'TFIDF_105', 'TFIDF_253', 'TFIDF_114', 'TFIDF_145', 'TFIDF_301', 'TFIDF_73', 'TFIDF_407', 'TFIDF_93', 'TFIDF_358', 'TFIDF_279', 'TFIDF_514', 'USE_32', 'USE_23', 'TFIDF_274', 'TFIDF_152', 'word_count', 'TFIDF_380', 'USE_31', 'SVD_4', 'TFIDF_23', 'TFIDF_351', 'USE_29', 'TFIDF_433', 'TFIDF_55', 'TFIDF_85', 'TFIDF_391', 'TFIDF_16', 'TFIDF_302', 'TFIDF_420', 'TFIDF_67', 'USE_37', 'TFIDF_323', 'TFIDF_99', 'TFIDF_509', 'TFIDF_29', 'TFIDF_276', 'TFIDF_512', 'TFIDF_132', 'TFIDF_215', 'TFIDF_324', 'last_review_month', 'TFIDF_438', 'TFIDF_470', 'TFIDF_149', 'TFIDF_173', 'TFIDF_355', 'SVD_0', 'USE_13', 'TFIDF_403', 'TFIDF_34', 'TFIDF_383', 'TFIDF_398', 'BERT_14', 'TFIDF_507', 'TFIDF_208', 'TFIDF_110', 'TFIDF_408', 'availability_365', 'TFIDF_318', 'TFIDF_499', 'TFIDF_469', 'USE_2', 'TFIDF_386', 'TFIDF_15', 'TFIDF_9', 'TFIDF_11', 'USE_10', 'TFIDF_57', 'TFIDF_56', 'USE_21', 'TFIDF_233', 'TFIDF_400', 'is_star', 'TFIDF_60', 'TFIDF_271', 'BERT_9', 'TFIDF_69', 'TFIDF_172', 'TFIDF_278', 'USE_6', 'TFIDF_373', 'TFIDF_389', 'TFIDF_261', 'TFIDF_122', 'TFIDF_321', 'TFIDF_325', 'TFIDF_334', 'TFIDF_195', 'TFIDF_31', 'TFIDF_230', 'TFIDF_377', 'TFIDF_84', 'TFIDF_450', 'TFIDF_361', 'TFIDF_477', 'TFIDF_143', 'TFIDF_491', 'SVD_18', 'TFIDF_156', 'TFIDF_89', 'TFIDF_131', 'USE_30', 'TFIDF_124', 'TFIDF_179', 'dist_under_01_count', 'distance', 'TFIDF_166', 'TFIDF_252', 'USE_7', 'TFIDF_378', 'SVD_10', 'TFIDF_66', 'TFIDF_515', 'TFIDF_476', 'TFIDF_417', 'TFIDF_312', 'name_len', 'TFIDF_17', 'TFIDF_294', 'TFIDF_430', 'USE_16', 
'TFIDF_155', 'TFIDF_184', 'TFIDF_141', 'TFIDF_103', 'TFIDF_59', 'TFIDF_424', 'TFIDF_441', 'TFIDF_480', 'TFIDF_372', 'TFIDF_475', 'USE_17', 'TFIDF_192', 'is_free', 'TFIDF_178', 'TFIDF_432', 'TFIDF_25', 'TFIDF_481', 'TFIDF_2', 'USE_35', 'TFIDF_48', 'USE_36', 'TFIDF_87', 'TFIDF_28', 'TFIDF_269', 'TFIDF_520', 'USE_24', 'TFIDF_101', 'TFIDF_335', 'TFIDF_381', 'TFIDF_399', 'TFIDF_142', 'TFIDF_487', 'TFIDF_159', 'number_of_reviews', 'last_review_year', 'TFIDF_453', 'TFIDF_188', 'TFIDF_222', 'TFIDF_237', 'TFIDF_151', 'USE_44', 'TFIDF_197', 'TFIDF_47', 'TFIDF_456', 'TFIDF_13', 'BERT_10', 'USE_28', 'TFIDF_270', 'TFIDF_6', 'TFIDF_146', 'TFIDF_341', 'TFIDF_123', 'TFIDF_106', 'TFIDF_385', 'TFIDF_138', 'SVD_13', 'TFIDF_71', 'TFIDF_435', 'SVD_6', 'TFIDF_431', 'TFIDF_510', 'TFIDF_423', 'TFIDF_320', 'TFIDF_36', 'TFIDF_305', 'TFIDF_329', 'TFIDF_322', 'TFIDF_426']
def evaluation(true, pred):
    score = np.sqrt(mean_squared_error(true, pred))
    return score
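Since the model is trained on y_log, this RMSE is computed in log space, so it behaves like an RMSLE on the raw price. A tiny illustration with made-up numbers:

y_true = np.array([10000.0, 8000.0])   # hypothetical prices
y_hat = np.array([9000.0, 8800.0])     # hypothetical predictions
print(evaluation(np.log(y_true), np.log(y_hat)))                 # RMSE on log prices
print(np.sqrt(np.mean((np.log(y_true) - np.log(y_hat)) ** 2)))   # the same quantity, written out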
# Label encoding
categoricals = ['room_type', 'neighbourhood', 'station_name', 'gmm']
features = list(set(features) | set(categoricals))
for c in categoricals:
    le = LabelEncoder()
    le.fit(pd.concat([train_df, test_df])[c].astype(str))
    train_df[c] = le.transform(train_df[c].astype(str))
    test_df[c] = le.transform(test_df[c].astype(str))
# Count encoding
count_enc = ['host_id', 'room_type', 'neighbourhood', 'station_name', 'gmm']
count_features = []
for c in count_enc:
    _count = pd.concat([train_df, test_df])[c].astype(str).value_counts().to_dict()
    train_df[f'{c}_count'] = train_df[c].astype(str).map(_count)
    test_df[f'{c}_count'] = test_df[c].astype(str).map(_count)
    count_features.append(f'{c}_count')
features = list(set(features) | set(count_features))
params = {
    'n_estimators': 5000,
    'boosting_type': 'gbdt',
    'metric': 'regression',   # LightGBM alias for the l2 metric
    'objective': 'rmse',      # alias for the l2 / regression objective
    'n_jobs': -1,
    'seed': Config.RANDOM_STATE,
    'learning_rate': 0.01,
}
# Target encoding columns
target_enc = ['neighbourhood', 'station_name', 'gmm']
target_enc_key = ['y_log']
oof_preds = np.zeros(len(train_df))
y_pred = np.zeros(len(test_df))
models = []
cv_scores = {}
skf = StratifiedGroupKFold(n_splits=Config.N_FOLD, random_state=Config.RANDOM_STATE, shuffle=True)
for fold, (train_index, test_index) in enumerate(skf.split(train_df[features], train_df['station_name'], train_df['host_id'])):
    print(f'====== fold {fold} ======')
    # split into training and validation folds
    x_train, x_val = train_df.copy().iloc[train_index][features], train_df.copy().iloc[test_index][features]
    y_train, y_val = train_df.iloc[train_index][target], train_df.iloc[test_index][target]
    test = test_df[features]
    # Target encoding
    if len(target_enc) > 0:
        for t in target_enc_key:
            for c in target_enc:
                x_train[f'{c}_target_enc_by_{t}'] = train_df.iloc[train_index][c].map(train_df.iloc[test_index].groupby(c)[t].mean().to_dict())
                x_val[f'{c}_target_enc_by_{t}'] = train_df.iloc[test_index][c].map(train_df.groupby(c)[t].mean().to_dict())
                test[f'{c}_target_enc_by_{t}'] = test_df[c].map(train_df.groupby(c)[t].mean().to_dict())
    train_features = x_train.columns.to_list()
    # create Datasets
    train_set = lgb.Dataset(x_train, y_train, categorical_feature=categoricals, free_raw_data=False)
    val_set = lgb.Dataset(x_val, y_val, categorical_feature=categoricals, free_raw_data=False)
    # train
    model = lgb.train(params, train_set, valid_sets=[train_set, val_set], verbose_eval=100, early_stopping_rounds=100)  # , feval=rmsle_eval
    models.append(model)
    fold_pred = model.predict(x_val)
    score = evaluation(y_val, fold_pred)
    cv_scores[f'cv{fold}'] = score
    oof_preds[test_index] = fold_pred
    y_pred += model.predict(test) / Config.N_FOLD
    print(f'cv score is {score}')
oof_score = evaluation(train_df[target], oof_preds)
print(f'OOF score is {oof_score}')
====== fold 0 ====== [LightGBM] [Warning] Auto-choosing col-wise multi-threading, the overhead of testing was 0.020747 seconds. You can set `force_col_wise=true` to remove the overhead. [LightGBM] [Info] Total Bins 37060 [LightGBM] [Info] Number of data points in the train set: 8084, number of used features: 469 [LightGBM] [Info] Start training from score 9.485144 Training until validation scores don't improve for 100 rounds [100] training's l2: 0.488048 valid_1's l2: 0.698058 [200] training's l2: 0.355449 valid_1's l2: 0.640509 [300] training's l2: 0.286456 valid_1's l2: 0.620083 [400] training's l2: 0.23974 valid_1's l2: 0.607584 [500] training's l2: 0.206211 valid_1's l2: 0.598706 [600] training's l2: 0.180307 valid_1's l2: 0.593431 [700] training's l2: 0.159994 valid_1's l2: 0.589344 [800] training's l2: 0.14347 valid_1's l2: 0.585377 [900] training's l2: 0.129461 valid_1's l2: 0.583878 [1000] training's l2: 0.117605 valid_1's l2: 0.581992 [1100] training's l2: 0.107127 valid_1's l2: 0.579298 [1200] training's l2: 0.098341 valid_1's l2: 0.577533 [1300] training's l2: 0.0905596 valid_1's l2: 0.575875 Early stopping, best iteration is: [1282] training's l2: 0.0918336 valid_1's l2: 0.575756 cv score is 0.7587861164409755 ====== fold 1 ====== [LightGBM] [Warning] Auto-choosing col-wise multi-threading, the overhead of testing was 0.030324 seconds. You can set `force_col_wise=true` to remove the overhead. [LightGBM] [Info] Total Bins 37074 [LightGBM] [Info] Number of data points in the train set: 8061, number of used features: 472 [LightGBM] [Info] Start training from score 9.433627 Training until validation scores don't improve for 100 rounds [100] training's l2: 0.490696 valid_1's l2: 0.723275 [200] training's l2: 0.348588 valid_1's l2: 0.677438 [300] training's l2: 0.276958 valid_1's l2: 0.661627 [400] training's l2: 0.23099 valid_1's l2: 0.659778 [500] training's l2: 0.1973 valid_1's l2: 0.657879 [600] training's l2: 0.172069 valid_1's l2: 0.654605 [700] training's l2: 0.152057 valid_1's l2: 0.650254 [800] training's l2: 0.136132 valid_1's l2: 0.647464 [900] training's l2: 0.122567 valid_1's l2: 0.645914 [1000] training's l2: 0.111722 valid_1's l2: 0.645856 Early stopping, best iteration is: [950] training's l2: 0.116976 valid_1's l2: 0.645199 cv score is 0.8032430628385007 ====== fold 2 ====== [LightGBM] [Warning] Auto-choosing col-wise multi-threading, the overhead of testing was 0.020039 seconds. You can set `force_col_wise=true` to remove the overhead. 
[LightGBM] [Info] Total Bins 36694 [LightGBM] [Info] Number of data points in the train set: 7858, number of used features: 458 [LightGBM] [Info] Start training from score 9.480055 Training until validation scores don't improve for 100 rounds [100] training's l2: 0.501974 valid_1's l2: 0.641378 [200] training's l2: 0.358455 valid_1's l2: 0.621378 [300] training's l2: 0.287479 valid_1's l2: 0.609193 [400] training's l2: 0.240488 valid_1's l2: 0.599782 [500] training's l2: 0.206102 valid_1's l2: 0.592426 [600] training's l2: 0.179476 valid_1's l2: 0.586709 [700] training's l2: 0.158619 valid_1's l2: 0.583386 [800] training's l2: 0.141267 valid_1's l2: 0.581166 [900] training's l2: 0.127166 valid_1's l2: 0.579021 [1000] training's l2: 0.115052 valid_1's l2: 0.577815 [1100] training's l2: 0.104588 valid_1's l2: 0.576488 [1200] training's l2: 0.0956801 valid_1's l2: 0.575222 [1300] training's l2: 0.0879042 valid_1's l2: 0.574342 [1400] training's l2: 0.0809795 valid_1's l2: 0.573692 [1500] training's l2: 0.0748301 valid_1's l2: 0.572872 [1600] training's l2: 0.0692713 valid_1's l2: 0.572203 [1700] training's l2: 0.0641214 valid_1's l2: 0.571866 [1800] training's l2: 0.0596875 valid_1's l2: 0.571812 [1900] training's l2: 0.0557035 valid_1's l2: 0.571907 Early stopping, best iteration is: [1843] training's l2: 0.0579282 valid_1's l2: 0.571618 cv score is 0.7560543840213751 ====== fold 3 ====== [LightGBM] [Warning] Auto-choosing col-wise multi-threading, the overhead of testing was 0.029729 seconds. You can set `force_col_wise=true` to remove the overhead. [LightGBM] [Info] Total Bins 37088 [LightGBM] [Info] Number of data points in the train set: 8044, number of used features: 463 [LightGBM] [Info] Start training from score 9.460043 Training until validation scores don't improve for 100 rounds [100] training's l2: 0.495351 valid_1's l2: 0.590806 [200] training's l2: 0.357956 valid_1's l2: 0.560963 [300] training's l2: 0.28442 valid_1's l2: 0.557662 [400] training's l2: 0.237224 valid_1's l2: 0.55367 [500] training's l2: 0.20328 valid_1's l2: 0.552406 [600] training's l2: 0.177356 valid_1's l2: 0.548525 [700] training's l2: 0.156841 valid_1's l2: 0.54736 [800] training's l2: 0.140504 valid_1's l2: 0.546186 [900] training's l2: 0.126994 valid_1's l2: 0.54589 [1000] training's l2: 0.115542 valid_1's l2: 0.54398 [1100] training's l2: 0.105512 valid_1's l2: 0.54394 Early stopping, best iteration is: [1073] training's l2: 0.108124 valid_1's l2: 0.54373 cv score is 0.737380513342473 ====== fold 4 ====== [LightGBM] [Warning] Auto-choosing col-wise multi-threading, the overhead of testing was 0.029168 seconds. You can set `force_col_wise=true` to remove the overhead. 
[LightGBM] [Info] Total Bins 36683 [LightGBM] [Info] Number of data points in the train set: 7913, number of used features: 469 [LightGBM] [Info] Start training from score 9.456076 Training until validation scores don't improve for 100 rounds [100] training's l2: 0.459676 valid_1's l2: 1.13661 [200] training's l2: 0.335324 valid_1's l2: 1.11005 [300] training's l2: 0.269613 valid_1's l2: 1.09511 [400] training's l2: 0.225178 valid_1's l2: 1.08983 [500] training's l2: 0.192335 valid_1's l2: 1.08412 [600] training's l2: 0.167498 valid_1's l2: 1.0799 [700] training's l2: 0.14797 valid_1's l2: 1.07581 [800] training's l2: 0.132492 valid_1's l2: 1.07327 [900] training's l2: 0.119821 valid_1's l2: 1.07109 Early stopping, best iteration is: [894] training's l2: 0.120568 valid_1's l2: 1.07088 cv score is 1.0348309838938978 OOF score is 0.8276368501192828
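The per-fold scores are kept in cv_scores but never summarized; a short addition (not in the original run) prints the mean and spread across folds:

print(cv_scores)
print(f"mean cv: {np.mean(list(cv_scores.values())):.4f} +/- {np.std(list(cv_scores.values())):.4f}")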
"""df_importance = None
for i, model in enumerate(models):
if df_importance is None:
_df = pd.DataFrame([model.feature_importance(importance_type='gain'), train_features]).T
_df.columns = [f'model_{i}_gain', 'feature']
df_importance = _df
else:
_df = pd.DataFrame([model.feature_importance(importance_type='gain'), train_features]).T
_df.columns = [f'model_{i}_gain', 'feature']
df_importance = df_importance.merge(_df, how='outer', on='feature')
df_imp = df_importance
df_imp['mean'] = df_imp[[f'model_{i}_gain' for i in range(len(models))]].mean(axis=1)
order = df_imp.sort_values('mean', ascending=False)['feature'].tolist()
df_imp = pd.melt(df_imp, id_vars=['feature'], value_vars=[f'model_{i}_gain' for i in range(len(models))])
df_imp['value'] = df_imp['value'].astype(float)
fig, ax = plt.subplots(figsize=(len(df_imp['feature'].drop_duplicates()) * .4, 5))
sns.boxenplot(x="feature", y="value", data=df_imp, order=order)
ax.tick_params(axis='x', rotation=90)
ax.set_title('feature importance')
plt.show()"""
fig, axes = plt.subplots(1, 2, figsize=(15, 5))
axes = axes.reshape(-1)
sns.scatterplot(train_df['y_log'], oof_preds, ax=axes[0], alpha=0.5)
sns.distplot(train_df['y_log'], ax=axes[1], label='true')
sns.distplot(oof_preds, ax=axes[1], label='pred')
plt.legend()
test_df['y'] = np.exp(y_pred)
test_df[['id', 'y']].to_csv('../output/sub15.csv', index=False)
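Optionally (a small sanity check that was not part of the original run), the written file can be compared with the provided sample submission before uploading:

sub_check = pd.read_csv('../output/sub15.csv')
print(sub_check.shape, submission_df.shape)   # row counts should match
print(sub_check['y'].describe())              # predicted prices should be positive and plausibly scaled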