LightGBM CV: 0.82764 / LB: 0.81660

import warnings
import os
from pathlib import Path

import pandas as pd
import lightgbm as lgb
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import StratifiedKFold, StratifiedGroupKFold
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
from sklearn import mixture
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD, PCA

import tensorflow as tf
import tensorflow_text
import tensorflow_hub as hub
from tensorflow import keras

from numba import cuda

import torch

import transformers
from transformers import BertTokenizer

from tqdm import tqdm

warnings.simplefilter('ignore')
pd.set_option('display.max_columns', 25)
pd.set_option('display.max_rows', 25)
ROOT_DIR = Path('../')
DATA_DIR = ROOT_DIR / Path('data')
class Config:
    N_FOLD = 5
    RANDOM_STATE = 42
train_df = pd.read_csv(DATA_DIR / Path('train_data.csv'))
test_df = pd.read_csv(DATA_DIR / Path('test_data.csv'))
station_list_df = pd.read_csv(DATA_DIR / Path('station_list.csv'))
submission_df = pd.read_csv(DATA_DIR / Path('submission.csv'))

display(train_df.shape)
display(train_df.head(5))
display(test_df.shape)
display(test_df.head(5))
display(station_list_df.head(5))
(9990, 13)
id name host_id neighbourhood latitude longitude room_type minimum_nights number_of_reviews last_review reviews_per_month availability_365 y
0 1 KiyosumiShirakawa 3min|★SkyTree★|WIFI|Max4|Tre... 242899459 Koto Ku 35.68185 139.80310 Entire home/apt 1 55 2020-04-25 2.21 173 12008
1 2 Downtown Tokyo Iriya next to Ueno 308879948 Taito Ku 35.72063 139.78536 Entire home/apt 6 72 2020-03-25 2.11 9 6667
2 3 Japan Style,Private,Affordable,4min to Sta. 300877823 Katsushika Ku 35.74723 139.82349 Entire home/apt 1 18 2020-03-23 3.46 288 9923
3 4 4 min to Shinjuku Sta. by train / 2 ppl / Wi-fi 236935461 Shibuya Ku 35.68456 139.68077 Entire home/apt 1 2 2020-04-02 1.76 87 8109
4 5 LICENSED SHINJUKU HOUSE: Heart of the action! 243408889 Shinjuku Ku 35.69840 139.70467 Entire home/apt 1 86 2020-01-30 2.00 156 100390
(4996, 12)
id name host_id neighbourhood latitude longitude room_type minimum_nights number_of_reviews last_review reviews_per_month availability_365
0 1 5-minute walk from Akasaka Sta, Superior double 184730720 Minato Ku 35.67131 139.73285 Private room 1 0 NaN NaN 183
1 2 7 min Sta.-Center of IKEBUKURO Cozy Room#503 20993205 Toshima Ku 35.73014 139.71739 Entire home/apt 2 21 2020-04-16 1.94 337
2 3 Designer'sApt 1min sta☆Shinjuku 7min☆Shibuya 4min 322521715 Setagaya Ku 35.66193 139.66540 Entire home/apt 1 14 2020-02-12 0.82 240
3 4 Komagome Station 2 minutes on foot 234477095 Toshima Ku 35.73603 139.74794 Entire home/apt 1 16 2020-02-17 1.19 0
4 5 Monthly/Metro1min/JR5min/Ueno,Asakusa,Akihabara 145453833 Taito Ku 35.72126 139.78320 Entire home/apt 30 2 2019-07-21 0.19 164
station_name longitude latitude
0 白丸 139.114861 35.811735
1 古里 139.152102 35.816247
2 川井 139.164290 35.813697
3 御嶽 139.182589 35.801468
4 沢井 139.193324 35.805940
train_df['y_log'] = train_df['y'].apply(np.log)
for room_type, room_type_df in train_df.groupby('room_type'):
    sns.distplot(room_type_df['y_log'], label=room_type)
plt.legend()
station_pos_array = []
for _, station in station_list_df.iterrows():
    b = np.array([station['latitude'], station['longitude']])
    station_pos_array.append((station['station_name'], b))

Nearest station

nearest = []
for idx, row in tqdm(train_df.iterrows(), total=len(train_df)):
    a = np.array([row['latitude'], row['longitude']])
    dist_list = []
    for station_name, b in station_pos_array:
        dist = np.linalg.norm(a - b)
        dist_list.append({
            'station_name':station_name,
            'distance':dist
        })
        
    station_dist_all_df = pd.DataFrame(dist_list)
    station_dist_df = station_dist_all_df.sort_values('distance').iloc[0]
    nearest_station_name = station_dist_df['station_name']
    distance = station_dist_df['distance']
    
    dist_under_001_count = (station_dist_all_df['distance'] < 0.01).sum()
    dist_under_005_count = (station_dist_all_df['distance'] < 0.05).sum()
    dist_under_01_count = (station_dist_all_df['distance'] < 0.1).sum()
    
    nearest.append({
        'id':row['id'],
        'station_name': nearest_station_name,
        'distance': distance,
        'dist_under_001_count': dist_under_001_count,
        'dist_under_005_count': dist_under_005_count,
        'dist_under_01_count': dist_under_01_count,
    })
train_df = train_df.merge(pd.DataFrame(nearest), how='left', on='id')
100%|█████████████████████████████████████████████████████████████████████████████| 9990/9990 [00:55<00:00, 181.03it/s]
nearest = []
for idx, row in tqdm(test_df.iterrows(), total=len(test_df)):
    a = np.array([row['latitude'], row['longitude']])
    dist_list = []
    for station_name, b in station_pos_array:
        dist = np.linalg.norm(a - b)
        dist_list.append({
            'station_name':station_name,
            'distance':dist
        })
        
    station_dist_all_df = pd.DataFrame(dist_list)
    station_dist_df = station_dist_all_df.sort_values('distance').iloc[0]
    nearest_station_name = station_dist_df['station_name']
    distance = station_dist_df['distance']
    
    dist_under_001_count = (station_dist_all_df['distance'] < 0.01).sum()
    dist_under_005_count = (station_dist_all_df['distance'] < 0.05).sum()
    dist_under_01_count = (station_dist_all_df['distance'] < 0.1).sum()
    
    nearest.append({
        'id':row['id'],
        'station_name': nearest_station_name,
        'distance': distance,
        'dist_under_001_count': dist_under_001_count,
        'dist_under_005_count': dist_under_005_count,
        'dist_under_01_count': dist_under_01_count,
    })
test_df = test_df.merge(pd.DataFrame(nearest), how='left', on='id')
100%|█████████████████████████████████████████████████████████████████████████████| 4996/4996 [00:27<00:00, 179.75it/s]
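The two blocks above are identical except for the dataframe they iterate over. As a reference, a faster sketch of the same nearest-station features using scipy's cKDTree (an assumption, not part of the original notebook) is shown below; distances stay in raw latitude/longitude degrees, so the 0.01 / 0.05 / 0.1 thresholds keep their meaning.

# Hedged sketch: same features as the loops above, computed with a KD-tree.
from scipy.spatial import cKDTree

station_xy = station_list_df[['latitude', 'longitude']].to_numpy()
tree = cKDTree(station_xy)

def add_station_features(df):
    xy = df[['latitude', 'longitude']].to_numpy()
    dist, idx = tree.query(xy, k=1)  # nearest station per listing
    df['station_name'] = station_list_df['station_name'].to_numpy()[idx]
    df['distance'] = dist
    for r, col in [(0.01, 'dist_under_001_count'),
                   (0.05, 'dist_under_005_count'),
                   (0.1, 'dist_under_01_count')]:
        df[col] = [len(tree.query_ball_point(p, r)) for p in xy]
    return df

# train_df = add_station_features(train_df); test_df = add_station_features(test_df)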

Name features

train_df['name'] = train_df['name'].apply(lambda x : ' '.join(x.split('/')))
test_df['name'] = test_df['name'].apply(lambda x : ' '.join(x.split('/')))
train_df['name'] = train_df['name'].apply(lambda x : ' '.join(x.split('|')))
test_df['name'] = test_df['name'].apply(lambda x : ' '.join(x.split('|')))
train_df['word_count'] = train_df['name'].apply(lambda x : len(x.split()))
test_df['word_count'] = test_df['name'].apply(lambda x : len(x.split()))

train_df['name_len'] = train_df['name'].apply(lambda x : len(x))
test_df['name_len'] = test_df['name'].apply(lambda x : len(x))
# Words that look meaningful
train_df['is_wifi'] = train_df['name'].apply(lambda x : 'wi-fi' in x.lower() or 'wifi' in x.lower())
test_df['is_wifi'] = test_df['name'].apply(lambda x : 'wi-fi' in x.lower() or 'wifi' in x.lower())

train_df['is_free'] = train_df['name'].apply(lambda x : 'free' in x.lower())
test_df['is_free'] = test_df['name'].apply(lambda x : 'free' in x.lower())

train_df['is_min'] = train_df['name'].apply(lambda x : 'min ' in x.lower())
test_df['is_min'] = test_df['name'].apply(lambda x : 'min ' in x.lower())

train_df['is_skytree'] = train_df['name'].apply(lambda x : 'skytree' in x.lower())
test_df['is_skytree'] = test_df['name'].apply(lambda x : 'skytree' in x.lower())

train_df['is_sale'] = train_df['name'].apply(lambda x : 'sale' in x.lower())
test_df['is_sale'] = test_df['name'].apply(lambda x : 'sale' in x.lower())

train_df['is_star'] = train_df['name'].apply(lambda x : '★' in x.lower() or '☆' in x.lower())
test_df['is_star'] = test_df['name'].apply(lambda x : '★' in x.lower() or '☆' in x.lower())
TF-IDF
all_df = pd.concat([train_df, test_df])

tv = TfidfVectorizer(min_df=20)
features = tv.fit_transform(all_df["name"])
train_df = pd.concat([train_df, pd.DataFrame(features.toarray()[:len(train_df)]).add_prefix('TFIDF_')], axis=1)
test_df = pd.concat([test_df, pd.DataFrame(features.toarray()[len(train_df):]).add_prefix('TFIDF_')], axis=1)
svd = TruncatedSVD(n_components=20, random_state=Config.RANDOM_STATE)
svd_features = svd.fit_transform(features.toarray())

train_df = pd.concat([train_df, pd.DataFrame(svd_features[:len(train_df)]).add_prefix('SVD_')], axis=1)
test_df = pd.concat([test_df, pd.DataFrame(svd_features[len(train_df):]).add_prefix('SVD_')], axis=1)
USE (Universal Sentence Encoder)
tqdm.pandas()

from sklearn.decomposition import NMF

all_df = pd.concat([train_df, test_df])

embedder = hub.load("https://tfhub.dev/google/universal-sentence-encoder-multilingual/3")

features = np.stack(all_df["name"].fillna("").progress_apply(lambda x: embedder(x).numpy().reshape(-1)).values)
100%|████████████████████████████████████████████████████████████████████████████| 14986/14986 [04:11<00:00, 59.66it/s]
svd = TruncatedSVD(n_components=50, random_state=Config.RANDOM_STATE)
svd_features = svd.fit_transform(features)

train_df = pd.concat([train_df, pd.DataFrame(svd_features[:len(train_df)]).add_prefix('USE_')], axis=1)
test_df = pd.concat([test_df, pd.DataFrame(svd_features[len(train_df):]).add_prefix('USE_')], axis=1)
del embedder
cuda.select_device(0)
cuda.close()
keras.backend.clear_session()
BERT
class BertSequenceVectorizer:
    def __init__(self, model_name="bert-base-uncased", max_len=128):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model_name = model_name
        self.tokenizer = BertTokenizer.from_pretrained(self.model_name)
        self.bert_model = transformers.BertModel.from_pretrained(self.model_name)
        self.bert_model = self.bert_model.to(self.device)
        self.max_len = max_len

    def vectorize(self, sentence: str) -> np.array:
        inp = self.tokenizer.encode(sentence)
        len_inp = len(inp)

        if len_inp >= self.max_len:
            inputs = inp[:self.max_len]
            masks = [1] * self.max_len
        else:
            inputs = inp + [0] * (self.max_len - len_inp)
            masks = [1] * len_inp + [0] * (self.max_len - len_inp)

        inputs_tensor = torch.tensor([inputs], dtype=torch.long).to(self.device)
        masks_tensor = torch.tensor([masks], dtype=torch.long).to(self.device)

        bert_out = self.bert_model(inputs_tensor, masks_tensor)
        seq_out, pooled_out = bert_out['last_hidden_state'], bert_out['pooler_output']

        if torch.cuda.is_available():    
            return seq_out[0][0].cpu().detach().numpy()  # index 0 is the [CLS] token: a 768-dim sentence feature
        else:
            return seq_out[0][0].detach().numpy()
%%time
all_df = pd.concat([train_df, test_df])
BSV = BertSequenceVectorizer(model_name="bert-base-multilingual-uncased", max_len=128)
features = np.stack(all_df["name"].fillna("").map(lambda x: BSV.vectorize(x).reshape(-1)).values)
Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Wall time: 3min 24s
pca = PCA(n_components=20)
pca_features = pca.fit_transform(features)
train_df = pd.concat([train_df, pd.DataFrame(pca_features[:len(train_df)]).add_prefix('BERT_')], axis=1)
test_df = pd.concat([test_df, pd.DataFrame(pca_features[len(train_df):]).add_prefix('BERT_')], axis=1)
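Calling vectorize() one sentence at a time is what makes this cell slow (about 3.5 minutes above). A hedged sketch of a batched alternative, running the tokenizer on chunks of names and the model under torch.no_grad(), is shown below; the helper name and batch size are assumptions, not part of the original notebook.

# Hedged sketch: batched [CLS] extraction instead of per-sentence vectorize().
def bert_cls_features(texts, model_name='bert-base-multilingual-uncased',
                      batch_size=64, max_len=128):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    tokenizer = BertTokenizer.from_pretrained(model_name)
    model = transformers.BertModel.from_pretrained(model_name).to(device).eval()
    chunks = []
    with torch.no_grad():
        for i in range(0, len(texts), batch_size):
            batch = tokenizer(texts[i:i + batch_size], padding=True, truncation=True,
                              max_length=max_len, return_tensors='pt').to(device)
            out = model(**batch)
            chunks.append(out.last_hidden_state[:, 0, :].cpu().numpy())  # [CLS] vectors
    return np.concatenate(chunks)

# features = bert_cls_features(all_df['name'].fillna('').tolist())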

Review features

train_df['last_review_year'] = pd.to_datetime(train_df['last_review']).dt.year
train_df['last_review_month'] = pd.to_datetime(train_df['last_review']).dt.month

test_df['last_review_year'] = pd.to_datetime(test_df['last_review']).dt.year
test_df['last_review_month'] = pd.to_datetime(test_df['last_review']).dt.month

train_df['review_month'] = train_df['number_of_reviews'] / train_df['number_of_reviews'] 
test_df['review_month'] = test_df['number_of_reviews'] / test_df['number_of_reviews'] 

Coordinates

# GMM
clf = mixture.GaussianMixture(n_components=10, covariance_type='full')
gm = clf.fit_predict(pd.concat([train_df, test_df])[['latitude','longitude']])

train_df['gmm'] = gm[:len(train_df)]
test_df['gmm'] = gm[len(train_df):]

"""gmm_proba = clf.predict_proba(pd.concat([train_df, test_df])[['latitude','longitude']])
train_df = pd.concat([train_df, pd.DataFrame(gmm_proba[:len(train_df)]).add_prefix('GMM_PROBA_')], axis=1)
test_df = pd.concat([test_df, pd.DataFrame(gmm_proba[len(train_df):]).add_prefix('GMM_PROBA_')], axis=1)"""
"gmm_proba = clf.predict_proba(pd.concat([train_df, test_df])[['latitude','longitude']])\ntrain_df = pd.concat([train_df, pd.DataFrame(gmm_proba[:len(train_df)]).add_prefix('GMM_PROBA_')], axis=1)\ntest_df = pd.concat([test_df, pd.DataFrame(gmm_proba[len(train_df):]).add_prefix('GMM_PROBA_')], axis=1)"
plt.figure(figsize=(15, 15))
sns.scatterplot(train_df['latitude'], train_df['longitude'], hue=train_df['y_log'])
## Distance from the centroid
all_df = pd.concat([train_df, test_df])
lat = all_df['latitude'].mean()
long = all_df['longitude'].mean()

centroid = (lat, long)

centroid_dist = []
for idx, row in tqdm(train_df.iterrows(), total=len(train_df)):
    a = np.array([row['latitude'], row['longitude']])
    centroid_dist.append(np.linalg.norm(a - centroid))

train_df['centroid_dist'] = centroid_dist

centroid_dist = []
for idx, row in tqdm(test_df.iterrows(), total=len(test_df)):
    a = np.array([row['latitude'], row['longitude']])
    centroid_dist.append(np.linalg.norm(a - centroid))

test_df['centroid_dist'] = centroid_dist
100%|███████████████████████████████████████████████████████████████████████████| 9990/9990 [00:00<00:00, 14796.65it/s]
100%|███████████████████████████████████████████████████████████████████████████| 4996/4996 [00:00<00:00, 12676.14it/s]
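For reference, the same centroid distance can be computed without the explicit loops (a minimal vectorized sketch, equivalent to the two cells above):

for df in (train_df, test_df):
    df['centroid_dist'] = np.linalg.norm(
        df[['latitude', 'longitude']].to_numpy() - np.array(centroid), axis=1)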

Features and target

target = 'y_log'
del_columns = ['name', 'y', 'y_log', 'room_type', 'last_review', 'neighbourhood', 'host_id', 'id', 'station_name', 'gmm']

features = list(set(train_df.columns) - set(del_columns))
print(features)
['USE_4', 'TFIDF_504', 'TFIDF_283', 'TFIDF_12', 'USE_1', 'TFIDF_229', 'SVD_1', 'TFIDF_68', 'BERT_0', 'TFIDF_429', 'TFIDF_316', 'TFIDF_277', 'TFIDF_19', 'TFIDF_280', 'TFIDF_447', 'dist_under_005_count', 'TFIDF_245', 'TFIDF_518', 'TFIDF_392', 'BERT_12', 'TFIDF_246', 'BERT_13', 'TFIDF_200', 'TFIDF_210', 'TFIDF_175', 'TFIDF_465', 'TFIDF_127', 'TFIDF_24', 'TFIDF_349', 'USE_20', 'TFIDF_257', 'TFIDF_160', 'TFIDF_219', 'TFIDF_174', 'TFIDF_406', 'TFIDF_516', 'TFIDF_488', 'latitude', 'TFIDF_265', 'USE_14', 'TFIDF_393', 'USE_27', 'TFIDF_44', 'TFIDF_81', 'USE_8', 'SVD_19', 'BERT_15', 'TFIDF_10', 'TFIDF_284', 'USE_41', 'TFIDF_251', 'TFIDF_49', 'USE_43', 'TFIDF_473', 'TFIDF_240', 'TFIDF_168', 'TFIDF_43', 'TFIDF_181', 'TFIDF_119', 'SVD_12', 'TFIDF_163', 'TFIDF_165', 'TFIDF_35', 'TFIDF_92', 'TFIDF_64', 'TFIDF_0', 'TFIDF_338', 'TFIDF_38', 'TFIDF_102', 'TFIDF_39', 'TFIDF_80', 'TFIDF_421', 'TFIDF_228', 'TFIDF_466', 'TFIDF_501', 'TFIDF_86', 'TFIDF_339', 'TFIDF_218', 'TFIDF_404', 'TFIDF_198', 'TFIDF_118', 'TFIDF_153', 'TFIDF_419', 'TFIDF_356', 'USE_38', 'TFIDF_440', 'TFIDF_343', 'TFIDF_484', 'TFIDF_411', 'TFIDF_32', 'USE_19', 'TFIDF_348', 'TFIDF_30', 'USE_46', 'TFIDF_471', 'USE_42', 'USE_33', 'TFIDF_413', 'TFIDF_409', 'TFIDF_111', 'TFIDF_199', 'TFIDF_370', 'TFIDF_117', 'TFIDF_247', 'TFIDF_164', 'TFIDF_264', 'TFIDF_369', 'TFIDF_205', 'TFIDF_363', 'TFIDF_162', 'TFIDF_482', 'USE_34', 'TFIDF_5', 'TFIDF_121', 'TFIDF_298', 'TFIDF_454', 'TFIDF_242', 'USE_48', 'TFIDF_311', 'TFIDF_282', 'TFIDF_241', 'TFIDF_397', 'TFIDF_486', 'TFIDF_511', 'TFIDF_427', 'TFIDF_360', 'TFIDF_137', 'TFIDF_295', 'TFIDF_91', 'TFIDF_45', 'TFIDF_148', 'TFIDF_498', 'TFIDF_250', 'TFIDF_63', 'TFIDF_291', 'TFIDF_191', 'TFIDF_286', 'TFIDF_54', 'TFIDF_314', 'SVD_7', 'TFIDF_461', 'TFIDF_439', 'TFIDF_326', 'TFIDF_459', 'TFIDF_248', 'TFIDF_176', 'TFIDF_154', 'TFIDF_72', 'TFIDF_288', 'TFIDF_336', 'SVD_11', 'TFIDF_455', 'TFIDF_53', 'TFIDF_236', 'TFIDF_350', 'TFIDF_126', 'TFIDF_494', 'TFIDF_243', 'BERT_8', 'TFIDF_462', 'BERT_6', 'BERT_11', 'TFIDF_319', 'minimum_nights', 'TFIDF_161', 'TFIDF_217', 'is_skytree', 'USE_12', 'TFIDF_359', 'TFIDF_268', 'TFIDF_129', 'TFIDF_116', 'TFIDF_249', 'TFIDF_307', 'USE_18', 'TFIDF_436', 'TFIDF_479', 'TFIDF_52', 'TFIDF_100', 'TFIDF_365', 'TFIDF_262', 'BERT_4', 'TFIDF_513', 'TFIDF_76', 'USE_11', 'TFIDF_4', 'TFIDF_524', 'TFIDF_366', 'TFIDF_502', 'TFIDF_211', 'TFIDF_169', 'TFIDF_70', 'TFIDF_353', 'TFIDF_46', 'TFIDF_415', 'BERT_2', 'TFIDF_96', 'TFIDF_170', 'TFIDF_27', 'TFIDF_171', 'TFIDF_254', 'TFIDF_7', 'TFIDF_364', 'TFIDF_3', 'TFIDF_224', 'TFIDF_297', 'TFIDF_444', 'TFIDF_235', 'TFIDF_90', 'TFIDF_394', 'TFIDF_327', 'TFIDF_213', 'TFIDF_214', 'TFIDF_95', 'TFIDF_281', 'TFIDF_259', 'TFIDF_390', 'TFIDF_467', 'TFIDF_189', 'TFIDF_523', 'TFIDF_414', 'TFIDF_58', 'TFIDF_113', 'TFIDF_216', 'TFIDF_8', 'USE_9', 'TFIDF_22', 'TFIDF_396', 'TFIDF_352', 'TFIDF_226', 'BERT_17', 'BERT_5', 'TFIDF_238', 'TFIDF_183', 'BERT_19', 'BERT_7', 'TFIDF_474', 'SVD_3', 'TFIDF_489', 'TFIDF_492', 'TFIDF_74', 'TFIDF_458', 'TFIDF_508', 'TFIDF_204', 'TFIDF_418', 'TFIDF_212', 'TFIDF_434', 'TFIDF_500', 'SVD_8', 'TFIDF_362', 'TFIDF_50', 'TFIDF_313', 'TFIDF_227', 'TFIDF_20', 'TFIDF_472', 'TFIDF_422', 'TFIDF_133', 'TFIDF_21', 'TFIDF_437', 'TFIDF_445', 'TFIDF_14', 'TFIDF_167', 'TFIDF_255', 'TFIDF_158', 'TFIDF_412', 'TFIDF_180', 'TFIDF_194', 'TFIDF_342', 'TFIDF_490', 'TFIDF_517', 'TFIDF_293', 'TFIDF_410', 'is_min', 'TFIDF_304', 'TFIDF_185', 'TFIDF_468', 'TFIDF_41', 'TFIDF_231', 'dist_under_001_count', 'TFIDF_425', 'USE_45', 'TFIDF_157', 'TFIDF_144', 'TFIDF_75', 'TFIDF_193', 
'TFIDF_503', 'USE_40', 'TFIDF_428', 'TFIDF_256', 'TFIDF_42', 'BERT_1', 'SVD_15', 'USE_3', 'TFIDF_367', 'TFIDF_519', 'TFIDF_108', 'TFIDF_239', 'TFIDF_452', 'TFIDF_62', 'TFIDF_275', 'TFIDF_186', 'TFIDF_206', 'TFIDF_120', 'TFIDF_388', 'TFIDF_376', 'TFIDF_78', 'TFIDF_505', 'TFIDF_375', 'TFIDF_260', 'TFIDF_442', 'TFIDF_83', 'is_sale', 'TFIDF_51', 'SVD_9', 'TFIDF_521', 'TFIDF_136', 'TFIDF_37', 'TFIDF_82', 'TFIDF_207', 'TFIDF_61', 'TFIDF_485', 'TFIDF_98', 'longitude', 'TFIDF_147', 'TFIDF_292', 'TFIDF_94', 'TFIDF_139', 'USE_22', 'TFIDF_202', 'TFIDF_496', 'BERT_3', 'reviews_per_month', 'TFIDF_497', 'TFIDF_483', 'USE_5', 'TFIDF_187', 'TFIDF_506', 'SVD_16', 'TFIDF_79', 'TFIDF_493', 'TFIDF_177', 'TFIDF_223', 'TFIDF_300', 'TFIDF_478', 'TFIDF_1', 'BERT_16', 'TFIDF_345', 'is_wifi', 'TFIDF_306', 'TFIDF_289', 'review_month', 'TFIDF_317', 'TFIDF_460', 'TFIDF_267', 'TFIDF_273', 'TFIDF_18', 'TFIDF_310', 'TFIDF_522', 'TFIDF_26', 'TFIDF_299', 'USE_25', 'TFIDF_65', 'TFIDF_463', 'TFIDF_77', 'TFIDF_225', 'TFIDF_448', 'centroid_dist', 'SVD_5', 'TFIDF_196', 'TFIDF_303', 'SVD_17', 'TFIDF_232', 'TFIDF_220', 'TFIDF_296', 'TFIDF_449', 'TFIDF_263', 'TFIDF_464', 'TFIDF_272', 'TFIDF_104', 'TFIDF_354', 'TFIDF_182', 'TFIDF_40', 'TFIDF_328', 'BERT_18', 'TFIDF_287', 'TFIDF_309', 'TFIDF_266', 'TFIDF_368', 'TFIDF_374', 'TFIDF_88', 'TFIDF_209', 'TFIDF_337', 'TFIDF_128', 'TFIDF_357', 'TFIDF_384', 'USE_15', 'TFIDF_382', 'TFIDF_457', 'TFIDF_446', 'TFIDF_371', 'TFIDF_112', 'USE_26', 'TFIDF_258', 'TFIDF_308', 'USE_0', 'USE_39', 'TFIDF_134', 'TFIDF_285', 'TFIDF_107', 'SVD_14', 'TFIDF_109', 'TFIDF_203', 'TFIDF_330', 'TFIDF_402', 'TFIDF_244', 'TFIDF_130', 'TFIDF_150', 'TFIDF_201', 'TFIDF_221', 'TFIDF_395', 'TFIDF_387', 'TFIDF_332', 'USE_47', 'TFIDF_344', 'USE_49', 'TFIDF_125', 'TFIDF_347', 'TFIDF_451', 'TFIDF_190', 'TFIDF_234', 'TFIDF_115', 'TFIDF_315', 'TFIDF_443', 'TFIDF_140', 'TFIDF_379', 'TFIDF_340', 'TFIDF_495', 'SVD_2', 'TFIDF_401', 'TFIDF_331', 'TFIDF_416', 'TFIDF_333', 'TFIDF_346', 'TFIDF_33', 'TFIDF_97', 'TFIDF_135', 'TFIDF_290', 'TFIDF_405', 'TFIDF_105', 'TFIDF_253', 'TFIDF_114', 'TFIDF_145', 'TFIDF_301', 'TFIDF_73', 'TFIDF_407', 'TFIDF_93', 'TFIDF_358', 'TFIDF_279', 'TFIDF_514', 'USE_32', 'USE_23', 'TFIDF_274', 'TFIDF_152', 'word_count', 'TFIDF_380', 'USE_31', 'SVD_4', 'TFIDF_23', 'TFIDF_351', 'USE_29', 'TFIDF_433', 'TFIDF_55', 'TFIDF_85', 'TFIDF_391', 'TFIDF_16', 'TFIDF_302', 'TFIDF_420', 'TFIDF_67', 'USE_37', 'TFIDF_323', 'TFIDF_99', 'TFIDF_509', 'TFIDF_29', 'TFIDF_276', 'TFIDF_512', 'TFIDF_132', 'TFIDF_215', 'TFIDF_324', 'last_review_month', 'TFIDF_438', 'TFIDF_470', 'TFIDF_149', 'TFIDF_173', 'TFIDF_355', 'SVD_0', 'USE_13', 'TFIDF_403', 'TFIDF_34', 'TFIDF_383', 'TFIDF_398', 'BERT_14', 'TFIDF_507', 'TFIDF_208', 'TFIDF_110', 'TFIDF_408', 'availability_365', 'TFIDF_318', 'TFIDF_499', 'TFIDF_469', 'USE_2', 'TFIDF_386', 'TFIDF_15', 'TFIDF_9', 'TFIDF_11', 'USE_10', 'TFIDF_57', 'TFIDF_56', 'USE_21', 'TFIDF_233', 'TFIDF_400', 'is_star', 'TFIDF_60', 'TFIDF_271', 'BERT_9', 'TFIDF_69', 'TFIDF_172', 'TFIDF_278', 'USE_6', 'TFIDF_373', 'TFIDF_389', 'TFIDF_261', 'TFIDF_122', 'TFIDF_321', 'TFIDF_325', 'TFIDF_334', 'TFIDF_195', 'TFIDF_31', 'TFIDF_230', 'TFIDF_377', 'TFIDF_84', 'TFIDF_450', 'TFIDF_361', 'TFIDF_477', 'TFIDF_143', 'TFIDF_491', 'SVD_18', 'TFIDF_156', 'TFIDF_89', 'TFIDF_131', 'USE_30', 'TFIDF_124', 'TFIDF_179', 'dist_under_01_count', 'distance', 'TFIDF_166', 'TFIDF_252', 'USE_7', 'TFIDF_378', 'SVD_10', 'TFIDF_66', 'TFIDF_515', 'TFIDF_476', 'TFIDF_417', 'TFIDF_312', 'name_len', 'TFIDF_17', 'TFIDF_294', 'TFIDF_430', 'USE_16', 
'TFIDF_155', 'TFIDF_184', 'TFIDF_141', 'TFIDF_103', 'TFIDF_59', 'TFIDF_424', 'TFIDF_441', 'TFIDF_480', 'TFIDF_372', 'TFIDF_475', 'USE_17', 'TFIDF_192', 'is_free', 'TFIDF_178', 'TFIDF_432', 'TFIDF_25', 'TFIDF_481', 'TFIDF_2', 'USE_35', 'TFIDF_48', 'USE_36', 'TFIDF_87', 'TFIDF_28', 'TFIDF_269', 'TFIDF_520', 'USE_24', 'TFIDF_101', 'TFIDF_335', 'TFIDF_381', 'TFIDF_399', 'TFIDF_142', 'TFIDF_487', 'TFIDF_159', 'number_of_reviews', 'last_review_year', 'TFIDF_453', 'TFIDF_188', 'TFIDF_222', 'TFIDF_237', 'TFIDF_151', 'USE_44', 'TFIDF_197', 'TFIDF_47', 'TFIDF_456', 'TFIDF_13', 'BERT_10', 'USE_28', 'TFIDF_270', 'TFIDF_6', 'TFIDF_146', 'TFIDF_341', 'TFIDF_123', 'TFIDF_106', 'TFIDF_385', 'TFIDF_138', 'SVD_13', 'TFIDF_71', 'TFIDF_435', 'SVD_6', 'TFIDF_431', 'TFIDF_510', 'TFIDF_423', 'TFIDF_320', 'TFIDF_36', 'TFIDF_305', 'TFIDF_329', 'TFIDF_322', 'TFIDF_426']

Train LightGBM

def evaluation(true, pred):
    score = np.sqrt(mean_squared_error(true, pred))
    return score
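Since the model is trained on y_log = log(y), this RMSE is computed in log space; the OOF value printed at the end of training (0.8276) is the CV score quoted in the title.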
Encoding
# Label encoding
categoricals = ['room_type', 'neighbourhood', 'station_name', 'gmm']
features = list(set(features) | set(categoricals))
for c in categoricals:
    le = LabelEncoder()
    le.fit(pd.concat([train_df, test_df])[c].astype(str))
    train_df[c] = le.transform(train_df[c].astype(str))
    test_df[c] = le.transform(test_df[c].astype(str))

# Count encoding
count_enc = ['host_id', 'room_type', 'neighbourhood', 'station_name', 'gmm']
count_features = []
for c in count_enc:
    _count = pd.concat([train_df, test_df])[c].astype(str).value_counts().to_dict()
    train_df[f'{c}_count'] = train_df[c].astype(str).map(_count)
    test_df[f'{c}_count'] = test_df[c].astype(str).map(_count)
    count_features.append(f'{c}_count')
features = list(set(features) | set(count_features))
Train
params = {
    'n_estimators':5000,
    'boosting_type': 'gbdt',
    'metric': 'regression',
    'objective': 'rmse',
    'n_jobs': -1,
    'seed': Config.RANDOM_STATE,
    'learning_rate': 0.01,
}
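# Note: LightGBM accepts 'rmse' as an alias of the L2 regression objective and
# 'regression' as an alias of the l2 metric, which is why the logs below report l2;
# objective='regression' with metric='rmse' would be the more conventional spelling.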

# Target-encoding columns
target_enc = ['neighbourhood', 'station_name', 'gmm']
target_enc_key = ['y_log']

oof_preds = np.zeros(len(train_df))
y_pred = np.zeros(len(test_df))
models = []
cv_scores = {}
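# Stratify folds by nearest station and group by host_id, so listings from the
# same host never end up in both the training and validation folds.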
skf = StratifiedGroupKFold(n_splits=Config.N_FOLD, random_state=Config.RANDOM_STATE, shuffle=True)
for fold, (train_index, test_index) in enumerate(skf.split(train_df[features], train_df['station_name'], train_df['host_id'])):

    print(f'====== fold {fold} ======')

    # Split into training and validation folds
    x_train, x_val = train_df.copy().iloc[train_index][features], train_df.copy().iloc[test_index][features]
    y_train, y_val =  train_df.iloc[train_index][target], train_df.iloc[test_index][target]

    test = test_df[features]

    # Target Encoding
    if len(target_enc) > 0:
        for t in target_enc_key:
            for c in target_enc:
                x_train[f'{c}_target_enc_by_{t}'] = train_df.iloc[train_index][c].map(train_df.iloc[test_index].groupby(c)[t].mean().to_dict())
                x_val[f'{c}_target_enc_by_{t}'] = train_df.iloc[test_index][c].map(train_df.groupby(c)[t].mean().to_dict())
                test[f'{c}_target_enc_by_{t}'] = test_df[c].map(train_df.groupby(c)[t].mean().to_dict())
            
    train_features = x_train.columns.to_list()

    # create Dataset
    train_set = lgb.Dataset(x_train, y_train, categorical_feature=categoricals, free_raw_data=False)
    val_set = lgb.Dataset(x_val, y_val, categorical_feature=categoricals, free_raw_data=False)

    # train
    model = lgb.train(params, train_set, valid_sets=[train_set, val_set], verbose_eval=100, early_stopping_rounds=100)#, feval=rmsle_eval)
    
    models.append(model)

    fold_pred = model.predict(x_val)

    score = evaluation(y_val, fold_pred)
    cv_scores[f'cv{fold}'] = score

    oof_preds[test_index] = fold_pred

    y_pred += model.predict(test) / Config.N_FOLD

    print(f'cv score is {score}')

oof_score = evaluation(train_df[target], oof_preds)
print(f'OOF score is {oof_score}')
====== fold 0 ======
[LightGBM] [Warning] Auto-choosing col-wise multi-threading, the overhead of testing was 0.020747 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37060
[LightGBM] [Info] Number of data points in the train set: 8084, number of used features: 469
[LightGBM] [Info] Start training from score 9.485144
Training until validation scores don't improve for 100 rounds
[100]	training's l2: 0.488048	valid_1's l2: 0.698058
[200]	training's l2: 0.355449	valid_1's l2: 0.640509
[300]	training's l2: 0.286456	valid_1's l2: 0.620083
[400]	training's l2: 0.23974	valid_1's l2: 0.607584
[500]	training's l2: 0.206211	valid_1's l2: 0.598706
[600]	training's l2: 0.180307	valid_1's l2: 0.593431
[700]	training's l2: 0.159994	valid_1's l2: 0.589344
[800]	training's l2: 0.14347	valid_1's l2: 0.585377
[900]	training's l2: 0.129461	valid_1's l2: 0.583878
[1000]	training's l2: 0.117605	valid_1's l2: 0.581992
[1100]	training's l2: 0.107127	valid_1's l2: 0.579298
[1200]	training's l2: 0.098341	valid_1's l2: 0.577533
[1300]	training's l2: 0.0905596	valid_1's l2: 0.575875
Early stopping, best iteration is:
[1282]	training's l2: 0.0918336	valid_1's l2: 0.575756
cv score is 0.7587861164409755
====== fold 1 ======
[LightGBM] [Warning] Auto-choosing col-wise multi-threading, the overhead of testing was 0.030324 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37074
[LightGBM] [Info] Number of data points in the train set: 8061, number of used features: 472
[LightGBM] [Info] Start training from score 9.433627
Training until validation scores don't improve for 100 rounds
[100]	training's l2: 0.490696	valid_1's l2: 0.723275
[200]	training's l2: 0.348588	valid_1's l2: 0.677438
[300]	training's l2: 0.276958	valid_1's l2: 0.661627
[400]	training's l2: 0.23099	valid_1's l2: 0.659778
[500]	training's l2: 0.1973	valid_1's l2: 0.657879
[600]	training's l2: 0.172069	valid_1's l2: 0.654605
[700]	training's l2: 0.152057	valid_1's l2: 0.650254
[800]	training's l2: 0.136132	valid_1's l2: 0.647464
[900]	training's l2: 0.122567	valid_1's l2: 0.645914
[1000]	training's l2: 0.111722	valid_1's l2: 0.645856
Early stopping, best iteration is:
[950]	training's l2: 0.116976	valid_1's l2: 0.645199
cv score is 0.8032430628385007
====== fold 2 ======
[LightGBM] [Warning] Auto-choosing col-wise multi-threading, the overhead of testing was 0.020039 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 36694
[LightGBM] [Info] Number of data points in the train set: 7858, number of used features: 458
[LightGBM] [Info] Start training from score 9.480055
Training until validation scores don't improve for 100 rounds
[100]	training's l2: 0.501974	valid_1's l2: 0.641378
[200]	training's l2: 0.358455	valid_1's l2: 0.621378
[300]	training's l2: 0.287479	valid_1's l2: 0.609193
[400]	training's l2: 0.240488	valid_1's l2: 0.599782
[500]	training's l2: 0.206102	valid_1's l2: 0.592426
[600]	training's l2: 0.179476	valid_1's l2: 0.586709
[700]	training's l2: 0.158619	valid_1's l2: 0.583386
[800]	training's l2: 0.141267	valid_1's l2: 0.581166
[900]	training's l2: 0.127166	valid_1's l2: 0.579021
[1000]	training's l2: 0.115052	valid_1's l2: 0.577815
[1100]	training's l2: 0.104588	valid_1's l2: 0.576488
[1200]	training's l2: 0.0956801	valid_1's l2: 0.575222
[1300]	training's l2: 0.0879042	valid_1's l2: 0.574342
[1400]	training's l2: 0.0809795	valid_1's l2: 0.573692
[1500]	training's l2: 0.0748301	valid_1's l2: 0.572872
[1600]	training's l2: 0.0692713	valid_1's l2: 0.572203
[1700]	training's l2: 0.0641214	valid_1's l2: 0.571866
[1800]	training's l2: 0.0596875	valid_1's l2: 0.571812
[1900]	training's l2: 0.0557035	valid_1's l2: 0.571907
Early stopping, best iteration is:
[1843]	training's l2: 0.0579282	valid_1's l2: 0.571618
cv score is 0.7560543840213751
====== fold 3 ======
[LightGBM] [Warning] Auto-choosing col-wise multi-threading, the overhead of testing was 0.029729 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37088
[LightGBM] [Info] Number of data points in the train set: 8044, number of used features: 463
[LightGBM] [Info] Start training from score 9.460043
Training until validation scores don't improve for 100 rounds
[100]	training's l2: 0.495351	valid_1's l2: 0.590806
[200]	training's l2: 0.357956	valid_1's l2: 0.560963
[300]	training's l2: 0.28442	valid_1's l2: 0.557662
[400]	training's l2: 0.237224	valid_1's l2: 0.55367
[500]	training's l2: 0.20328	valid_1's l2: 0.552406
[600]	training's l2: 0.177356	valid_1's l2: 0.548525
[700]	training's l2: 0.156841	valid_1's l2: 0.54736
[800]	training's l2: 0.140504	valid_1's l2: 0.546186
[900]	training's l2: 0.126994	valid_1's l2: 0.54589
[1000]	training's l2: 0.115542	valid_1's l2: 0.54398
[1100]	training's l2: 0.105512	valid_1's l2: 0.54394
Early stopping, best iteration is:
[1073]	training's l2: 0.108124	valid_1's l2: 0.54373
cv score is 0.737380513342473
====== fold 4 ======
[LightGBM] [Warning] Auto-choosing col-wise multi-threading, the overhead of testing was 0.029168 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 36683
[LightGBM] [Info] Number of data points in the train set: 7913, number of used features: 469
[LightGBM] [Info] Start training from score 9.456076
Training until validation scores don't improve for 100 rounds
[100]	training's l2: 0.459676	valid_1's l2: 1.13661
[200]	training's l2: 0.335324	valid_1's l2: 1.11005
[300]	training's l2: 0.269613	valid_1's l2: 1.09511
[400]	training's l2: 0.225178	valid_1's l2: 1.08983
[500]	training's l2: 0.192335	valid_1's l2: 1.08412
[600]	training's l2: 0.167498	valid_1's l2: 1.0799
[700]	training's l2: 0.14797	valid_1's l2: 1.07581
[800]	training's l2: 0.132492	valid_1's l2: 1.07327
[900]	training's l2: 0.119821	valid_1's l2: 1.07109
Early stopping, best iteration is:
[894]	training's l2: 0.120568	valid_1's l2: 1.07088
cv score is 1.0348309838938978
OOF score is 0.8276368501192828
"""df_importance = None

for i, model in enumerate(models):
    if df_importance is None:
        _df = pd.DataFrame([model.feature_importance(importance_type='gain'), train_features]).T
        _df.columns = [f'model_{i}_gain', 'feature']
        df_importance = _df
    else:
        _df = pd.DataFrame([model.feature_importance(importance_type='gain'), train_features]).T
        _df.columns = [f'model_{i}_gain', 'feature']
        df_importance = df_importance.merge(_df, how='outer', on='feature')

df_imp = df_importance
df_imp['mean'] = df_imp[[f'model_{i}_gain' for i in range(len(models))]].mean(axis=1)
order = df_imp.sort_values('mean', ascending=False)['feature'].tolist()

df_imp = pd.melt(df_imp, id_vars=['feature'], value_vars=[f'model_{i}_gain' for i in range(len(models))])
df_imp['value'] = df_imp['value'].astype(float)

fig, ax = plt.subplots(figsize=(len(df_imp['feature'].drop_duplicates()) * .4, 5))
sns.boxenplot(x="feature", y="value", data=df_imp, order=order)
ax.tick_params(axis='x', rotation=90)
ax.set_title('feature importance')
plt.show()"""
fig, axes = plt.subplots(1, 2, figsize=(15, 5))
axes = axes.reshape(-1)
sns.scatterplot(train_df['y_log'], oof_preds, ax=axes[0], alpha=0.5)
sns.distplot(train_df['y_log'], ax=axes[1], label='true')
sns.distplot(oof_preds, ax=axes[1], label='pred')
plt.legend()
test_df['y'] = np.exp(y_pred)
test_df[['id', 'y']].to_csv('../output/sub15.csv', index=False)

Attached data

  • exp015.ipynb