一泊の適正価格はいくら?
yshr10ic
train_data.csv
test_data.csv
!pip install matplotlib-venn > /dev/null
import datetime import matplotlib.pyplot as plt from matplotlib_venn import venn2 import numpy as np import os import pandas as pd import seaborn as sns pd.options.display.float_format = '{:.2f}'.format import warnings warnings.simplefilter('ignore')
data_dir = '../data/datasets'

# Read the four competition CSV files from the data directory in one pass.
train_df, test_df, station_df, submit_df = (
    pd.read_csv(os.path.join(data_dir, csv_name))
    for csv_name in (
        'train_data.csv',
        'test_data.csv',
        'station_list.csv',
        'submission.csv',
    )
)

# Report (rows, columns) for every loaded frame.
for frame_name, loaded_df in (
    ('train_df', train_df),
    ('test_df', test_df),
    ('station_df', station_df),
    ('submit_df', submit_df),
):
    print(f'{frame_name}.shape={loaded_df.shape!r}')
train_df.shape=(9990, 13) test_df.shape=(4996, 12) station_df.shape=(203, 3) submit_df.shape=(4996, 2)
# Training set: first rows, summary statistics, then the schema report.
train_head = train_df.head()
display(train_head)

train_stats = train_df.describe()
display(train_stats)

# info() prints its report itself; its return value is displayed as well.
train_info = train_df.info()
display(train_info)
<class 'pandas.core.frame.DataFrame'> RangeIndex: 9990 entries, 0 to 9989 Data columns (total 13 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 id 9990 non-null int64 1 name 9990 non-null object 2 host_id 9990 non-null int64 3 neighbourhood 9990 non-null object 4 latitude 9990 non-null float64 5 longitude 9990 non-null float64 6 room_type 9990 non-null object 7 minimum_nights 9990 non-null int64 8 number_of_reviews 9990 non-null int64 9 last_review 8291 non-null object 10 reviews_per_month 8291 non-null float64 11 availability_365 9990 non-null int64 12 y 9990 non-null int64 dtypes: float64(3), int64(6), object(4) memory usage: 1014.7+ KB
None
# Test set: first rows, summary statistics, then the schema report.
test_head = test_df.head()
display(test_head)

test_stats = test_df.describe()
display(test_stats)

# info() prints its report itself; its return value is displayed as well.
test_info = test_df.info()
display(test_info)
<class 'pandas.core.frame.DataFrame'> RangeIndex: 4996 entries, 0 to 4995 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 id 4996 non-null int64 1 name 4996 non-null object 2 host_id 4996 non-null int64 3 neighbourhood 4996 non-null object 4 latitude 4996 non-null float64 5 longitude 4996 non-null float64 6 room_type 4996 non-null object 7 minimum_nights 4996 non-null int64 8 number_of_reviews 4996 non-null int64 9 last_review 4164 non-null object 10 reviews_per_month 4164 non-null float64 11 availability_365 4996 non-null int64 dtypes: float64(3), int64(5), object(4) memory usage: 468.5+ KB
# Station list: first rows, summary statistics, then the schema report.
station_head = station_df.head()
display(station_head)

station_stats = station_df.describe()
display(station_stats)

# info() prints its report itself; its return value is displayed as well.
station_info = station_df.info()
display(station_info)
<class 'pandas.core.frame.DataFrame'> RangeIndex: 203 entries, 0 to 202 Data columns (total 3 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 station_name 203 non-null object 1 longitude 203 non-null float64 2 latitude 203 non-null float64 dtypes: float64(2), object(1) memory usage: 4.9+ KB
last_review
reviews_per_month
# Per-column missing-value counts, train and test side by side.
null_counts = pd.DataFrame({
    'train': train_df.isnull().sum(),
    'test': test_df.isnull().sum(),
})
display(null_counts.reset_index())

# Number of fully duplicated rows inside each split.
for split_tag, split_df in (('tr', train_df), ('te', test_df)):
    print(f"[{split_tag}] {split_df.duplicated().sum()}")
[tr] 0 [te] 0
# Stack train and test without the target column, then count rows that are
# duplicated across the combined frame.
all_df = pd.concat([train_df, test_df]).drop(columns=['y'])
cross_split_duplicates = all_df.duplicated().sum()
print(f"{cross_split_duplicates}")
0
train
test
# Row count versus number of distinct listing names, per split.
for split_tag, split_df in (('tr', train_df), ('te', test_df)):
    print(f"[{split_tag}] len: {len(split_df)}, nunique: {split_df['name'].nunique()}")
[tr] len: 9990, nunique: 9114 [te] len: 4996, nunique: 4754
# Overlap of listing names between train and test.
train_names = set(train_df['name'])
test_names = set(test_df['name'])
venn2([train_names, test_names], set_labels=('train', 'test'));

# Row count versus number of distinct hosts, per split.
for split_tag, split_df in (('tr', train_df), ('te', test_df)):
    print(f"[{split_tag}] len: {len(split_df)}, nunique: {split_df['host_id'].nunique()}")
[tr] len: 9990, nunique: 2325 [te] len: 4996, nunique: 1254
# Overlap of host ids between train and test.
train_hosts = set(train_df['host_id'])
test_hosts = set(test_df['host_id'])
venn2([train_hosts, test_hosts], set_labels=('train', 'test'));

# Row count versus number of distinct neighbourhoods, per split.
for split_tag, split_df in (('tr', train_df), ('te', test_df)):
    print(f"[{split_tag}] len: {len(split_df)}, nunique: {split_df['neighbourhood'].nunique()}")
[tr] len: 9990, nunique: 23 [te] len: 4996, nunique: 23
# Overlap of neighbourhood values between train and test.
train_neighbourhoods = set(train_df['neighbourhood'])
test_neighbourhoods = set(test_df['neighbourhood'])
venn2([train_neighbourhoods, test_neighbourhoods], set_labels=('train', 'test'));

# Share of rows per neighbourhood in each split, most common in train first.
neighbourhood_share = pd.DataFrame({
    'train': train_df['neighbourhood'].value_counts() / len(train_df),
    'test': test_df['neighbourhood'].value_counts() / len(test_df),
})
display(neighbourhood_share.reset_index().sort_values('train', ascending=False))

# Target distribution per neighbourhood, all rows.
plt.figure(figsize=(25, 6))
neighbourhood_all = train_df['neighbourhood']
target_all = train_df['y']
sns.boxplot(x=neighbourhood_all, y=target_all)
plt.show()

# Same plot restricted to rows with y <= 50000 so the boxes stay readable.
cond = train_df['y'] <= 50000
plt.figure(figsize=(25, 6))
neighbourhood_capped = train_df.loc[cond, 'neighbourhood']
target_capped = train_df.loc[cond, 'y']
sns.boxplot(x=neighbourhood_capped, y=target_capped)
plt.show()

# Summary statistics of the target per neighbourhood.
train_df[['neighbourhood', 'y']].groupby('neighbourhood').agg(
    ['min', 'median', 'mean', 'max', 'std']
)

# Row count versus number of distinct room types, per split.
for split_tag, split_df in (('tr', train_df), ('te', test_df)):
    print(f"[{split_tag}] len: {len(split_df)}, nunique: {split_df['room_type'].nunique()}")
[tr] len: 9990, nunique: 4 [te] len: 4996, nunique: 4
# Overlap of room_type values between train and test.
train_room_types = set(train_df['room_type'])
test_room_types = set(test_df['room_type'])
venn2([train_room_types, test_room_types], set_labels=('train', 'test'));

# Share of rows per room type in each split, most common in train first.
room_type_share = pd.DataFrame({
    'train': train_df['room_type'].value_counts() / len(train_df),
    'test': test_df['room_type'].value_counts() / len(test_df),
})
display(room_type_share.reset_index().sort_values('train', ascending=False))

# Target distribution per room type, all rows.
plt.figure(figsize=(16, 6))
room_type_all = train_df['room_type']
target_all = train_df['y']
sns.boxplot(x=room_type_all, y=target_all)
plt.show()

# Same plot restricted to rows with y <= 50000.
cond = train_df['y'] <= 50000
plt.figure(figsize=(25, 6))
room_type_capped = train_df.loc[cond, 'room_type']
target_capped = train_df.loc[cond, 'y']
sns.boxplot(x=room_type_capped, y=target_capped)
plt.show()

# Summary statistics of the target per room type.
train_df[['room_type', 'y']].groupby('room_type').agg(
    ['min', 'median', 'mean', 'max', 'std']
)

# Row count versus number of distinct minimum_nights values, per split.
for split_tag, split_df in (('tr', train_df), ('te', test_df)):
    print(f"[{split_tag}] len: {len(split_df)}, nunique: {split_df['minimum_nights'].nunique()}")
[tr] len: 9990, nunique: 30 [te] len: 4996, nunique: 26
# Share of rows per minimum_nights value in each split, most common in train first.
minimum_nights_share = pd.DataFrame({
    'train': train_df['minimum_nights'].value_counts() / len(train_df),
    'test': test_df['minimum_nights'].value_counts() / len(test_df),
})
display(minimum_nights_share.reset_index().sort_values('train', ascending=False))

# Training rows with a minimum stay of 100 nights or more.
is_long_stay = train_df['minimum_nights'] >= 100
display(train_df.loc[is_long_stay, :])

# Training rows whose minimum stay exceeds availability_365.
exceeds_availability = train_df['minimum_nights'] > train_df['availability_365']
display(train_df.loc[exceeds_availability, :])
1486 rows × 13 columns
# Target against minimum_nights, restricted to rows with minimum_nights < 50.
cond = train_df['minimum_nights'] < 50
plt.figure(figsize=(16, 8))
short_stay_nights = train_df.loc[cond, 'minimum_nights']
short_stay_target = train_df.loc[cond, 'y']
plt.scatter(short_stay_nights, short_stay_target)
plt.show()

# Histograms of number_of_reviews: train first, then test.
for split_df in (train_df, test_df):
    split_df['number_of_reviews'].hist();

# Row count versus number of distinct number_of_reviews values, per split.
for split_tag, split_df in (('tr', train_df), ('te', test_df)):
    print(f"[{split_tag}] len: {len(split_df)}, nunique: {split_df['number_of_reviews'].nunique()}")
[tr] len: 9990, nunique: 261 [te] len: 4996, nunique: 225
# Range of number_of_reviews over the non-null values of each split.
for split_tag, split_df in (('tr', train_df), ('te', test_df)):
    present_reviews = split_df['number_of_reviews'].dropna()
    print(f'[{split_tag}] min:', present_reviews.min())
    print(f'[{split_tag}] max:', present_reviews.max())
[tr] min: 0 [tr] max: 529 [te] min: 0 [te] max: 407
# Target against number_of_reviews on the training set.
plt.figure(figsize=(16, 8))
review_counts = train_df['number_of_reviews']
target = train_df['y']
plt.scatter(review_counts, target)
plt.show()

# Earliest and latest last_review among the non-null values of each split.
for split_tag, split_df in (('tr', train_df), ('te', test_df)):
    present_last_review = split_df['last_review'].dropna()
    print(f'[{split_tag}] min:', present_last_review.min())
    print(f'[{split_tag}] max:', present_last_review.max())
[tr] min: 2016-11-19 [tr] max: 2020-04-29 [te] min: 2015-05-25 [te] max: 2020-04-28
# Check on train that reviews_per_month and last_review are missing on the
# same rows: if so, all three counts printed below are equal.
train_rpm_missing = train_df['reviews_per_month'].isnull()
train_last_review_missing = train_df['last_review'].isnull()
print(train_rpm_missing.sum())
print(train_last_review_missing.sum())
train_either_missing = train_rpm_missing | train_last_review_missing
print(train_df.loc[train_either_missing, 'last_review'].isnull().sum())
1699 1699 1699
# Check on test that reviews_per_month and last_review are missing on the
# same rows: if so, all three counts printed below are equal.
test_rpm_missing = test_df['reviews_per_month'].isnull()
test_last_review_missing = test_df['last_review'].isnull()
print(test_rpm_missing.sum())
print(test_last_review_missing.sum())
test_either_missing = test_rpm_missing | test_last_review_missing
print(test_df.loc[test_either_missing, 'last_review'].isnull().sum())
832 832 832
2020-04-30
def get_last_review_days(df, reference_date=datetime.date(2020, 4, 30)):
    """Add a ``last_review_days`` column counting days since the last review.

    The column holds the number of whole calendar days between each row's
    ``last_review`` date and ``reference_date``.

    Args:
        df: DataFrame with a ``last_review`` column that ``pd.to_datetime``
            can parse. The new column is written into this frame.
        reference_date: Date the day count is measured up to. Defaults to
            2020-04-30, the value previously hard-coded here.

    Returns:
        The same ``df`` object, with ``last_review_days`` added. The column
        is integer-valued when every ``last_review`` is present; rows with a
        missing date yield NaN instead of raising.
    """
    # Drop any time-of-day component so differences are whole calendar days.
    review_dates = pd.to_datetime(df['last_review']).dt.normalize()
    elapsed = pd.Timestamp(reference_date) - review_dates
    # `.dt.days` replaces `.astype('timedelta64[D]')`, which pandas >= 2.0
    # rejects (only s/ms/us/ns timedelta resolutions are supported).
    df['last_review_days'] = elapsed.dt.days
    return df
# Keep only rows that have a last_review date. get_last_review_days assigns a
# new column to the frame it receives, so pass explicit copies rather than
# .loc slices of train_df / test_df (avoids chained-assignment ambiguity).
train_reviewed = train_df.loc[~train_df['last_review'].isnull(), :].copy()
test_reviewed = test_df.loc[~test_df['last_review'].isnull(), :].copy()
train_tmp_df = get_last_review_days(train_reviewed)
test_tmp_df = get_last_review_days(test_reviewed)

# Histograms of days since the last review: train first, then test.
for reviewed_df in (train_tmp_df, test_tmp_df):
    reviewed_df['last_review_days'].hist();

# Range of days since the last review, per split.
for split_tag, reviewed_df in (('tr', train_tmp_df), ('te', test_tmp_df)):
    print(f'[{split_tag}] min:', reviewed_df['last_review_days'].min())
    print(f'[{split_tag}] max:', reviewed_df['last_review_days'].max())
[tr] min: 1 [tr] max: 1258 [te] min: 2 [te] max: 1802
# Target against days since the last review (training rows that have one).
plt.figure(figsize=(16, 8))
days_since_review = train_tmp_df['last_review_days']
reviewed_target = train_tmp_df['y']
plt.scatter(days_since_review, reviewed_target)
plt.show()

# Histograms of reviews_per_month: train first, then test.
for split_df in (train_df, test_df):
    split_df['reviews_per_month'].hist();

# Row count versus number of distinct reviews_per_month values, per split.
for split_tag, split_df in (('tr', train_df), ('te', test_df)):
    print(f"[{split_tag}] len: {len(split_df)}, nunique: {split_df['reviews_per_month'].nunique()}")
[tr] len: 9990, nunique: 594 [te] len: 4996, nunique: 501
# Range of reviews_per_month, per split.
for split_tag, split_df in (('tr', train_df), ('te', test_df)):
    monthly_reviews = split_df['reviews_per_month']
    print(f'[{split_tag}] min:', monthly_reviews.min())
    print(f'[{split_tag}] max:', monthly_reviews.max())
[tr] min: 0.02 [tr] max: 43.6 [te] min: 0.03 [te] max: 19.7
# Target against reviews_per_month on the training set.
plt.figure(figsize=(16, 8))
monthly_reviews = train_df['reviews_per_month']
target = train_df['y']
plt.scatter(monthly_reviews, target)
plt.show()

# Histograms of availability_365: train first, then test.
for split_df in (train_df, test_df):
    split_df['availability_365'].hist();

# Row count versus number of distinct availability_365 values, per split.
for split_tag, split_df in (('tr', train_df), ('te', test_df)):
    print(f"[{split_tag}] len: {len(split_df)}, nunique: {split_df['availability_365'].nunique()}")
[tr] len: 9990, nunique: 366 [te] len: 4996, nunique: 364
# Range of availability_365, per split.
for split_tag, split_df in (('tr', train_df), ('te', test_df)):
    availability = split_df['availability_365']
    print(f'[{split_tag}] min:', availability.min())
    print(f'[{split_tag}] max:', availability.max())
[tr] min: 0 [tr] max: 365 [te] min: 0 [te] max: 365
# Target against availability_365 on the training set.
plt.figure(figsize=(16, 8))
availability = train_df['availability_365']
target = train_df['y']
plt.scatter(availability, target)
plt.show()

# Distribution of the raw target.
target.hist();

# Distribution of the natural log of the target.
log_target = np.log(train_df['y'])
log_target.hist();