EDA

train_data.csvとtest_data.csvを用いて簡単なEDAを実施してみました
動作確認はローカル環境でしておりますが、複雑な処理はしておりませんので、必要なライブラリをインストールすればGoogle Colabでも問題なく動作すると思います
ご意見、ノートブックの不備などございましたら、コメントいただけると幸いです

!pip install matplotlib-venn > /dev/null

import datetime
import matplotlib.pyplot as plt
from matplotlib_venn import venn2
import numpy as np
import os
import pandas as pd
import seaborn as sns

pd.options.display.float_format = '{:.2f}'.format

import warnings
warnings.simplefilter('ignore')

data_dir = '../data/datasets'

train_df = pd.read_csv(os.path.join(data_dir, 'train_data.csv'))
test_df = pd.read_csv(os.path.join(data_dir, 'test_data.csv'))
station_df = pd.read_csv(os.path.join(data_dir, 'station_list.csv'))
submit_df = pd.read_csv(os.path.join(data_dir, 'submission.csv'))

print(f'{train_df.shape=}')
print(f'{test_df.shape=}')
print(f'{station_df.shape=}')
print(f'{submit_df.shape=}')

train_df.shape=(9990, 13)
test_df.shape=(4996, 12)
station_df.shape=(203, 3)
submit_df.shape=(4996, 2)

display(train_df.head())

	id	name	host_id	neighbourhood	latitude	longitude	room_type	minimum_nights	number_of_reviews	last_review	reviews_per_month	availability_365	y
0	1	KiyosumiShirakawa 3min\|★SkyTree★\|WIFI\|Max4\|Tre...	242899459	Koto Ku	35.68	139.80	Entire home/apt	1	55	2020-04-25	2.21	173	12008
1	2	Downtown Tokyo Iriya next to Ueno	308879948	Taito Ku	35.72	139.79	Entire home/apt	6	72	2020-03-25	2.11	9	6667
2	3	Japan Style,Private,Affordable,4min to Sta.	300877823	Katsushika Ku	35.75	139.82	Entire home/apt	1	18	2020-03-23	3.46	288	9923
3	4	4 min to Shinjuku Sta. by train / 2 ppl / Wi-fi	236935461	Shibuya Ku	35.68	139.68	Entire home/apt	1	2	2020-04-02	1.76	87	8109
4	5	LICENSED SHINJUKU HOUSE: Heart of the action!	243408889	Shinjuku Ku	35.70	139.70	Entire home/apt	1	86	2020-01-30	2.00	156	100390

display(train_df.describe())

	id	host_id	latitude	longitude	minimum_nights	number_of_reviews	reviews_per_month	availability_365	y
count	9990.00	9990.00	9990.00	9990.00	9990.00	9990.00	8291.00	9990.00	9990.00
mean	4995.50	174101304.02	35.70	139.74	3.33	25.94	1.70	157.20	25104.64
std	2884.01	98387863.86	0.04	0.05	8.50	40.59	1.40	115.78	67049.85
min	1.00	151977.00	35.54	139.58	1.00	0.00	0.02	0.00	921.00
25%	2498.25	78803503.00	35.69	139.70	1.00	2.00	0.68	63.00	6960.75
50%	4995.50	188123234.00	35.70	139.73	1.00	11.00	1.48	154.00	11892.50
75%	7492.75	252666139.00	35.73	139.79	2.00	33.00	2.42	248.00	20085.00
max	9990.00	344270090.00	35.82	139.91	365.00	529.00	43.60	365.00	1000103.00

display(train_df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9990 entries, 0 to 9989
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 9990 non-null   int64  
 1   name               9990 non-null   object 
 2   host_id            9990 non-null   int64  
 3   neighbourhood      9990 non-null   object 
 4   latitude           9990 non-null   float64
 5   longitude          9990 non-null   float64
 6   room_type          9990 non-null   object 
 7   minimum_nights     9990 non-null   int64  
 8   number_of_reviews  9990 non-null   int64  
 9   last_review        8291 non-null   object 
 10  reviews_per_month  8291 non-null   float64
 11  availability_365   9990 non-null   int64  
 12  y                  9990 non-null   int64  
dtypes: float64(3), int64(6), object(4)
memory usage: 1014.7+ KB

None

display(test_df.head())

	id	name	host_id	neighbourhood	latitude	longitude	room_type	minimum_nights	number_of_reviews	last_review	reviews_per_month	availability_365
0	1	5-minute walk from Akasaka Sta, Superior double	184730720	Minato Ku	35.67	139.73	Private room	1	0	NaN	NaN	183
1	2	7 min Sta.-Center of IKEBUKURO Cozy Room#503	20993205	Toshima Ku	35.73	139.72	Entire home/apt	2	21	2020-04-16	1.94	337
2	3	Designer'sApt 1min sta☆Shinjuku 7min☆Shibuya 4min	322521715	Setagaya Ku	35.66	139.67	Entire home/apt	1	14	2020-02-12	0.82	240
3	4	Komagome Station 2 minutes on foot	234477095	Toshima Ku	35.74	139.75	Entire home/apt	1	16	2020-02-17	1.19	0
4	5	Monthly/Metro1min/JR5min/Ueno,Asakusa,Akihabara	145453833	Taito Ku	35.72	139.78	Entire home/apt	30	2	2019-07-21	0.19	164

display(test_df.describe())

	id	host_id	latitude	longitude	minimum_nights	number_of_reviews	reviews_per_month	availability_365
count	4996.00	4996.00	4996.00	4996.00	4996.00	4996.00	4164.00	4996.00
mean	2498.50	164873199.49	35.70	139.74	3.20	28.01	1.72	153.02
std	1442.37	104265915.85	0.04	0.06	10.35	41.05	1.31	115.44
min	1.00	771694.00	35.54	139.59	1.00	0.00	0.03	0.00
25%	1249.75	58261300.00	35.69	139.70	1.00	2.00	0.75	62.00
50%	2498.50	172626015.50	35.70	139.73	1.00	13.00	1.56	141.00
75%	3747.25	256411204.00	35.72	139.79	2.00	36.00	2.41	246.00
max	4996.00	344043769.00	35.80	139.91	365.00	407.00	19.70	365.00

display(test_df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4996 entries, 0 to 4995
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 4996 non-null   int64  
 1   name               4996 non-null   object 
 2   host_id            4996 non-null   int64  
 3   neighbourhood      4996 non-null   object 
 4   latitude           4996 non-null   float64
 5   longitude          4996 non-null   float64
 6   room_type          4996 non-null   object 
 7   minimum_nights     4996 non-null   int64  
 8   number_of_reviews  4996 non-null   int64  
 9   last_review        4164 non-null   object 
 10  reviews_per_month  4164 non-null   float64
 11  availability_365   4996 non-null   int64  
dtypes: float64(3), int64(5), object(4)
memory usage: 468.5+ KB

None

display(station_df.head())

	station_name	longitude	latitude
0	東京	139.77	35.68
1	新橋	139.76	35.67
2	品川	139.74	35.63
3	大崎	139.73	35.62
4	五反田	139.72	35.63

display(station_df.describe())

	longitude	latitude
count	203.00	203.00
mean	139.74	35.69
std	0.05	0.04
min	139.61	35.54
25%	139.70	35.67
50%	139.74	35.69
75%	139.77	35.73
max	139.87	35.78

display(station_df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 203 entries, 0 to 202
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   station_name  203 non-null    object 
 1   longitude     203 non-null    float64
 2   latitude      203 non-null    float64
dtypes: float64(2), object(1)
memory usage: 4.9+ KB

None

train_df, test_df

check null

last_reviewとreviews_per_monthで欠損値がある

display(
    pd.DataFrame({
        'train': train_df.isnull().sum(),
        'test': test_df.isnull().sum()
    }).reset_index()
)

	index	train	test
0	availability_365	0	0.00
1	host_id	0	0.00
2	id	0	0.00
3	last_review	1699	832.00
4	latitude	0	0.00
5	longitude	0	0.00
6	minimum_nights	0	0.00
7	name	0	0.00
8	neighbourhood	0	0.00
9	number_of_reviews	0	0.00
10	reviews_per_month	1699	832.00
11	room_type	0	0.00
12	y	0	NaN

check duplicated

重複はない

print(f"[tr] {train_df.duplicated().sum()}")
print(f"[te] {test_df.duplicated().sum()}")

[tr] 0
[te] 0

# Stack train and test without the target column (present only in train),
# then count rows that are exact duplicates across the combined set.
all_df = pd.concat([train_df, test_df]).drop(columns=['y'])

print(f"{all_df.duplicated().sum()}")

name

一部重複あり
trainとtestでも重複あり

print(f"[tr] len: {len(train_df)}, nunique: {train_df['name'].nunique()}")
print(f"[te] len: {len(test_df)}, nunique: {test_df['name'].nunique()}")

[tr] len: 9990, nunique: 9114
[te] len: 4996, nunique: 4754

venn2([set(train_df['name']), set(test_df['name'])], set_labels=('train', 'test'));

host_id

重複あり
trainとtestでは重複はない

print(f"[tr] len: {len(train_df)}, nunique: {train_df['host_id'].nunique()}")
print(f"[te] len: {len(test_df)}, nunique: {test_df['host_id'].nunique()}")

[tr] len: 9990, nunique: 2325
[te] len: 4996, nunique: 1254

venn2([set(train_df['host_id']), set(test_df['host_id'])], set_labels=('train', 'test'));

neighbourhood

２３区のローマ字表記
trainとtestで同じ
trainとtestでレコード数の順番に大きな差はないが、testの方が新宿区の割合が高くなっている

print(f"[tr] len: {len(train_df)}, nunique: {train_df['neighbourhood'].nunique()}")
print(f"[te] len: {len(test_df)}, nunique: {test_df['neighbourhood'].nunique()}")

[tr] len: 9990, nunique: 23
[te] len: 4996, nunique: 23

venn2([set(train_df['neighbourhood']), set(test_df['neighbourhood'])], set_labels=('train', 'test'));

display(
    pd.DataFrame({
        'train': train_df['neighbourhood'].value_counts() / len(train_df),
        'test': test_df['neighbourhood'].value_counts() / len(test_df)
    }).reset_index().sort_values('train', ascending=False)
)

	index	train	test
18	Shinjuku Ku	0.17	0.21
21	Taito Ku	0.16	0.11
22	Toshima Ku	0.11	0.10
20	Sumida Ku	0.08	0.11
16	Shibuya Ku	0.07	0.08
4	Chuo Ku	0.05	0.03
11	Minato Ku	0.04	0.04
14	Ota Ku	0.03	0.04
12	Nakano Ku	0.03	0.03
1	Arakawa Ku	0.03	0.02
7	Katsushika Ku	0.03	0.03
15	Setagaya Ku	0.03	0.02
8	Kita Ku	0.02	0.03
6	Itabashi Ku	0.02	0.02
19	Suginami Ku	0.02	0.02
9	Koto Ku	0.02	0.01
3	Chiyoda Ku	0.02	0.01
5	Edogawa Ku	0.02	0.02
17	Shinagawa Ku	0.02	0.02
0	Adachi Ku	0.01	0.01
2	Bunkyo Ku	0.01	0.01
13	Nerima Ku	0.01	0.01
10	Meguro Ku	0.00	0.01

plt.figure(figsize=(25, 6))
sns.boxplot(x=train_df['neighbourhood'], y=train_df['y'])
plt.show()

yの値が50000以下のレコードで表示
文京区だけ異常に高い

cond = train_df['y'] <= 50000
plt.figure(figsize=(25, 6))
sns.boxplot(x=train_df.loc[cond, 'neighbourhood'], y=train_df.loc[cond, 'y'])
plt.show()

train_df[['neighbourhood', 'y']].groupby('neighbourhood').agg(['min', 'median', 'mean', 'max', 'std'])

	y
	min	median	mean	max	std
neighbourhood
Adachi Ku	1138	5992.50	9229.71	113899	13974.48
Arakawa Ku	2375	8155.00	29292.74	300076	66255.71
Bunkyo Ku	2821	14899.50	27752.41	100023	23440.60
Chiyoda Ku	1583	11544.00	82929.30	900072	237491.24
Chuo Ku	1999	19884.50	42940.40	100179	42685.88
Edogawa Ku	1275	9934.00	54305.56	900077	167168.66
Itabashi Ku	1106	6952.00	21884.25	300075	57985.75
Katsushika Ku	1157	9485.00	25576.39	500058	61646.34
Kita Ku	2007	9490.50	14327.90	149977	17997.69
Koto Ku	1785	8992.00	15175.55	250056	25729.89
Meguro Ku	2461	10906.00	14005.31	37970	8688.90
Minato Ku	2683	14905.00	22049.52	999961	53838.95
Nakano Ku	1520	11958.00	18938.62	200067	23215.39
Nerima Ku	1176	5986.00	16594.46	500029	60913.03
Ota Ku	2162	12065.00	15485.92	100045	15858.06
Setagaya Ku	2502	8508.00	12588.77	79951	12077.33
Shibuya Ku	2046	13984.00	26797.93	1000103	59916.99
Shinagawa Ku	1483	11992.50	19234.70	149996	24353.24
Shinjuku Ku	1388	11906.00	19001.67	900132	29691.65
Suginami Ku	1944	7558.00	13412.81	100060	18831.11
Sumida Ku	921	12688.00	23264.12	300434	33305.88
Taito Ku	1374	15028.00	28606.30	900387	75333.16
Toshima Ku	1808	9905.00	25977.99	983382	88482.36

room_type

4種類
trainとtestで割合もほとんど同じ

print(f"[tr] len: {len(train_df)}, nunique: {train_df['room_type'].nunique()}")
print(f"[te] len: {len(test_df)}, nunique: {test_df['room_type'].nunique()}")

[tr] len: 9990, nunique: 4
[te] len: 4996, nunique: 4

venn2([set(train_df['room_type']), set(test_df['room_type'])], set_labels=('train', 'test'));

display(
    pd.DataFrame({
        'train': train_df['room_type'].value_counts() / len(train_df),
        'test': test_df['room_type'].value_counts() / len(test_df)
    }).reset_index().sort_values('train', ascending=False)
)

	index	train	test
0	Entire home/apt	0.69	0.70
2	Private room	0.21	0.22
1	Hotel room	0.05	0.04
3	Shared room	0.05	0.04

plt.figure(figsize=(16, 6))
sns.boxplot(x=train_df['room_type'], y=train_df['y'])
plt.show()

yの値が50000以下のレコードで表示
シェアルームが他に比べて低いことが分かる

cond = train_df['y'] <= 50000
plt.figure(figsize=(25, 6))
sns.boxplot(x=train_df.loc[cond, 'room_type'], y=train_df.loc[cond, 'y'])
plt.show()

train_df[['room_type', 'y']].groupby('room_type').agg(['min', 'median', 'mean', 'max', 'std'])

	y
	min	median	mean	max	std
room_type
Entire home/apt	1423	13869.50	29799.55	1000103	73629.92
Hotel room	1374	10081.00	30752.15	900071	101631.14
Private room	1106	7547.00	12403.01	689471	24244.49
Shared room	921	3479.00	6052.31	99018	8679.72

minimum_nights

最短宿泊日数
ほとんどの宿が1日もしくは2日

print(f"[tr] len: {len(train_df)}, nunique: {train_df['minimum_nights'].nunique()}")
print(f"[te] len: {len(test_df)}, nunique: {test_df['minimum_nights'].nunique()}")

[tr] len: 9990, nunique: 30
[te] len: 4996, nunique: 26

display(
    pd.DataFrame({
        'train': train_df['minimum_nights'].value_counts() / len(train_df),
        'test': test_df['minimum_nights'].value_counts() / len(test_df)
    }).reset_index().sort_values('train', ascending=False)
)

	index	train	test
0	1	0.62	0.56
1	2	0.23	0.30
2	3	0.05	0.06
22	30	0.05	0.04
6	7	0.01	0.01
3	4	0.01	0.01
4	5	0.01	0.01
17	23	0.01	NaN
5	6	0.00	0.01
20	28	0.00	0.00
12	14	0.00	0.00
23	31	0.00	0.00
10	12	0.00	0.00
13	15	0.00	0.00
9	10	0.00	0.00
21	29	0.00	0.00
11	13	0.00	0.00
19	25	0.00	NaN
15	20	0.00	0.00
16	21	0.00	0.00
8	9	0.00	NaN
24	32	0.00	0.00
18	24	0.00	NaN
14	16	0.00	NaN
7	8	0.00	0.00
26	35	0.00	NaN
27	60	0.00	0.00
30	100	0.00	NaN
32	360	0.00	NaN
33	365	0.00	0.00
25	33	NaN	0.00
28	90	NaN	0.00
29	99	NaN	0.00
31	180	NaN	0.00

最短宿泊日数が100日以上の宿を表示
最短宿泊日数が年間の宿泊可能日数より多いので異常値かもしれない

display(train_df.loc[train_df['minimum_nights']>=100, :])

	id	name	host_id	neighbourhood	latitude	longitude	room_type	minimum_nights	number_of_reviews	last_review	reviews_per_month	availability_365	y
2103	2104	EarlyCheckIn/NewAprt./Legal/ShibuyaSta.8mins/WiFi	229764247	Shibuya Ku	35.65	139.71	Entire home/apt	100	62	2020-01-01	3.26	89	25086
4359	4360	0. ENTIRE APARTMENT 450ft 2LDK NEAR SHIBUYA/YO...	46674978	Nakano Ku	35.71	139.69	Entire home/apt	365	1	2019-08-05	0.11	0	15107
4738	4739	Etchujima - spacious, luxurious, family friendly	278445388	Koto Ku	35.67	139.80	Entire home/apt	360	3	2019-02-23	0.14	301	9945

最短宿泊日数が年間の宿泊可能日数より多いレコードを表示
1486レコードと全体の約15％となる

display(train_df.loc[train_df['minimum_nights']>train_df['availability_365'], :])

	id	name	host_id	neighbourhood	latitude	longitude	room_type	minimum_nights	number_of_reviews	last_review	reviews_per_month	availability_365	y
9	10	【2/F】Asakusabashi Building \| for 6-9 ppl	260891041	Taito Ku	35.70	139.79	Entire home/apt	2	74	2020-01-26	1.48	0	24826
13	14	QY52 JR Train station 5-min walk, airport dire...	25523994	Taito Ku	35.70	139.78	Entire home/apt	1	0	NaN	NaN	0	8208
30	31	VIP新宿/Higashi Shinjuku station 3min/Bed size 1.8m	38092840	Shinjuku Ku	35.70	139.71	Entire home/apt	2	24	2020-01-20	2.38	0	8799
34	35	Can enjoy cosplay ramen shop☆Ikebukuro8min☆AS485	38975201	Toshima Ku	35.73	139.70	Entire home/apt	2	61	2020-02-25	1.61	0	12259
40	41	Uhome Koiwa Apartment2, 503, 1mn walk from sta...	165921683	Edogawa Ku	35.74	139.89	Entire home/apt	1	6	2019-11-12	0.66	0	99990
...	...	...	...	...	...	...	...	...	...	...	...	...	...
9944	9945	New open ! Shinjuku 東新宿､歌舞伎町､徒歩4分 1204	191116680	Shinjuku Ku	35.70	139.71	Entire home/apt	1	26	2019-11-28	1.62	0	19617
9961	9962	My apartment is comfortable Located a 4-minute...	137060281	Shinjuku Ku	35.70	139.70	Private room	1	106	2020-03-10	2.56	0	5423
9975	9976	山手線池袋3分鐘新宿8分鐘目白清新公寓	49661789	Toshima Ku	35.72	139.71	Entire home/apt	1	0	NaN	NaN	0	8644
9977	9978	匯家 KAIKE 浅草寺 Sensoji 10分可达双人背包客短期促销开放3	213268738	Taito Ku	35.72	139.80	Private room	3	0	NaN	NaN	0	3439
9988	9989	SHITARA HOUSE dormitory B	316273494	Katsushika Ku	35.75	139.83	Shared room	1	1	2020-01-01	0.25	0	3950

1486 rows × 13 columns

cond = train_df['minimum_nights'] < 50

plt.figure(figsize=(16, 8))
plt.scatter(train_df.loc[cond, 'minimum_nights'], train_df.loc[cond, 'y'])
plt.show()

number_of_reviews

train_df['number_of_reviews'].hist()
test_df['number_of_reviews'].hist();

print(f"[tr] len: {len(train_df)}, nunique: {train_df['number_of_reviews'].nunique()}")
print(f"[te] len: {len(test_df)}, nunique: {test_df['number_of_reviews'].nunique()}")

[tr] len: 9990, nunique: 261
[te] len: 4996, nunique: 225

# min()/max() skip missing values by default, so no explicit null mask is needed.
print('[tr] min:', train_df['number_of_reviews'].min())
print('[tr] max:', train_df['number_of_reviews'].max())
print('[te] min:', test_df['number_of_reviews'].min())
print('[te] max:', test_df['number_of_reviews'].max())

[tr] min: 0
[tr] max: 529
[te] min: 0
[te] max: 407

plt.figure(figsize=(16, 8))
plt.scatter(train_df['number_of_reviews'], train_df['y'])
plt.show()

last_review

print('[tr] min:', train_df.loc[~train_df['last_review'].isnull(), 'last_review'].min())
print('[tr] max:', train_df.loc[~train_df['last_review'].isnull(), 'last_review'].max())
print('[te] min:', test_df.loc[~test_df['last_review'].isnull(), 'last_review'].min())
print('[te] max:', test_df.loc[~test_df['last_review'].isnull(), 'last_review'].max())

[tr] min: 2016-11-19
[tr] max: 2020-04-29
[te] min: 2015-05-25
[te] max: 2020-04-28

reviews_per_monthとlast_reviewで欠損はすべて同じレコードで発生している

# Null masks for the two review-related columns in the training set.
tr_rpm_null = train_df['reviews_per_month'].isnull()
tr_last_null = train_df['last_review'].isnull()

print(tr_rpm_null.sum())
print(tr_last_null.sum())
# Number of rows where at least one of the two columns is missing. This equals
# both counts above only when the missing values sit on exactly the same rows.
# (Counting last_review nulls inside the union always reproduces the
# last_review null count, so it cannot detect a mismatch.)
print((tr_rpm_null | tr_last_null).sum())

1699
1699
1699

# Null masks for the two review-related columns in the test set.
te_rpm_null = test_df['reviews_per_month'].isnull()
te_last_null = test_df['last_review'].isnull()

print(te_rpm_null.sum())
print(te_last_null.sum())
# Number of rows where at least one of the two columns is missing. This equals
# both counts above only when the missing values sit on exactly the same rows.
# (Counting last_review nulls inside the union always reproduces the
# last_review null count, so it cannot detect a mismatch.)
print((te_rpm_null | te_last_null).sum())

832
832
832

最終レビューが2020-04-30から数えて何日前か

def get_last_review_days(df, base_date=datetime.date(2020, 4, 30)):
    """Add a ``last_review_days`` column counting days from ``last_review`` to ``base_date``.

    Args:
        df: DataFrame with a ``last_review`` column that ``pd.to_datetime``
            can parse. The new column is written into this frame in place.
        base_date: Date the elapsed days are measured against. Defaults to
            2020-04-30, the reference date used throughout this notebook.

    Returns:
        The same ``df`` object, now carrying ``last_review_days``. Rows whose
        ``last_review`` is missing get NaN instead of raising.
    """
    # Work on datetime64 values rather than Python ``date`` objects: subtracting
    # dates yields an object-dtype Series, and casting that through
    # 'timedelta64[D]' is rejected by recent pandas versions.
    # normalize() drops any time-of-day part so whole calendar days are counted.
    review_dates = pd.to_datetime(df['last_review']).dt.normalize()
    elapsed = pd.Timestamp(base_date) - review_dates

    df['last_review_days'] = elapsed.dt.days

    return df

# Keep only rows that have a last_review date, and pass explicit copies:
# get_last_review_days assigns a new column to the frame it receives, and
# writing into a bare .loc selection raises SettingWithCopyWarning (which the
# global warnings filter would otherwise hide).
train_tmp_df = get_last_review_days(train_df.loc[~train_df['last_review'].isnull(), :].copy())
test_tmp_df = get_last_review_days(test_df.loc[~test_df['last_review'].isnull(), :].copy())

train_tmp_df['last_review_days'].hist()
test_tmp_df['last_review_days'].hist();

print('[tr] min:', train_tmp_df['last_review_days'].min())
print('[tr] max:', train_tmp_df['last_review_days'].max())
print('[te] min:', test_tmp_df['last_review_days'].min())
print('[te] max:', test_tmp_df['last_review_days'].max())

[tr] min: 1
[tr] max: 1258
[te] min: 2
[te] max: 1802

plt.figure(figsize=(16, 8))
plt.scatter(train_tmp_df['last_review_days'], train_tmp_df['y'])
plt.show()

reviews_per_month

train_df['reviews_per_month'].hist()
test_df['reviews_per_month'].hist();

print(f"[tr] len: {len(train_df)}, nunique: {train_df['reviews_per_month'].nunique()}")
print(f"[te] len: {len(test_df)}, nunique: {test_df['reviews_per_month'].nunique()}")

[tr] len: 9990, nunique: 594
[te] len: 4996, nunique: 501

print('[tr] min:', train_df['reviews_per_month'].min())
print('[tr] max:', train_df['reviews_per_month'].max())
print('[te] min:', test_df['reviews_per_month'].min())
print('[te] max:', test_df['reviews_per_month'].max())

[tr] min: 0.02
[tr] max: 43.6
[te] min: 0.03
[te] max: 19.7

plt.figure(figsize=(16, 8))
plt.scatter(train_df['reviews_per_month'], train_df['y'])
plt.show()

availability_365

非常にいびつな形状をしている
0〜365日まである

train_df['availability_365'].hist()
test_df['availability_365'].hist();

print(f"[tr] len: {len(train_df)}, nunique: {train_df['availability_365'].nunique()}")
print(f"[te] len: {len(test_df)}, nunique: {test_df['availability_365'].nunique()}")

[tr] len: 9990, nunique: 366
[te] len: 4996, nunique: 364

print('[tr] min:', train_df['availability_365'].min())
print('[tr] max:', train_df['availability_365'].max())
print('[te] min:', test_df['availability_365'].min())
print('[te] max:', test_df['availability_365'].max())

[tr] min: 0
[tr] max: 365
[te] min: 0
[te] max: 365

plt.figure(figsize=(16, 8))
plt.scatter(train_df['availability_365'], train_df['y'])
plt.show()

y

対数を取れば比較的きれいな正規分布の形状となる

train_df['y'].hist();

np.log(train_df['y']).hist();

EDA

EDA

train_df, test_df

check null

check duplicated

name

host_id

neighbourhood

room_type

minimum_nights

number_of_reviews

last_review

reviews_per_month

availability_365

y

添付データ

new user