yshr10ic
train_data.csv と test_data.csv を用いて簡単なEDAを実施してみました
!pip install matplotlib-venn > /dev/null
import datetime
import matplotlib.pyplot as plt
from matplotlib_venn import venn2
import numpy as np
import os
import pandas as pd
import seaborn as sns
pd.options.display.float_format = '{:.2f}'.format
import warnings
warnings.simplefilter('ignore')
# Directory that holds the CSV files used in this analysis.
data_dir = '../data/datasets'

# Read all four tables in one pass; the tuple order decides which file
# ends up in which variable.
train_df, test_df, station_df, submit_df = (
    pd.read_csv(os.path.join(data_dir, csv_name))
    for csv_name in (
        'train_data.csv',
        'test_data.csv',
        'station_list.csv',
        'submission.csv',
    )
)

# Report (rows, columns) of every table that was just loaded.
for label, frame in (
    ('train_df', train_df),
    ('test_df', test_df),
    ('station_df', station_df),
    ('submit_df', submit_df),
):
    print(f'{label}.shape={frame.shape}')
train_df.shape=(9990, 13) test_df.shape=(4996, 12) station_df.shape=(203, 3) submit_df.shape=(4996, 2)
display(train_df.head())
id | name | host_id | neighbourhood | latitude | longitude | room_type | minimum_nights | number_of_reviews | last_review | reviews_per_month | availability_365 | y | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | KiyosumiShirakawa 3min|★SkyTree★|WIFI|Max4|Tre... | 242899459 | Koto Ku | 35.68 | 139.80 | Entire home/apt | 1 | 55 | 2020-04-25 | 2.21 | 173 | 12008 |
1 | 2 | Downtown Tokyo Iriya next to Ueno | 308879948 | Taito Ku | 35.72 | 139.79 | Entire home/apt | 6 | 72 | 2020-03-25 | 2.11 | 9 | 6667 |
2 | 3 | Japan Style,Private,Affordable,4min to Sta. | 300877823 | Katsushika Ku | 35.75 | 139.82 | Entire home/apt | 1 | 18 | 2020-03-23 | 3.46 | 288 | 9923 |
3 | 4 | 4 min to Shinjuku Sta. by train / 2 ppl / Wi-fi | 236935461 | Shibuya Ku | 35.68 | 139.68 | Entire home/apt | 1 | 2 | 2020-04-02 | 1.76 | 87 | 8109 |
4 | 5 | LICENSED SHINJUKU HOUSE: Heart of the action! | 243408889 | Shinjuku Ku | 35.70 | 139.70 | Entire home/apt | 1 | 86 | 2020-01-30 | 2.00 | 156 | 100390 |
display(train_df.describe())
id | host_id | latitude | longitude | minimum_nights | number_of_reviews | reviews_per_month | availability_365 | y | |
---|---|---|---|---|---|---|---|---|---|
count | 9990.00 | 9990.00 | 9990.00 | 9990.00 | 9990.00 | 9990.00 | 8291.00 | 9990.00 | 9990.00 |
mean | 4995.50 | 174101304.02 | 35.70 | 139.74 | 3.33 | 25.94 | 1.70 | 157.20 | 25104.64 |
std | 2884.01 | 98387863.86 | 0.04 | 0.05 | 8.50 | 40.59 | 1.40 | 115.78 | 67049.85 |
min | 1.00 | 151977.00 | 35.54 | 139.58 | 1.00 | 0.00 | 0.02 | 0.00 | 921.00 |
25% | 2498.25 | 78803503.00 | 35.69 | 139.70 | 1.00 | 2.00 | 0.68 | 63.00 | 6960.75 |
50% | 4995.50 | 188123234.00 | 35.70 | 139.73 | 1.00 | 11.00 | 1.48 | 154.00 | 11892.50 |
75% | 7492.75 | 252666139.00 | 35.73 | 139.79 | 2.00 | 33.00 | 2.42 | 248.00 | 20085.00 |
max | 9990.00 | 344270090.00 | 35.82 | 139.91 | 365.00 | 529.00 | 43.60 | 365.00 | 1000103.00 |
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in display() only appends a stray "None" to the cell output.
train_df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 9990 entries, 0 to 9989 Data columns (total 13 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 id 9990 non-null int64 1 name 9990 non-null object 2 host_id 9990 non-null int64 3 neighbourhood 9990 non-null object 4 latitude 9990 non-null float64 5 longitude 9990 non-null float64 6 room_type 9990 non-null object 7 minimum_nights 9990 non-null int64 8 number_of_reviews 9990 non-null int64 9 last_review 8291 non-null object 10 reviews_per_month 8291 non-null float64 11 availability_365 9990 non-null int64 12 y 9990 non-null int64 dtypes: float64(3), int64(6), object(4) memory usage: 1014.7+ KB
None
display(test_df.head())
id | name | host_id | neighbourhood | latitude | longitude | room_type | minimum_nights | number_of_reviews | last_review | reviews_per_month | availability_365 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 5-minute walk from Akasaka Sta, Superior double | 184730720 | Minato Ku | 35.67 | 139.73 | Private room | 1 | 0 | NaN | NaN | 183 |
1 | 2 | 7 min Sta.-Center of IKEBUKURO Cozy Room#503 | 20993205 | Toshima Ku | 35.73 | 139.72 | Entire home/apt | 2 | 21 | 2020-04-16 | 1.94 | 337 |
2 | 3 | Designer'sApt 1min sta☆Shinjuku 7min☆Shibuya 4min | 322521715 | Setagaya Ku | 35.66 | 139.67 | Entire home/apt | 1 | 14 | 2020-02-12 | 0.82 | 240 |
3 | 4 | Komagome Station 2 minutes on foot | 234477095 | Toshima Ku | 35.74 | 139.75 | Entire home/apt | 1 | 16 | 2020-02-17 | 1.19 | 0 |
4 | 5 | Monthly/Metro1min/JR5min/Ueno,Asakusa,Akihabara | 145453833 | Taito Ku | 35.72 | 139.78 | Entire home/apt | 30 | 2 | 2019-07-21 | 0.19 | 164 |
display(test_df.describe())
id | host_id | latitude | longitude | minimum_nights | number_of_reviews | reviews_per_month | availability_365 | |
---|---|---|---|---|---|---|---|---|
count | 4996.00 | 4996.00 | 4996.00 | 4996.00 | 4996.00 | 4996.00 | 4164.00 | 4996.00 |
mean | 2498.50 | 164873199.49 | 35.70 | 139.74 | 3.20 | 28.01 | 1.72 | 153.02 |
std | 1442.37 | 104265915.85 | 0.04 | 0.06 | 10.35 | 41.05 | 1.31 | 115.44 |
min | 1.00 | 771694.00 | 35.54 | 139.59 | 1.00 | 0.00 | 0.03 | 0.00 |
25% | 1249.75 | 58261300.00 | 35.69 | 139.70 | 1.00 | 2.00 | 0.75 | 62.00 |
50% | 2498.50 | 172626015.50 | 35.70 | 139.73 | 1.00 | 13.00 | 1.56 | 141.00 |
75% | 3747.25 | 256411204.00 | 35.72 | 139.79 | 2.00 | 36.00 | 2.41 | 246.00 |
max | 4996.00 | 344043769.00 | 35.80 | 139.91 | 365.00 | 407.00 | 19.70 | 365.00 |
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in display() only appends a stray "None" to the cell output.
test_df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 4996 entries, 0 to 4995 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 id 4996 non-null int64 1 name 4996 non-null object 2 host_id 4996 non-null int64 3 neighbourhood 4996 non-null object 4 latitude 4996 non-null float64 5 longitude 4996 non-null float64 6 room_type 4996 non-null object 7 minimum_nights 4996 non-null int64 8 number_of_reviews 4996 non-null int64 9 last_review 4164 non-null object 10 reviews_per_month 4164 non-null float64 11 availability_365 4996 non-null int64 dtypes: float64(3), int64(5), object(4) memory usage: 468.5+ KB
None
display(station_df.head())
station_name | longitude | latitude | |
---|---|---|---|
0 | 東京 | 139.77 | 35.68 |
1 | 新橋 | 139.76 | 35.67 |
2 | 品川 | 139.74 | 35.63 |
3 | 大崎 | 139.73 | 35.62 |
4 | 五反田 | 139.72 | 35.63 |
display(station_df.describe())
longitude | latitude | |
---|---|---|
count | 203.00 | 203.00 |
mean | 139.74 | 35.69 |
std | 0.05 | 0.04 |
min | 139.61 | 35.54 |
25% | 139.70 | 35.67 |
50% | 139.74 | 35.69 |
75% | 139.77 | 35.73 |
max | 139.87 | 35.78 |
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in display() only appends a stray "None" to the cell output.
station_df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 203 entries, 0 to 202 Data columns (total 3 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 station_name 203 non-null object 1 longitude 203 non-null float64 2 latitude 203 non-null float64 dtypes: float64(2), object(1) memory usage: 4.9+ KB
None
last_review と reviews_per_month で欠損値がある
display(
pd.DataFrame({
'train': train_df.isnull().sum(),
'test': test_df.isnull().sum()
}).reset_index()
)
index | train | test | |
---|---|---|---|
0 | availability_365 | 0 | 0.00 |
1 | host_id | 0 | 0.00 |
2 | id | 0 | 0.00 |
3 | last_review | 1699 | 832.00 |
4 | latitude | 0 | 0.00 |
5 | longitude | 0 | 0.00 |
6 | minimum_nights | 0 | 0.00 |
7 | name | 0 | 0.00 |
8 | neighbourhood | 0 | 0.00 |
9 | number_of_reviews | 0 | 0.00 |
10 | reviews_per_month | 1699 | 832.00 |
11 | room_type | 0 | 0.00 |
12 | y | 0 | NaN |
# Count fully duplicated records within each frame.
# NOTE(review): `id` looks like a per-row identifier (1..N in the summaries);
# if so, leaving it in makes every row distinct and the count is always 0.
# It is therefore excluded from the comparison -- confirm `id` is unique.
print(f"[tr] {train_df.drop(columns=['id']).duplicated().sum()}")
print(f"[te] {test_df.drop(columns=['id']).duplicated().sum()}")
[tr] 0 [te] 0
# Stack train and test (without the target) to look for records that occur
# in both sets.
all_df = pd.concat([train_df, test_df])
all_df.drop(columns=['y'], inplace=True)
# NOTE(review): `id` is numbered independently in each file, so it is left
# out of the comparison; with it included, a listing present in both files
# would only match if it happened to get the same id -- confirm against the
# data description. `all_df` itself keeps the `id` column.
print(f"{all_df.drop(columns=['id']).duplicated().sum()}")
0
train と test でも重複あり
print(f"[tr] len: {len(train_df)}, nunique: {train_df['name'].nunique()}")
print(f"[te] len: {len(test_df)}, nunique: {test_df['name'].nunique()}")
[tr] len: 9990, nunique: 9114 [te] len: 4996, nunique: 4754
venn2([set(train_df['name']), set(test_df['name'])], set_labels=('train', 'test'));
train と test では重複はない
print(f"[tr] len: {len(train_df)}, nunique: {train_df['host_id'].nunique()}")
print(f"[te] len: {len(test_df)}, nunique: {test_df['host_id'].nunique()}")
[tr] len: 9990, nunique: 2325 [te] len: 4996, nunique: 1254
venn2([set(train_df['host_id']), set(test_df['host_id'])], set_labels=('train', 'test'));
train と test で同じ
train と test でレコード数の順番に大きな差はないが、test の方が新宿区の割合が高くなっている
print(f"[tr] len: {len(train_df)}, nunique: {train_df['neighbourhood'].nunique()}")
print(f"[te] len: {len(test_df)}, nunique: {test_df['neighbourhood'].nunique()}")
[tr] len: 9990, nunique: 23 [te] len: 4996, nunique: 23
venn2([set(train_df['neighbourhood']), set(test_df['neighbourhood'])], set_labels=('train', 'test'));
display(
pd.DataFrame({
'train': train_df['neighbourhood'].value_counts() / len(train_df),
'test': test_df['neighbourhood'].value_counts() / len(test_df)
}).reset_index().sort_values('train', ascending=False)
)
index | train | test | |
---|---|---|---|
18 | Shinjuku Ku | 0.17 | 0.21 |
21 | Taito Ku | 0.16 | 0.11 |
22 | Toshima Ku | 0.11 | 0.10 |
20 | Sumida Ku | 0.08 | 0.11 |
16 | Shibuya Ku | 0.07 | 0.08 |
4 | Chuo Ku | 0.05 | 0.03 |
11 | Minato Ku | 0.04 | 0.04 |
14 | Ota Ku | 0.03 | 0.04 |
12 | Nakano Ku | 0.03 | 0.03 |
1 | Arakawa Ku | 0.03 | 0.02 |
7 | Katsushika Ku | 0.03 | 0.03 |
15 | Setagaya Ku | 0.03 | 0.02 |
8 | Kita Ku | 0.02 | 0.03 |
6 | Itabashi Ku | 0.02 | 0.02 |
19 | Suginami Ku | 0.02 | 0.02 |
9 | Koto Ku | 0.02 | 0.01 |
3 | Chiyoda Ku | 0.02 | 0.01 |
5 | Edogawa Ku | 0.02 | 0.02 |
17 | Shinagawa Ku | 0.02 | 0.02 |
0 | Adachi Ku | 0.01 | 0.01 |
2 | Bunkyo Ku | 0.01 | 0.01 |
13 | Nerima Ku | 0.01 | 0.01 |
10 | Meguro Ku | 0.00 | 0.01 |
plt.figure(figsize=(25, 6))
sns.boxplot(x=train_df['neighbourhood'], y=train_df['y'])
plt.show()
cond = train_df['y'] <= 50000
plt.figure(figsize=(25, 6))
sns.boxplot(x=train_df.loc[cond, 'neighbourhood'], y=train_df.loc[cond, 'y'])
plt.show()
train_df[['neighbourhood', 'y']].groupby('neighbourhood').agg(['min', 'median', 'mean', 'max', 'std'])
y | |||||
---|---|---|---|---|---|
min | median | mean | max | std | |
neighbourhood | |||||
Adachi Ku | 1138 | 5992.50 | 9229.71 | 113899 | 13974.48 |
Arakawa Ku | 2375 | 8155.00 | 29292.74 | 300076 | 66255.71 |
Bunkyo Ku | 2821 | 14899.50 | 27752.41 | 100023 | 23440.60 |
Chiyoda Ku | 1583 | 11544.00 | 82929.30 | 900072 | 237491.24 |
Chuo Ku | 1999 | 19884.50 | 42940.40 | 100179 | 42685.88 |
Edogawa Ku | 1275 | 9934.00 | 54305.56 | 900077 | 167168.66 |
Itabashi Ku | 1106 | 6952.00 | 21884.25 | 300075 | 57985.75 |
Katsushika Ku | 1157 | 9485.00 | 25576.39 | 500058 | 61646.34 |
Kita Ku | 2007 | 9490.50 | 14327.90 | 149977 | 17997.69 |
Koto Ku | 1785 | 8992.00 | 15175.55 | 250056 | 25729.89 |
Meguro Ku | 2461 | 10906.00 | 14005.31 | 37970 | 8688.90 |
Minato Ku | 2683 | 14905.00 | 22049.52 | 999961 | 53838.95 |
Nakano Ku | 1520 | 11958.00 | 18938.62 | 200067 | 23215.39 |
Nerima Ku | 1176 | 5986.00 | 16594.46 | 500029 | 60913.03 |
Ota Ku | 2162 | 12065.00 | 15485.92 | 100045 | 15858.06 |
Setagaya Ku | 2502 | 8508.00 | 12588.77 | 79951 | 12077.33 |
Shibuya Ku | 2046 | 13984.00 | 26797.93 | 1000103 | 59916.99 |
Shinagawa Ku | 1483 | 11992.50 | 19234.70 | 149996 | 24353.24 |
Shinjuku Ku | 1388 | 11906.00 | 19001.67 | 900132 | 29691.65 |
Suginami Ku | 1944 | 7558.00 | 13412.81 | 100060 | 18831.11 |
Sumida Ku | 921 | 12688.00 | 23264.12 | 300434 | 33305.88 |
Taito Ku | 1374 | 15028.00 | 28606.30 | 900387 | 75333.16 |
Toshima Ku | 1808 | 9905.00 | 25977.99 | 983382 | 88482.36 |
train と test で割合もほとんど同じ
print(f"[tr] len: {len(train_df)}, nunique: {train_df['room_type'].nunique()}")
print(f"[te] len: {len(test_df)}, nunique: {test_df['room_type'].nunique()}")
[tr] len: 9990, nunique: 4 [te] len: 4996, nunique: 4
venn2([set(train_df['room_type']), set(test_df['room_type'])], set_labels=('train', 'test'));
display(
pd.DataFrame({
'train': train_df['room_type'].value_counts() / len(train_df),
'test': test_df['room_type'].value_counts() / len(test_df)
}).reset_index().sort_values('train', ascending=False)
)
index | train | test | |
---|---|---|---|
0 | Entire home/apt | 0.69 | 0.70 |
2 | Private room | 0.21 | 0.22 |
1 | Hotel room | 0.05 | 0.04 |
3 | Shared room | 0.05 | 0.04 |
plt.figure(figsize=(16, 6))
sns.boxplot(x=train_df['room_type'], y=train_df['y'])
plt.show()
cond = train_df['y'] <= 50000
plt.figure(figsize=(25, 6))
sns.boxplot(x=train_df.loc[cond, 'room_type'], y=train_df.loc[cond, 'y'])
plt.show()
train_df[['room_type', 'y']].groupby('room_type').agg(['min', 'median', 'mean', 'max', 'std'])
y | |||||
---|---|---|---|---|---|
min | median | mean | max | std | |
room_type | |||||
Entire home/apt | 1423 | 13869.50 | 29799.55 | 1000103 | 73629.92 |
Hotel room | 1374 | 10081.00 | 30752.15 | 900071 | 101631.14 |
Private room | 1106 | 7547.00 | 12403.01 | 689471 | 24244.49 |
Shared room | 921 | 3479.00 | 6052.31 | 99018 | 8679.72 |
print(f"[tr] len: {len(train_df)}, nunique: {train_df['minimum_nights'].nunique()}")
print(f"[te] len: {len(test_df)}, nunique: {test_df['minimum_nights'].nunique()}")
[tr] len: 9990, nunique: 30 [te] len: 4996, nunique: 26
display(
pd.DataFrame({
'train': train_df['minimum_nights'].value_counts() / len(train_df),
'test': test_df['minimum_nights'].value_counts() / len(test_df)
}).reset_index().sort_values('train', ascending=False)
)
index | train | test | |
---|---|---|---|
0 | 1 | 0.62 | 0.56 |
1 | 2 | 0.23 | 0.30 |
2 | 3 | 0.05 | 0.06 |
22 | 30 | 0.05 | 0.04 |
6 | 7 | 0.01 | 0.01 |
3 | 4 | 0.01 | 0.01 |
4 | 5 | 0.01 | 0.01 |
17 | 23 | 0.01 | NaN |
5 | 6 | 0.00 | 0.01 |
20 | 28 | 0.00 | 0.00 |
12 | 14 | 0.00 | 0.00 |
23 | 31 | 0.00 | 0.00 |
10 | 12 | 0.00 | 0.00 |
13 | 15 | 0.00 | 0.00 |
9 | 10 | 0.00 | 0.00 |
21 | 29 | 0.00 | 0.00 |
11 | 13 | 0.00 | 0.00 |
19 | 25 | 0.00 | NaN |
15 | 20 | 0.00 | 0.00 |
16 | 21 | 0.00 | 0.00 |
8 | 9 | 0.00 | NaN |
24 | 32 | 0.00 | 0.00 |
18 | 24 | 0.00 | NaN |
14 | 16 | 0.00 | NaN |
7 | 8 | 0.00 | 0.00 |
26 | 35 | 0.00 | NaN |
27 | 60 | 0.00 | 0.00 |
30 | 100 | 0.00 | NaN |
32 | 360 | 0.00 | NaN |
33 | 365 | 0.00 | 0.00 |
25 | 33 | NaN | 0.00 |
28 | 90 | NaN | 0.00 |
29 | 99 | NaN | 0.00 |
31 | 180 | NaN | 0.00 |
display(train_df.loc[train_df['minimum_nights']>=100, :])
id | name | host_id | neighbourhood | latitude | longitude | room_type | minimum_nights | number_of_reviews | last_review | reviews_per_month | availability_365 | y | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
2103 | 2104 | EarlyCheckIn/NewAprt./Legal/ShibuyaSta.8mins/WiFi | 229764247 | Shibuya Ku | 35.65 | 139.71 | Entire home/apt | 100 | 62 | 2020-01-01 | 3.26 | 89 | 25086 |
4359 | 4360 | 0. ENTIRE APARTMENT 450ft 2LDK NEAR SHIBUYA/YO... | 46674978 | Nakano Ku | 35.71 | 139.69 | Entire home/apt | 365 | 1 | 2019-08-05 | 0.11 | 0 | 15107 |
4738 | 4739 | Etchujima - spacious, luxurious, family friendly | 278445388 | Koto Ku | 35.67 | 139.80 | Entire home/apt | 360 | 3 | 2019-02-23 | 0.14 | 301 | 9945 |
display(train_df.loc[train_df['minimum_nights']>train_df['availability_365'], :])
id | name | host_id | neighbourhood | latitude | longitude | room_type | minimum_nights | number_of_reviews | last_review | reviews_per_month | availability_365 | y | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
9 | 10 | 【2/F】Asakusabashi Building | for 6-9 ppl | 260891041 | Taito Ku | 35.70 | 139.79 | Entire home/apt | 2 | 74 | 2020-01-26 | 1.48 | 0 | 24826 |
13 | 14 | QY52 JR Train station 5-min walk, airport dire... | 25523994 | Taito Ku | 35.70 | 139.78 | Entire home/apt | 1 | 0 | NaN | NaN | 0 | 8208 |
30 | 31 | VIP新宿/Higashi Shinjuku station 3min/Bed size 1.8m | 38092840 | Shinjuku Ku | 35.70 | 139.71 | Entire home/apt | 2 | 24 | 2020-01-20 | 2.38 | 0 | 8799 |
34 | 35 | Can enjoy cosplay ramen shop☆Ikebukuro8min☆AS485 | 38975201 | Toshima Ku | 35.73 | 139.70 | Entire home/apt | 2 | 61 | 2020-02-25 | 1.61 | 0 | 12259 |
40 | 41 | Uhome Koiwa Apartment2, 503, 1mn walk from sta... | 165921683 | Edogawa Ku | 35.74 | 139.89 | Entire home/apt | 1 | 6 | 2019-11-12 | 0.66 | 0 | 99990 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
9944 | 9945 | New open ! Shinjuku 東新宿、歌舞伎町、徒歩4分 1204 | 191116680 | Shinjuku Ku | 35.70 | 139.71 | Entire home/apt | 1 | 26 | 2019-11-28 | 1.62 | 0 | 19617 |
9961 | 9962 | My apartment is comfortable Located a 4-minute... | 137060281 | Shinjuku Ku | 35.70 | 139.70 | Private room | 1 | 106 | 2020-03-10 | 2.56 | 0 | 5423 |
9975 | 9976 | 山手線池袋3分鐘新宿8分鐘目白清新公寓 | 49661789 | Toshima Ku | 35.72 | 139.71 | Entire home/apt | 1 | 0 | NaN | NaN | 0 | 8644 |
9977 | 9978 | 匯家 KAIKE 浅草寺 Sensoji 10分可达 双人 背包客 短期促销开放3 | 213268738 | Taito Ku | 35.72 | 139.80 | Private room | 3 | 0 | NaN | NaN | 0 | 3439 |
9988 | 9989 | SHITARA HOUSE dormitory B | 316273494 | Katsushika Ku | 35.75 | 139.83 | Shared room | 1 | 1 | 2020-01-01 | 0.25 | 0 | 3950 |
1486 rows × 13 columns
cond = train_df['minimum_nights'] < 50
plt.figure(figsize=(16, 8))
plt.scatter(train_df.loc[cond, 'minimum_nights'], train_df.loc[cond, 'y'])
plt.show()
train_df['number_of_reviews'].hist()
test_df['number_of_reviews'].hist();
print(f"[tr] len: {len(train_df)}, nunique: {train_df['number_of_reviews'].nunique()}")
print(f"[te] len: {len(test_df)}, nunique: {test_df['number_of_reviews'].nunique()}")
[tr] len: 9990, nunique: 261 [te] len: 4996, nunique: 225
# Smallest and largest review count in each data set, ignoring missing values.
for tag, frame in (('[tr]', train_df), ('[te]', test_df)):
    review_counts = frame['number_of_reviews'].dropna()
    print(f'{tag} min:', review_counts.min())
    print(f'{tag} max:', review_counts.max())
[tr] min: 0 [tr] max: 529 [te] min: 0 [te] max: 407
plt.figure(figsize=(16, 8))
plt.scatter(train_df['number_of_reviews'], train_df['y'])
plt.show()
# Earliest and latest `last_review` value in each data set. Missing entries
# are dropped first so only the actual date values are compared.
for tag, frame in (('[tr]', train_df), ('[te]', test_df)):
    review_dates = frame['last_review'].dropna()
    print(f'{tag} min:', review_dates.min())
    print(f'{tag} max:', review_dates.max())
[tr] min: 2016-11-19 [tr] max: 2020-04-29 [te] min: 2015-05-25 [te] max: 2020-04-28
reviews_per_month と last_review で欠損はすべて同じレコードで発生している
print(train_df['reviews_per_month'].isnull().sum())
print(train_df['last_review'].isnull().sum())
# Count rows where BOTH columns are missing. If this equals each per-column
# count, the gaps sit on exactly the same records.
# (Selecting rows with `|` and then counting `last_review` nulls would just
# repeat the `last_review` count, whatever the overlap is.)
print((train_df['reviews_per_month'].isnull() & train_df['last_review'].isnull()).sum())
1699 1699 1699
# Missing-value counts per column in the test set ...
print(test_df['reviews_per_month'].isnull().sum())
print(test_df['last_review'].isnull().sum())
# ... and the number of rows where BOTH are missing. All three numbers being
# equal means the gaps sit on exactly the same records.
# (Selecting rows with `|` and then counting `last_review` nulls would just
# repeat the `last_review` count, whatever the overlap is.)
print((test_df['reviews_per_month'].isnull() & test_df['last_review'].isnull()).sum())
832 832 832
2020-04-30
# How many days ago the last review was, counting back from 2020-04-30.
def get_last_review_days(df, base_date='2020-04-30'):
    """Add a ``last_review_days`` column holding days elapsed since the last review.

    Args:
        df: DataFrame with a ``last_review`` column that ``pd.to_datetime``
            can parse.
        base_date: Reference date the days are counted back from. Defaults
            to 2020-04-30, the date this function originally hard-coded.

    Returns:
        The same ``df`` object, with ``last_review_days`` added in place.
        The column is integer-valued when every ``last_review`` is present;
        rows with a missing ``last_review`` get NaN instead of raising.
    """
    # Compare calendar dates only: drop any time-of-day on both sides.
    reviewed_on = pd.to_datetime(df['last_review']).dt.normalize()
    reference = pd.Timestamp(base_date).normalize()
    # `.dt.days` replaces the former `astype('timedelta64[D]')` cast, which
    # pandas >= 2.0 rejects (only s/ms/us/ns resolutions are supported).
    df['last_review_days'] = (reference - reviewed_on).dt.days
    return df
# Keep only rows that have a last review and hand explicit copies to
# get_last_review_days(): it adds a column in place, and writing into a
# .loc selection of train_df/test_df is the SettingWithCopy hazard.
train_tmp_df = get_last_review_days(train_df.loc[~train_df['last_review'].isnull(), :].copy())
test_tmp_df = get_last_review_days(test_df.loc[~test_df['last_review'].isnull(), :].copy())
train_tmp_df['last_review_days'].hist()
test_tmp_df['last_review_days'].hist();
print('[tr] min:', train_tmp_df['last_review_days'].min())
print('[tr] max:', train_tmp_df['last_review_days'].max())
print('[te] min:', test_tmp_df['last_review_days'].min())
print('[te] max:', test_tmp_df['last_review_days'].max())
[tr] min: 1 [tr] max: 1258 [te] min: 2 [te] max: 1802
plt.figure(figsize=(16, 8))
plt.scatter(train_tmp_df['last_review_days'], train_tmp_df['y'])
plt.show()
train_df['reviews_per_month'].hist()
test_df['reviews_per_month'].hist();
print(f"[tr] len: {len(train_df)}, nunique: {train_df['reviews_per_month'].nunique()}")
print(f"[te] len: {len(test_df)}, nunique: {test_df['reviews_per_month'].nunique()}")
[tr] len: 9990, nunique: 594 [te] len: 4996, nunique: 501
print('[tr] min:', train_df['reviews_per_month'].min())
print('[tr] max:', train_df['reviews_per_month'].max())
print('[te] min:', test_df['reviews_per_month'].min())
print('[te] max:', test_df['reviews_per_month'].max())
[tr] min: 0.02 [tr] max: 43.6 [te] min: 0.03 [te] max: 19.7
plt.figure(figsize=(16, 8))
plt.scatter(train_df['reviews_per_month'], train_df['y'])
plt.show()
train_df['availability_365'].hist()
test_df['availability_365'].hist();
print(f"[tr] len: {len(train_df)}, nunique: {train_df['availability_365'].nunique()}")
print(f"[te] len: {len(test_df)}, nunique: {test_df['availability_365'].nunique()}")
[tr] len: 9990, nunique: 366 [te] len: 4996, nunique: 364
print('[tr] min:', train_df['availability_365'].min())
print('[tr] max:', train_df['availability_365'].max())
print('[te] min:', test_df['availability_365'].min())
print('[te] max:', test_df['availability_365'].max())
[tr] min: 0 [tr] max: 365 [te] min: 0 [te] max: 365
plt.figure(figsize=(16, 8))
plt.scatter(train_df['availability_365'], train_df['y'])
plt.show()
train_df['y'].hist();
np.log(train_df['y']).hist();