upura
import pandas as pd
import japanize_matplotlib
# Import from the public `IPython.display` module: importing `display` from
# `IPython.core.display` is deprecated since IPython 7.14.
from IPython.display import display, HTML

# Inject a CSS rule so elements with class `container` span the full width.
display(HTML("<style>.container { width:100% !important; }</style>"))
# Load the training and test tables from their CSV files.
train = pd.read_csv(filepath_or_buffer='../datasets/data/train_data.csv')
test = pd.read_csv(filepath_or_buffer='../datasets/data/test_data.csv')
# Preview the first five training rows (five is the default for `head`).
train.head(n=5)
| | id | position | age | area | sex | partner | num_child | education | service_length | study_time | commute | overtime | salary |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 1 | 44 | 愛知県 | 2 | 1 | 2 | 1 | 24 | 2.0 | 1.6 | 9.2 | 428.074887 |
1 | 1 | 2 | 31 | 奈良県 | 1 | 0 | 0 | 0 | 13 | 9.0 | 0.7 | 12.4 | 317.930517 |
2 | 2 | 2 | 36 | 山口県 | 1 | 0 | 0 | 2 | 14 | 4.0 | 0.4 | 16.9 | 357.350316 |
3 | 3 | 0 | 22 | 東京都 | 2 | 0 | 0 | 0 | 4 | 3.0 | 0.4 | 6.1 | 201.310911 |
4 | 4 | 0 | 25 | 鹿児島県 | 2 | 0 | 0 | 1 | 5 | 3.0 | 0.2 | 4.9 | 178.067475 |
# (rows, columns) of the training and test frames, shown as a pair.
(train.shape, test.shape)
((21000, 13), (9000, 12))
# Count missing values per column of the training frame.
train.isna().sum()
id 0 position 0 age 0 area 0 sex 0 partner 0 num_child 0 education 0 service_length 0 study_time 0 commute 0 overtime 0 salary 0 dtype: int64
# Count missing values per column of the test frame.
test.isna().sum()
id 0 position 0 age 0 area 0 sex 0 partner 0 num_child 0 education 0 service_length 0 study_time 0 commute 0 overtime 0 dtype: int64
# Bar chart of how often each `position` value occurs in the training data.
train['position'].value_counts().plot(kind='bar')
<matplotlib.axes._subplots.AxesSubplot at 0x7f54f46a15f8>
# Histogram of the `age` column (10 bins, the pandas default).
train['age'].hist(bins=10)
<matplotlib.axes._subplots.AxesSubplot at 0x7f54f45dd978>
# Bar chart of row counts per `area` value; a wide figure is requested
# via `figsize` so every bar label gets horizontal room.
train['area'].value_counts().plot(kind='bar', figsize=(20, 5))
<matplotlib.axes._subplots.AxesSubplot at 0x7f54f456b2e8>
# Bar chart of how often each `sex` value occurs in the training data.
train['sex'].value_counts().plot(kind='bar')
<matplotlib.axes._subplots.AxesSubplot at 0x7f54f44755c0>
# Bar chart of how often each `partner` value occurs in the training data.
train['partner'].value_counts().plot(kind='bar')
<matplotlib.axes._subplots.AxesSubplot at 0x7f54f4411f98>
# Bar chart of how often each `num_child` value occurs in the training data.
train['num_child'].value_counts().plot(kind='bar')
<matplotlib.axes._subplots.AxesSubplot at 0x7f54f43ebdd8>
# Bar chart of how often each `education` value occurs in the training data.
train['education'].value_counts().plot(kind='bar')
<matplotlib.axes._subplots.AxesSubplot at 0x7f54f4411c18>
# Histogram of the `service_length` column (10 bins, the pandas default).
train['service_length'].hist(bins=10)
<matplotlib.axes._subplots.AxesSubplot at 0x7f54f434f320>
# Histogram of the `study_time` column (10 bins, the pandas default).
train['study_time'].hist(bins=10)
<matplotlib.axes._subplots.AxesSubplot at 0x7f54f43b6320>
# Histogram of the `commute` column (10 bins, the pandas default).
train['commute'].hist(bins=10)
<matplotlib.axes._subplots.AxesSubplot at 0x7f54ecfe7240>
# Histogram of the `overtime` column (10 bins, the pandas default).
train['overtime'].hist(bins=10)
<matplotlib.axes._subplots.AxesSubplot at 0x7f54f4047d30>
# Histogram of the `salary` column (10 bins, the pandas default).
train['salary'].hist(bins=10)
<matplotlib.axes._subplots.AxesSubplot at 0x7f54ecec51d0>