maruyama
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
# Read the train and test splits from the local rawdata directory.
train, test = map(
    pd.read_csv,
    ("rawdata/train_data.csv", "rawdata/test_data.csv"),
)
# Summary statistics of the train split, transposed so that each input
# column becomes a row and each statistic becomes a column.
train.describe().transpose()
| | count | mean | std | min | 25% | 50% | 75% | max |
---|---|---|---|---|---|---|---|---|
id | 21000.0 | 10499.500000 | 6062.322162 | 0.00000 | 5249.750000 | 10499.500000 | 15749.250000 | 20999.000000 |
position | 21000.0 | 1.226857 | 1.224682 | 0.00000 | 0.000000 | 1.000000 | 2.000000 | 4.000000 |
age | 21000.0 | 33.132476 | 10.715241 | 18.00000 | 24.000000 | 30.000000 | 42.000000 | 67.000000 |
sex | 21000.0 | 1.498333 | 0.500009 | 1.00000 | 1.000000 | 1.000000 | 2.000000 | 2.000000 |
partner | 21000.0 | 0.499333 | 0.500011 | 0.00000 | 0.000000 | 0.000000 | 1.000000 | 1.000000 |
num_child | 21000.0 | 0.999667 | 1.417459 | 0.00000 | 0.000000 | 0.000000 | 2.000000 | 9.000000 |
education | 21000.0 | 1.098571 | 1.116355 | 0.00000 | 0.000000 | 1.000000 | 2.000000 | 4.000000 |
service_length | 21000.0 | 12.303143 | 10.696823 | 0.00000 | 3.000000 | 9.000000 | 21.000000 | 49.000000 |
study_time | 21000.0 | 3.828476 | 3.312927 | 0.00000 | 1.000000 | 3.000000 | 6.000000 | 24.000000 |
commute | 21000.0 | 1.059910 | 0.665307 | 0.10000 | 0.500000 | 1.100000 | 1.500000 | 4.800000 |
overtime | 21000.0 | 12.126752 | 5.509408 | 0.00000 | 8.300000 | 12.100000 | 15.800000 | 31.900000 |
salary | 21000.0 | 361.170391 | 171.618501 | 110.62231 | 225.498117 | 315.224583 | 456.927443 | 1098.943632 |
# Summary statistics of the test split, one row per input column, laid out
# the same way as the train summary so the two can be compared side by side.
test.describe().transpose()
| | count | mean | std | min | 25% | 50% | 75% | max |
---|---|---|---|---|---|---|---|---|
id | 9000.0 | 4499.500000 | 2598.220545 | 0.0 | 2249.75 | 4499.5 | 6749.25 | 8999.0 |
position | 9000.0 | 1.188000 | 1.208091 | 0.0 | 0.00 | 1.0 | 2.00 | 4.0 |
age | 9000.0 | 32.937778 | 10.704380 | 18.0 | 24.00 | 29.0 | 42.00 | 65.0 |
sex | 9000.0 | 1.488889 | 0.499904 | 1.0 | 1.00 | 1.0 | 2.00 | 2.0 |
partner | 9000.0 | 0.503667 | 0.500014 | 0.0 | 0.00 | 1.0 | 1.00 | 1.0 |
num_child | 9000.0 | 1.001222 | 1.405820 | 0.0 | 0.00 | 0.0 | 2.00 | 8.0 |
education | 9000.0 | 1.089444 | 1.112866 | 0.0 | 0.00 | 1.0 | 2.00 | 4.0 |
service_length | 9000.0 | 12.141333 | 10.698280 | 0.0 | 3.00 | 8.0 | 21.00 | 47.0 |
study_time | 9000.0 | 3.646333 | 3.290426 | 0.0 | 1.00 | 3.0 | 6.00 | 21.0 |
commute | 9000.0 | 1.051700 | 0.656505 | 0.1 | 0.50 | 1.1 | 1.50 | 4.8 |
overtime | 9000.0 | 7.301389 | 6.002899 | 0.0 | 2.00 | 6.5 | 11.50 | 29.5 |
# Overlay the `overtime` distributions of the train and test splits on a
# shared set of bin edges so the two histograms are directly comparable.
#
# np.linspace takes the number of EDGES, not bins: 33 edges over [0, 32]
# give 32 bins of width exactly 1. (31 edges would give 30 bins of width
# 32/30 ~= 1.067, whose boundaries drift against regularly spaced values and
# make neighbouring bins cover unequal numbers of possible values.)
# NOTE(review): overtime looks like it is recorded on a fixed decimal grid,
# judging by the describe() output -- confirm against the raw data.
# The upper edge of 32 is hard-coded; values above it would be dropped from
# the plot, so revisit it if the data range changes.
bins = np.linspace(0, 32, 33)
train["overtime"].hist(bins=bins, alpha=0.5, label="train")
test["overtime"].hist(bins=bins, alpha=0.5, label="test")
# The y axis shows raw counts, so the two splits are not normalised against
# each other; differing heights partly reflect differing row counts.
plt.xlabel("overtime")
plt.ylabel("count")
plt.legend()
plt.show()