hirayuki
[動作環境] Google colaboratory
※lightgbmのコードを書いたのは人生で2度目です。ところどころ初心者の書き方をしているかもしれないので、お手本というよりはご参考までに見ていただけると幸いです。
# Mount Google Drive so the files stored there can be reached from this notebook.
from google.colab import drive
drive.mount('/content/drive/')
# Adjust this path to match your own environment.
%cd /content/drive/My\ Drive/予測コンペ/ProbSpace/給与推定コンペ
# Install Optuna with pip before it is imported below.
!pip install optuna
import featuretools as ft
import lightgbm as lgb
import optuna
import numpy as np
import sklearn.datasets
import sklearn.metrics
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.metrics import log_loss
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
# Load the training data, then separate the target frame (id + salary)
# from the feature frame (everything except id and salary).
train = pd.read_csv("train_data.csv")
y = train[["id", "salary"]]
X = train.drop(columns=["salary", "id"])
print(f"train shape is {train.shape}")
print(f"target shape is {y.shape}")

# Load the test data; its "id" column is kept for the submission file.
test = pd.read_csv("test_data.csv")
print(f"test shape is {test.shape}")

# Column groups; create_features() appends the engineered columns to these lists.
cat_features = ["position", "sex", "partner", "education", "area"]
num_features = [
    "age",
    "num_child",
    "service_length",
    "study_time",
    "commute",
    "overtime",
]
# All of these features are ideas picked up from books and blog posts.
def create_features(df):
    """Add the engineered features to ``df`` in place and return it.

    The frame must contain the columns ``overtime``, ``area``, ``sex``,
    ``position``, ``education``, ``partner``, ``commute``, ``age``,
    ``service_length`` and ``study_time``.

    Side effects:
        * ``df`` itself is modified (callers rely on this; the return value
          is the same object).
        * The names of new categorical / numeric columns are appended to the
          module-level ``cat_features`` / ``num_features`` lists, once each.

    Returns:
        The same DataFrame object that was passed in.
    """

    def register(name, registry):
        # Keep the registries free of duplicates when the function is called
        # for several frames (train and test).
        if name not in registry:
            registry.append(name)

    # --- overtime: treat zeros as missing --------------------------------
    # There is an unnatural cluster of rows with zero overtime, so those rows
    # are treated like missing values and replaced by the median of the
    # positive overtime values of this frame.
    # ``mask`` is used instead of ``.loc`` assignment so an integer column is
    # upcast cleanly when the median is fractional.
    positive_overtime = df.loc[df["overtime"] > 0, "overtime"]
    df["overtime"] = df["overtime"].mask(
        df["overtime"] <= 0, positive_overtime.median()
    )

    # --- big-city flag ---------------------------------------------------
    in_city = df["area"].isin(["東京都", "大阪府"])
    df["isCity"] = in_city.astype(int)
    register("isCity", cat_features)

    # --- categorical crosses with position -------------------------------
    position_text = df["position"].astype(str)
    df["sex_and_position"] = df["sex"].astype(str) + "_" + position_text
    register("sex_and_position", cat_features)

    df["education_and_position"] = (
        df["education"].astype(str) + "_" + position_text
    )
    register("education_and_position", cat_features)

    # --- commute adjustment ----------------------------------------------
    # The original "not in a big city" mask was written as
    # (area != Tokyo) | (area != Osaka), which is true for every row, so
    # big-city rows with a partner were adjusted twice.  The three groups are
    # now disjoint and every adjustment is computed from the untouched
    # commute values.
    has_partner = df["partner"] == 1
    no_partner = df["partner"] == 0
    commute = df["commute"]
    halved = (commute / 2 - 1).clip(lower=0)   # big city, with partner
    reduced = (commute - 1).clip(lower=0)      # the other two adjusted groups
    df["commute"] = commute.mask(in_city & has_partner, halved).mask(
        (~in_city & has_partner) | (in_city & no_partner), reduced
    )

    # --- age vs. service length ------------------------------------------
    # This used to be age / (service_length + 1), an exact duplicate of
    # "age_by_service_length" below.  It is now the difference the name
    # describes.
    df["age_diff_service_length"] = df["age"] - df["service_length"]
    register("age_diff_service_length", num_features)

    # --- age bucket: 0 for age <= 20, 1 for 21..30, 2 for age > 30 --------
    df["age_layer"] = (df["age"] > 20).astype(int) + (df["age"] > 30).astype(int)
    register("age_layer", cat_features)

    df["agelayer_and_position"] = (
        df["age_layer"].astype(str) + "_" + df["position"].astype(str)
    )
    register("agelayer_and_position", cat_features)

    # --- ratio features ---------------------------------------------------
    # "+ 1" keeps the denominator non-zero when service_length is 0.
    service_years = df["service_length"] + 1

    df["overtime_by_service_length"] = df["overtime"] / service_years
    register("overtime_by_service_length", num_features)

    df["overtime_by_age"] = df["overtime"] / df["age"]
    register("overtime_by_age", num_features)

    df["study_time_by_service_length"] = df["study_time"] / service_years
    register("study_time_by_service_length", num_features)

    df["study_time_by_age"] = df["study_time"] / df["age"]
    register("study_time_by_age", num_features)

    df["age_by_service_length"] = df["age"] / service_years
    register("age_by_service_length", num_features)

    return df
# Add the engineered features to both frames.  create_features() works in
# place, so its return value does not need to be stored.
for df in [X, test]:
    create_features(df)

# Standardize the numeric columns with statistics taken from the training
# frame only, then apply the same transformation to both frames.
scalar = StandardScaler()
scalar.fit(X[num_features])
for df in [X, test]:
    df[num_features] = scalar.transform(df[num_features])

# Label-encode the categorical columns.
# The encoder used to be refit on each frame separately, so the same category
# could receive different codes in train and test.  It is now fitted once per
# column on the values of both frames, which gives consistent codes and does
# not fail on a value that only occurs in the test data.
le = preprocessing.LabelEncoder()  # possibly unnecessary because target encoding follows?
for column in cat_features:
    le.fit(pd.concat([X[column], test[column]]))
    for df in [X, test]:
        label_encoded_column = le.transform(df[column])
        # Pass the frame's index explicitly so the codes line up row by row
        # even when the index is not 0..n-1.
        df[column] = pd.Series(label_encoded_column, index=df.index).astype('category')
# Target encoding of the categorical columns.
target = y["salary"]
# The folds depend only on the number of rows and the seed, so a single
# splitter can be shared by every column.
kf = KFold(n_splits=4, shuffle=True, random_state=42)
for c in cat_features:
    # Mean of the target per category, computed on the whole training data.
    data_tmp = pd.DataFrame({c: X[c], "target": target})
    target_mean = data_tmp.groupby(c)["target"].mean()
    # Replace the categories of the test data.
    # (`np.float` was only an alias of the builtin and has been removed from
    # NumPy, so the builtin `float` is used.)
    test[c] = test[c].map(target_mean).astype(float)
    # Array that receives the encoded values of the training data.
    tmp = np.repeat(np.nan, X.shape[0])
    # Out-of-fold encoding: each row is encoded with means computed from the
    # other folds only, so its own target value is not used.
    for idx_1, idx_2 in kf.split(X):
        target_mean = data_tmp.iloc[idx_1].groupby(c)["target"].mean()
        tmp[idx_2] = X[c].iloc[idx_2].map(target_mean).astype(float)
    X[c] = tmp
y_values = y["salary"]
def objective(trial):
    """Optuna objective: cross-validated LightGBM regression score.

    Samples one set of hyper-parameters from ``trial``, trains a LightGBM
    regressor on each of 5 folds of the module-level ``X`` / ``y_values``
    with early stopping, and scores every fold on its validation part.

    Args:
        trial: the Optuna trial used to sample the hyper-parameters.

    Returns:
        The mean over the folds of the negative mean squared error, so that
        a study created with ``direction='maximize'`` minimizes the MSE.
    """
    params = {
        'task': 'train',
        'boosting_type': 'gbdt',
        'objective': 'regression',
        'metric': {'l2'},
        'verbosity': -1,
        "seed": 42,
        # The learning rate was previously suggested under the name
        # 'lambda_l1', which collided with the real lambda_l1 parameter below.
        "learning_rate": trial.suggest_loguniform('learning_rate', 0.005, 0.03),
        'lambda_l1': trial.suggest_loguniform('lambda_l1', 1e-8, 10.0),
        'lambda_l2': trial.suggest_loguniform('lambda_l2', 1e-8, 10.0),
        'num_leaves': trial.suggest_int('num_leaves', 2, 256),
        'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),
        'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.4, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
    }
    num_round = 10000
    FOLD_NUM = 5
    # random_state only has an effect together with shuffle=True, and recent
    # scikit-learn releases reject the combination without it.
    kf = KFold(n_splits=FOLD_NUM, shuffle=True, random_state=42)
    scores = []
    for i, (tdx, vdx) in enumerate(kf.split(X, y)):
        print(f'Fold : {i}')
        X_train, X_valid = X.iloc[tdx], X.iloc[vdx]
        # kf.split yields positions, so select by position rather than label.
        y_train, y_valid = y_values.iloc[tdx], y_values.iloc[vdx]
        lgb_train = lgb.Dataset(X_train, y_train)
        lgb_valid = lgb.Dataset(X_valid, y_valid)
        # Early stopping is passed as a callback; the early_stopping_rounds
        # keyword of lgb.train is not accepted by newer LightGBM releases.
        model = lgb.train(params, lgb_train, num_boost_round=num_round,
                          valid_names=["train", "valid"],
                          valid_sets=[lgb_train, lgb_valid],
                          callbacks=[lgb.early_stopping(stopping_rounds=10)])
        va_pred = model.predict(X_valid)
        # Negated because the study maximizes; there is room for improvement here.
        score_ = -mean_squared_error(y_valid.values, va_pred)
        print(score_)
        scores.append(score_)
    return np.mean(scores)
# Search the hyper-parameters; the study maximizes the value returned by objective().
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

# Show the result of the best trial.
print('Best trial:')
trial = study.best_trial
print(f' Value: {trial.value}')
print(' Params: ')
for key, value in trial.params.items():
    print(f' "{key}": {value},')
# Combine the fixed LightGBM settings with the parameters found by Optuna;
# keys coming from the best trial take precedence.
params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': {'l2'},
    'verbosity': -1,
    "seed": 42,
    **trial.params,
}
params
# Train one model per fold with the tuned parameters and average the fold
# models' predictions on the test data.
models = []
FOLD_NUM = 5
# random_state only has an effect together with shuffle=True, and recent
# scikit-learn releases reject the combination without it.
kf = KFold(n_splits=FOLD_NUM, shuffle=True, random_state=42)
scores = []
feature_importance_df = pd.DataFrame()
pred_cv = np.zeros(len(test.index))
num_round = 10000
for i, (tdx, vdx) in enumerate(kf.split(X, y)):
    print(f'Fold : {i}')
    X_train, X_valid = X.iloc[tdx], X.iloc[vdx]
    # kf.split yields positions, so select by position rather than label.
    y_train, y_valid = y_values.iloc[tdx], y_values.iloc[vdx]
    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_valid = lgb.Dataset(X_valid, y_valid)
    # Early stopping is passed as a callback; the early_stopping_rounds
    # keyword of lgb.train is not accepted by newer LightGBM releases.
    model = lgb.train(params, lgb_train, num_boost_round=num_round,
                      valid_names=["train", "valid"],
                      valid_sets=[lgb_train, lgb_valid],
                      callbacks=[lgb.early_stopping(stopping_rounds=10)])
    va_pred = model.predict(X_valid)
    score_ = mean_squared_error(y_valid.values, va_pred)
    print(score_)
    scores.append(score_)
    models.append(model)
    # Each fold contributes 1/FOLD_NUM of the final test prediction.
    submission = model.predict(test.drop("id", axis=1), num_iteration=model.best_iteration)
    pred_cv += submission / FOLD_NUM
print(np.mean(scores))
# Put the test ids next to the averaged predictions and save them as the
# submission file with the columns "id" and "y".
iddf = test[["id"]]
submission_df = pd.concat(
    [iddf, pd.DataFrame(pred_cv)],
    axis=1,
)
submission_df.columns = ["id", "y"]
submission_df.to_csv("submission.csv", index=False)
print("end")
!ls
!pwd
diffg_data.csv sample_data.gsheet salary_estimation_prediction_bk20191206.ipynb submission.csv salary_estimation_prediction.ipynb test_data.csv sample_data.csv train_data.csv /content/drive/My Drive/予測コンペ/ProbSpace/給与推定コンペ
# Horizontal bar chart of the feature importances of `model` (the model of
# the last trained fold); features with zero importance are left out.
# The feature names come from `X`; the previously referenced `X_values` is
# not defined anywhere and raised a NameError.
attr2 = {k: v for k, v in zip(X.columns, model.feature_importance()) if v > 0}
# Ascending order, so the most important feature ends up at the top of the chart.
attr2 = sorted(attr2.items(), key=lambda x: x[1], reverse=False)
x1, y1 = zip(*attr2)
i1 = range(len(x1))
plt.figure(num=None, figsize=(9, 7), dpi=100, facecolor='w', edgecolor='k')
plt.barh(i1, y1)
plt.title("LGBM")
plt.yticks(i1, x1)
plt.show()
from matplotlib import pyplot as plt
# Function that draws an observed-vs-predicted ("yy") plot.
def yyplot(y_obs, y_pred):
    """Draw a scatter plot of predicted against observed values.

    Both axes share the same limits (the joint value range widened by 1 % on
    each side) and a diagonal line marks perfect predictions.

    Args:
        y_obs: observed values; any array-like (ndarray, Series, list).
        y_pred: predicted values; any array-like of the same length.

    Returns:
        The matplotlib figure that was created (it is also shown).
    """
    # np.asarray(...).ravel() accepts Series and lists as well, whereas
    # .flatten() only exists on ndarrays.
    y_obs = np.asarray(y_obs).ravel()
    y_pred = np.asarray(y_pred).ravel()
    yvalues = np.concatenate([y_obs, y_pred])
    ymin, ymax, yrange = np.amin(yvalues), np.amax(yvalues), np.ptp(yvalues)
    lower = ymin - yrange * 0.01
    upper = ymax + yrange * 0.01
    fig = plt.figure(figsize=(8, 8))
    plt.scatter(y_obs, y_pred)
    plt.plot([lower, upper], [lower, upper])  # diagonal: prediction == observation
    plt.xlim(lower, upper)
    plt.ylim(lower, upper)
    plt.xlabel('y_observed', fontsize=24)
    plt.ylabel('y_predicted', fontsize=24)
    plt.title('Observed-Predicted Plot', fontsize=24)
    plt.tick_params(labelsize=16)
    plt.show()
    return fig


# Plot the validation predictions of the last trained fold.
fig = yyplot(y_valid.values, va_pred)
# Salary against age, drawn separately for each of the position codes 0-3.
train_try = train.copy()
feature = "position"
df1, df2, df3, df4 = (train_try[train_try[feature] == code] for code in range(4))
for code, subset in enumerate((df1, df2, df3, df4)):
    # A non-empty label per group is what the legend needs; with empty labels
    # matplotlib reports that there are no handles to put in the legend.
    plt.scatter(subset["age"], subset["salary"], label=f"{feature} = {code}", alpha=.1)
fs = 22
plt.legend(fontsize=15, loc='lower right')
# The x axis shows age; the label and title previously said "commute".
plt.xlabel('age', fontsize=fs)
plt.ylabel('salary', fontsize=fs)
plt.title("4 types of 'age - salary'", fontsize=fs)
plt.tick_params(labelsize=fs)
No handles with labels found to put in legend.
# Scatter plots of salary against age, num_child, service_length and
# study_time, one plot per column and in that order.
for column_name in ('age', 'num_child', 'service_length', 'study_time'):
    train.plot.scatter(x=column_name, y='salary', alpha=0.1)
# Reference: https://prob.space/competitions/salary-prediction/discussions/hiroki-Post95df6ed0b946a79c7d8b
# Apply the commute adjustment to a copy of the training data and plot the
# adjusted commute against salary.
train_test = train.copy()
in_city = train_test["area"].isin(["東京都", "大阪府"])
has_partner = train_test["partner"] == 1
no_partner = train_test["partner"] == 0
commute = train_test["commute"]
halved = (commute / 2 - 1).clip(lower=0)   # big city, with partner
reduced = (commute - 1).clip(lower=0)      # the other two adjusted groups
# The "not in a big city" mask used to be (area != Tokyo) | (area != Osaka),
# which is true for every row, so big-city rows with a partner were adjusted
# twice.  The groups are now disjoint and use the untouched commute values.
train_test["commute"] = commute.mask(in_city & has_partner, halved).mask(
    (~in_city & has_partner) | (in_city & no_partner), reduced
)
train_test.plot.scatter(x='commute', y='salary', alpha=0.1)
<matplotlib.axes._subplots.AxesSubplot at 0x7f13aa03ee48>
# Scatter plot of salary against overtime.
train.plot(kind='scatter', x='overtime', y='salary', alpha=0.1)
<matplotlib.axes._subplots.AxesSubplot at 0x7f13aa25ab38>
hirayuki
あとから気づいたのですが le.fit(df[column]) のタイミングが非常によろしくないです。
le.fit(X[column]) としてfor文の手前に持ってくるべきでした。すみません。