hirayuki

LightGBMとOptunaと思いつきの特徴量生成

参考資料

Kaggleで勝つデータ分析の技術
- https://www.amazon.co.jp/Kaggle%E3%81%A7%E5%8B%9D%E3%81%A4%E3%83%87%E3%83%BC%E3%82%BF%E5%88%86%E6%9E%90%E3%81%AE%E6%8A%80%E8%A1%93-%E9%96%80%E8%84%87-%E5%A4%A7%E8%BC%94/dp/4297108437
commute - salary 分布の重なり
- https://prob.space/competitions/salary-prediction/discussions/hiroki-Post95df6ed0b946a79c7d8b
Parameters — LightGBM (2.3.2) documentation
- https://lightgbm.readthedocs.io/en/latest/Parameters.html
LightGBM 徹底入門 – LightGBMの使い方や仕組み、XGBoostとの違いについて
- https://www.codexa.net/lightgbm-beginner/

[動作環境] Google colaboratory

※lightgbmのコードを書いたのは人生で2度目です。ところどころ初心者の書き方をしているかもしれないので、お手本というよりはご参考までに見ていただけると幸いです。

# Mount Google Drive at /content/drive/ inside the Colab runtime.
from google.colab import drive
drive.mount('/content/drive/')

# Move into the working directory on Drive; adjust this path to your own environment.
%cd /content/drive/My\ Drive/予測コンペ/ProbSpace/給与推定コンペ

# Install Optuna into the notebook runtime.
!pip install optuna

import featuretools as ft
import lightgbm as lgb
import optuna
import numpy as np
import sklearn.datasets
import sklearn.metrics
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.metrics import log_loss
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

データの読み込み

# Read the training data, then separate it into the target frame
# (id + salary) and the feature frame (everything except id and salary).
train = pd.read_csv("train_data.csv")
y = train.loc[:, ["id", "salary"]]
X = train.drop(columns=["salary", "id"])

print(f"train shape is {train.shape}")
print(f"target shape is {y.shape}")

# Read the test data; its id column is kept for the submission file.
test = pd.read_csv("test_data.csv")
print(f"test shape is {test.shape}")

前処理

# Column groups consumed by the scaling / encoding steps. create_features()
# appends the name of every column it generates to one of these lists.
cat_features = ["position", "sex", "partner", "education", "area"]
num_features = ["age", "num_child", "service_length", "study_time", "commute", "overtime"]


# All of these features are ideas picked up from books and blog posts.
def create_features(df):
    """Add the hand-crafted features to ``df`` in place and return it.

    The frame is modified in place (callers rely on this) and also returned.
    Names of generated columns are appended, once, to the module-level
    ``cat_features`` / ``num_features`` lists.

    Args:
        df: DataFrame with at least the columns position, sex, partner,
            education, area, age, service_length, study_time, commute
            and overtime.

    Returns:
        The same DataFrame object, with ``overtime`` and ``commute``
        rewritten and the new feature columns added.
    """
    city_areas = ["東京都", "大阪府"]

    def register(name, feature_list):
        # The function runs once per frame (train, then test); only record
        # each generated column name the first time.
        if name not in feature_list:
            feature_list.append(name)

    def add_combination(df, name, left, right):
        # Categorical interaction: "<left>_<right>" as a string label.
        df[name] = df[left].astype(str) + "_" + df[right].astype(str)
        register(name, cat_features)
        return df

    def add_ratio(df, name, numerator, denominator, offset=0):
        # Numeric ratio; ``offset`` keeps a zero denominator from dividing by zero.
        df[name] = df[numerator] / (df[denominator] + offset)
        register(name, num_features)
        return df

    def overtime_zero2median(df):
        # There is an unnatural cluster of rows with zero overtime, so treat
        # them like missing values and fill in the median of the positive
        # overtime values of this frame.
        positive_overtime = df.loc[df["overtime"] > 0, "overtime"]
        df.loc[df["overtime"] <= 0, "overtime"] = positive_overtime.median()
        return df

    def live_in_city(df):
        df["isCity"] = df["area"].isin(city_areas).astype(int)
        register("isCity", cat_features)
        return df

    def age_layer(df):
        # 0: age <= 20, 1: 20 < age <= 30, 2: age > 30.
        df["age_layer"] = 0
        df.loc[(df["age"] > 20) & (df["age"] <= 30), "age_layer"] = 1
        df.loc[df["age"] > 30, "age_layer"] = 2
        register("age_layer", cat_features)
        return df

    def age_diff_service_length(df):
        # Difference between age and service length. The previous version
        # divided instead, which made this column an exact duplicate of
        # age_by_service_length.
        df["age_diff_service_length"] = df["age"] - df["service_length"]
        register("age_diff_service_length", num_features)
        return df

    def adjust_commute(df):
        # Shift commute per (city, partner) group, never below zero:
        #   city     & partner == 1 -> commute / 2 - 1
        #   non-city & partner == 1 -> commute - 1
        #   city     & partner == 0 -> commute - 1
        #   non-city & partner == 0 -> unchanged
        # The non-city mask used to be `(area != A) | (area != B)`, which is
        # true for every row, so city rows with a partner were shifted twice.
        in_city = df["area"].isin(city_areas)
        with_partner = df["partner"] == 1
        without_partner = df["partner"] == 0
        # Work from a snapshot so every group is computed from the raw values.
        commute = df["commute"].copy()

        mask = in_city & with_partner
        df.loc[mask, "commute"] = (commute[mask] / 2 - 1).clip(lower=0)
        mask = ~in_city & with_partner
        df.loc[mask, "commute"] = (commute[mask] - 1).clip(lower=0)
        mask = in_city & without_partner
        df.loc[mask, "commute"] = (commute[mask] - 1).clip(lower=0)
        return df

    # The order matters: the overtime ratios use the imputed overtime, and
    # agelayer_and_position needs age_layer to exist.
    df = overtime_zero2median(df)
    df = live_in_city(df)
    df = add_combination(df, "sex_and_position", "sex", "position")
    df = add_combination(df, "education_and_position", "education", "position")
    df = adjust_commute(df)
    df = age_diff_service_length(df)
    df = age_layer(df)
    df = add_combination(df, "agelayer_and_position", "age_layer", "position")
    df = add_ratio(df, "overtime_by_service_length", "overtime", "service_length", offset=1)
    df = add_ratio(df, "overtime_by_age", "overtime", "age")
    df = add_ratio(df, "study_time_by_service_length", "study_time", "service_length", offset=1)
    df = add_ratio(df, "study_time_by_age", "study_time", "age")
    df = add_ratio(df, "age_by_service_length", "age", "service_length", offset=1)
    return df

# Generate the hand-crafted features; create_features() mutates each frame in place.
for df in [X, test]:
    df = create_features(df)

# Standardise the numeric columns using statistics from the training data only.
scalar = StandardScaler()
scalar.fit(X[num_features])
for df in [X, test]:
    df[num_features] = scalar.transform(df[num_features])

# Label-encode the categorical columns. Each encoder is fitted once on the
# union of train and test values so the same category receives the same code
# in both frames; fitting per frame could assign different codes whenever the
# two frames do not contain exactly the same set of values.
# (Possibly unnecessary, since target encoding is applied afterwards.)
for column in cat_features:
    le = preprocessing.LabelEncoder()
    le.fit(pd.concat([X[column], test[column]], ignore_index=True))
    for df in [X, test]:
        label_encoded_column = le.transform(df[column])
        # Reuse the frame's own index so the assignment cannot misalign rows.
        df[column] = pd.Series(label_encoded_column, index=df.index).astype('category')

target = y["salary"]

# Target-encode every categorical column.
for c in cat_features:
    # Mean of the target for each category, computed over the whole training set.
    data_tmp = pd.DataFrame({c: X[c], "target": target})
    target_mean = data_tmp.groupby(c)["target"].mean()
    # Replace the test categories with those means.
    # (`np.float` no longer exists in current NumPy; the builtin float is equivalent.)
    test[c] = test[c].map(target_mean).astype(float)

    # Buffer for the encoded training values.
    tmp = np.full(X.shape[0], np.nan)

    # Out-of-fold encoding for the training rows: each row is encoded with
    # means computed on the other folds, so its own target never leaks in.
    kf = KFold(n_splits=4, shuffle=True, random_state=42)
    for idx_1, idx_2 in kf.split(X):
        target_mean = data_tmp.iloc[idx_1].groupby(c)["target"].mean()
        tmp[idx_2] = X[c].iloc[idx_2].map(target_mean)
    X[c] = tmp

Optuna + 交差検証 + target encoding

y_values = y["salary"]


def objective(trial):
    """Optuna objective: negative mean squared error of LightGBM under 5-fold CV.

    Args:
        trial: the Optuna trial used to sample the hyperparameters.

    Returns:
        Mean over the folds of the negated validation MSE, so the study has
        to be created with ``direction='maximize'``.
    """
    params = {
        'task': 'train',
        'boosting_type': 'gbdt',
        'objective': 'regression',
        'metric': {'l2'},
        'verbosity': -1,
        "seed": 42,
        # Each parameter needs its own name: this one used to be registered
        # as 'lambda_l1', which tied it to the L1 regulariser and kept
        # learning_rate out of trial.params.
        "learning_rate": trial.suggest_loguniform('learning_rate', 0.005, 0.03),
        'lambda_l1': trial.suggest_loguniform('lambda_l1', 1e-8, 10.0),
        'lambda_l2': trial.suggest_loguniform('lambda_l2', 1e-8, 10.0),
        'num_leaves': trial.suggest_int('num_leaves', 2, 256),
        'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),
        'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.4, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
    }

    num_round = 10000
    FOLD_NUM = 5

    # random_state only has an effect together with shuffle=True; recent
    # scikit-learn rejects random_state on an unshuffled KFold.
    kf = KFold(n_splits=FOLD_NUM, shuffle=True, random_state=42)
    scores = []

    for i, (tdx, vdx) in enumerate(kf.split(X, y)):
        print(f'Fold : {i}')
        X_train, X_valid = X.iloc[tdx], X.iloc[vdx]
        # kf.split yields positions, so index the target by position as well.
        y_train, y_valid = y_values.iloc[tdx], y_values.iloc[vdx]
        lgb_train = lgb.Dataset(X_train, y_train)
        lgb_valid = lgb.Dataset(X_valid, y_valid)
        # Early stopping is passed as a callback; the early_stopping_rounds
        # keyword is not accepted by newer LightGBM releases.
        model = lgb.train(params, lgb_train, num_boost_round=num_round,
                          valid_names=["train", "valid"], valid_sets=[lgb_train, lgb_valid],
                          callbacks=[lgb.early_stopping(10)])
        va_pred = model.predict(X_valid)
        score_ = -mean_squared_error(y_valid.values, va_pred)  # room for improvement
        print(score_)
        scores.append(score_)

    return np.mean(scores)

# Run the hyperparameter search: 100 trials, maximising the objective.
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

# Report the best trial and the parameters it used.
print('Best trial:')
trial = study.best_trial

print(f'  Value: {trial.value}')

print('  Params: ')
for key, value in trial.params.items():
    print(f'    "{key}": {value},')

# Combine the fixed LightGBM settings with the parameters of the best
# Optuna trial (the trial's values win on any shared key).
fixed_params = dict(
    task='train',
    boosting_type='gbdt',
    objective='regression',
    metric={'l2'},
    verbosity=-1,
    seed=42,
)
params = {**fixed_params, **trial.params}
params

# Train one model per fold with the tuned parameters, score each on its
# validation fold, and average the per-fold test predictions.
models = []
FOLD_NUM = 5
# random_state only has an effect together with shuffle=True; recent
# scikit-learn rejects random_state on an unshuffled KFold.
kf = KFold(n_splits=FOLD_NUM, shuffle=True, random_state=42)
scores = []
feature_importance_df = pd.DataFrame()

pred_cv = np.zeros(len(test.index))
num_round = 10000

# The id column is not a feature; drop it once instead of once per fold.
test_features = test.drop("id", axis=1)

for i, (tdx, vdx) in enumerate(kf.split(X, y)):
    print(f'Fold : {i}')
    X_train, X_valid = X.iloc[tdx], X.iloc[vdx]
    # kf.split yields positions, so index the target by position as well.
    y_train, y_valid = y_values.iloc[tdx], y_values.iloc[vdx]
    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_valid = lgb.Dataset(X_valid, y_valid)
    # Early stopping is passed as a callback; the early_stopping_rounds
    # keyword is not accepted by newer LightGBM releases.
    model = lgb.train(params, lgb_train, num_boost_round=num_round,
                      valid_names=["train", "valid"], valid_sets=[lgb_train, lgb_valid],
                      callbacks=[lgb.early_stopping(10)])
    va_pred = model.predict(X_valid, num_iteration=model.best_iteration)
    score_ = mean_squared_error(y_valid.values, va_pred)
    print(score_)
    scores.append(score_)
    models.append(model)

    submission = model.predict(test_features, num_iteration=model.best_iteration)
    pred_cv += submission / FOLD_NUM

print(np.mean(scores))
# Build the submission from raw values so the result does not depend on the
# index of `test` lining up with a freshly created frame.
submission_df = pd.DataFrame({"id": test["id"].values, "y": pred_cv})
submission_df.to_csv("submission.csv", index=False)
print("end")

終了

# List the files in the working directory and print its path.
!ls
!pwd

diffg_data.csv				       sample_data.gsheet
salary_estimation_prediction_bk20191206.ipynb  submission.csv
salary_estimation_prediction.ipynb	       test_data.csv
sample_data.csv				       train_data.csv
/content/drive/My Drive/予測コンペ/ProbSpace/給与推定コンペ

おまけ

# Horizontal bar chart of the non-zero feature importances of `model`
# (the model left over from the last fold of the training loop).
# `X_values` was never defined; the model was trained on the columns of `X`.
attr2 = {k: v for k, v in zip(X.columns, model.feature_importance()) if v > 0}
# Sort ascending by importance.
attr2 = sorted(attr2.items(), key=lambda x: x[1])
x1, y1 = zip(*attr2)
i1 = range(len(x1))
plt.figure(num=None, figsize=(9, 7), dpi=100, facecolor='w', edgecolor='k')
plt.barh(i1, y1)
plt.title("LGBM")
plt.yticks(i1, x1)
plt.show()

from matplotlib import pyplot as plt

# Helper that builds an observed-vs-predicted ("y-y") plot.
def yyplot(y_obs, y_pred):
    """Scatter observed against predicted values with a diagonal reference line.

    Args:
        y_obs: observed values; must provide ``flatten()`` (e.g. a NumPy array).
        y_pred: predicted values; must provide ``flatten()``.

    Returns:
        The matplotlib figure that was drawn (it is also shown).
    """
    combined = np.concatenate([y_obs.flatten(), y_pred.flatten()])
    lowest = np.amin(combined)
    highest = np.amax(combined)
    # Pad both axes by 1% of the value range on each side.
    margin = np.ptp(combined) * 0.01
    lower = lowest - margin
    upper = highest + margin

    fig = plt.figure(figsize=(8, 8))
    plt.scatter(y_obs, y_pred)
    # Diagonal on which observed equals predicted.
    plt.plot([lower, upper], [lower, upper])
    plt.xlim(lower, upper)
    plt.ylim(lower, upper)
    plt.xlabel('y_observed', fontsize=24)
    plt.ylabel('y_predicted', fontsize=24)
    plt.title('Observed-Predicted Plot', fontsize=24)
    plt.tick_params(labelsize=16)
    plt.show()

    return fig

# Plot the observed vs. predicted values of the last validation fold.
fig = yyplot(y_valid.values, va_pred)

EDA

# Age vs. salary scatter, one series per position value (0-3).
train_try = train.copy()

feature = "position"

for value in range(4):
    subset = train_try[train_try[feature] == value]
    # A non-empty label is required for the series to appear in the legend;
    # with label='' matplotlib warned that there were no handles to show.
    plt.scatter(subset["age"], subset["salary"], label=f"{feature}={value}", alpha=.1)

fs = 22
plt.legend(fontsize=15, loc='lower right')
# The x axis is age; the labels previously said "commute".
plt.xlabel('age', fontsize=fs)
plt.ylabel('salary', fontsize=fs)
plt.title("4 types of 'age - salary'", fontsize=fs)
plt.tick_params(labelsize=fs)

No handles with labels found to put in legend.

# Scatter plot of salary against age.
train.plot.scatter(x='age', y='salary', alpha=0.1)

<matplotlib.axes._subplots.AxesSubplot at 0x7f13aa415320>

# Scatter plot of salary against the number of children.
train.plot.scatter(x='num_child', y='salary', alpha=0.1)

<matplotlib.axes._subplots.AxesSubplot at 0x7f13aa20f940>

# Scatter plot of salary against service length.
train.plot.scatter(x='service_length', y='salary', alpha=0.1)

<matplotlib.axes._subplots.AxesSubplot at 0x7f13a81c8d30>

# Scatter plot of salary against study time.
train.plot.scatter(x='study_time', y='salary', alpha=0.1)

<matplotlib.axes._subplots.AxesSubplot at 0x7f13aa3a3c18>

# Based on https://prob.space/competitions/salary-prediction/discussions/hiroki-Post95df6ed0b946a79c7d8b
# Shift commute per (city, partner) group, never below zero, then plot it
# against salary:
#   city     & partner == 1 -> commute / 2 - 1
#   non-city & partner == 1 -> commute - 1
#   city     & partner == 0 -> commute - 1
#   non-city & partner == 0 -> unchanged
train_test = train.copy()

in_city = train_test["area"].isin(["東京都", "大阪府"])
with_partner = train_test["partner"] == 1
without_partner = train_test["partner"] == 0
# Snapshot of the raw values so every group is computed from the original data.
commute = train_test["commute"].copy()

mask = in_city & with_partner
train_test.loc[mask, "commute"] = (commute[mask] / 2 - 1).clip(lower=0)
# The non-city mask used to be `(area != A) | (area != B)`, which is true for
# every row and therefore shifted the city rows a second time.
mask = ~in_city & with_partner
train_test.loc[mask, "commute"] = (commute[mask] - 1).clip(lower=0)
mask = in_city & without_partner
train_test.loc[mask, "commute"] = (commute[mask] - 1).clip(lower=0)
train_test.plot.scatter(x='commute', y='salary', alpha=0.1)

<matplotlib.axes._subplots.AxesSubplot at 0x7f13aa03ee48>

# Scatter plot of salary against overtime.
train.plot.scatter(x='overtime', y='salary', alpha=0.1)

<matplotlib.axes._subplots.AxesSubplot at 0x7f13aa25ab38>

添付データ

salary_estimation_prediction.ipynb

LightGBMとOptunaと思いつきの特徴量生成

参考資料

データの読み込み

前処理

Optuna + 交差検証＋targetencoding

終了

おまけ

EDA

添付データ

hirayuki

new user