LightGBMとOptunaと思いつきの特徴量生成

[動作環境] Google colaboratory

※LightGBMのコードを書いたのは人生で2度目です。ところどころ初心者らしい書き方をしているかもしれないので、お手本というよりは参考程度に見ていただけると幸いです。

from google.colab import drive
# Mount Google Drive so the files stored there are reachable under /content/drive/.
drive.mount('/content/drive/')
# Adjust this path to match your own environment.
%cd /content/drive/My\ Drive/予測コンペ/ProbSpace/給与推定コンペ
# Install Optuna into the notebook runtime.
!pip install optuna
import featuretools as ft
import lightgbm as lgb
import optuna
import numpy as np
import sklearn.datasets
import sklearn.metrics
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.metrics import log_loss
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

データの読み込み

# Read the training data, keep id + salary as the target frame and use every
# remaining column as the feature frame.
train = pd.read_csv("train_data.csv")
y = train[["id", "salary"]]
X = train.drop(columns=["salary", "id"])

print(f"train shape is {train.shape}")
print(f"target shape is {y.shape}")

# Read the test data as-is (its "id" column is kept for the submission file).
test = pd.read_csv("test_data.csv")
print(f"test shape is {test.shape}")

前処理

# Column groups used by the later preprocessing steps. create_features()
# appends the name of every column it derives to the matching list.
cat_features = ["position", "sex", "partner", "education", "area"]
num_features = ["age", "num_child", "service_length", "study_time", "commute", "overtime"]


# All of these features are ad-hoc ideas taken from books and blog posts.
def create_features(df):
    """Add the hand-crafted features to ``df`` in place and return it.

    Besides adding columns, this overwrites ``overtime`` and ``commute`` and
    registers every derived column in the module-level ``cat_features`` /
    ``num_features`` lists (each name is added only once, so the function can
    be called for several frames).

    Args:
        df: DataFrame holding the raw columns ``overtime``, ``area``, ``sex``,
            ``position``, ``education``, ``partner``, ``commute``, ``age``,
            ``service_length`` and ``study_time``.

    Returns:
        The same DataFrame object that was passed in.
    """
    city_areas = ["東京都", "大阪府"]  # Tokyo and Osaka

    def register(name, feature_list):
        # Record a derived column once, even when create_features runs again.
        if name not in feature_list:
            feature_list.append(name)

    def add_categorical(name, values):
        df[name] = values
        register(name, cat_features)

    def add_ratio(name, numerator, denominator):
        df[name] = numerator / denominator
        register(name, num_features)

    def joined(left, right):
        # String key combining two columns, e.g. "1_3".
        return df[left].astype(str) + "_" + df[right].astype(str)

    # There is an unnatural cluster of rows with zero overtime, so treat those
    # values as missing and replace them with the median of the positive
    # overtime values of this frame.
    positive_overtime = df.loc[df["overtime"] > 0, "overtime"]
    df["overtime"] = df["overtime"].mask(df["overtime"] <= 0, positive_overtime.median())

    is_city = df["area"].isin(city_areas)
    add_categorical("isCity", is_city.astype(int))
    add_categorical("sex_and_position", joined("sex", "position"))
    add_categorical("education_and_position", joined("education", "position"))

    # Commute adjustment, floored at 0:
    #   city & partner == 1      -> commute / 2 - 1
    #   non-city & partner == 1  -> commute - 1
    #   city & partner == 0      -> commute - 1
    # The non-city mask must be the negation of the city mask. Spelling it as
    # (area != A) | (area != B) is true for every row and would adjust the
    # city rows with a partner a second time.
    has_partner = df["partner"] == 1
    no_partner = df["partner"] == 0
    commute = df["commute"]
    halved = (commute / 2 - 1).clip(lower=0)
    reduced = (commute - 1).clip(lower=0)
    adjusted = commute.mask(is_city & has_partner, halved)
    adjusted = adjusted.mask((~is_city & has_partner) | (is_city & no_partner), reduced)
    df["commute"] = adjusted

    # +1 keeps the denominator non-zero when service_length is 0.
    tenure = df["service_length"] + 1
    # Despite the name this is a ratio, and it equals age_by_service_length
    # below; both columns are kept so the feature set stays unchanged.
    add_ratio("age_diff_service_length", df["age"], tenure)

    # Age bucket: 0 for age <= 20, 1 for 20 < age <= 30, 2 for age > 30.
    age = df["age"]
    add_categorical("age_layer", (age > 20).astype(int) + (age > 30).astype(int))
    add_categorical("agelayer_and_position", joined("age_layer", "position"))

    add_ratio("overtime_by_service_length", df["overtime"], tenure)
    add_ratio("overtime_by_age", df["overtime"], df["age"])
    add_ratio("study_time_by_service_length", df["study_time"], tenure)
    add_ratio("study_time_by_age", df["study_time"], df["age"])
    add_ratio("age_by_service_length", df["age"], tenure)
    return df

# Derive the engineered columns on both frames (create_features works in place).
for df in (X, test):
    df = create_features(df)

# Standardise the numeric columns with statistics fitted on the training frame
# only, then apply the same transform to train and test.
scalar = StandardScaler()
scalar.fit(X[num_features])
for df in (X, test):
    df[num_features] = scalar.transform(df[num_features])

# Integer-encode every categorical column with ONE encoder per column shared by
# train and test. Fitting the encoder separately on each frame gives the same
# code different meanings whenever the two frames do not contain exactly the
# same set of categories. The encoder is fitted on the union of both frames so
# that a category appearing only in the test data does not make transform()
# raise.
le = preprocessing.LabelEncoder()
for column in cat_features:
    le.fit(pd.concat([X[column], test[column]], ignore_index=True))
    for df in (X, test):
        label_encoded_column = le.transform(df[column])
        # Keep the frame's own index so the assignment aligns row by row.
        df[column] = pd.Series(label_encoded_column, index=df.index).astype('category')

target = y["salary"]

# The fold assignment is the same for every column (fixed seed), so build the
# splitter once.
kf = KFold(n_splits=4, shuffle=True, random_state=42)

for c in cat_features:
    # Mean of the target for each category, computed over the whole training set.
    data_tmp = pd.DataFrame({c: X[c], "target": target})
    target_mean = data_tmp.groupby(c)["target"].mean()
    # Replace the categories of the test data with those means.
    # (The builtin float is used because the np.float alias no longer exists
    # in current NumPy releases.)
    test[c] = test[c].map(target_mean).astype(float)

    # Array that receives the encoded values of the training data.
    tmp = np.full(X.shape[0], np.nan)

    # Out-of-fold encoding: each training row is encoded with means computed
    # from the other folds only, so its own target value is not used.
    for idx_1, idx_2 in kf.split(X):
        target_mean = data_tmp.iloc[idx_1].groupby(c)["target"].mean()
        tmp[idx_2] = X[c].iloc[idx_2].map(target_mean).astype(float).to_numpy()
    X[c] = tmp

Optuna + 交差検証 + target encoding

y_values = y["salary"]


def objective(trial):
    """Optuna objective: 5-fold cross-validated LightGBM regression on ``X``.

    Args:
        trial: Optuna trial used to sample the LightGBM hyper-parameters.

    Returns:
        The mean over the folds of the negated validation MSE. The value is
        negated so that it can be used with a study that maximises.
    """
    params = {
        'task': 'train',
        'boosting_type': 'gbdt',
        'objective': 'regression',
        'metric': {'l2'},
        'verbosity': -1,
        'seed': 42,
        # Must be registered under its own name: reusing 'lambda_l1' here makes
        # both entries share one sampled value and leaves learning_rate out of
        # trial.params.
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.005, 0.03),
        'lambda_l1': trial.suggest_loguniform('lambda_l1', 1e-8, 10.0),
        'lambda_l2': trial.suggest_loguniform('lambda_l2', 1e-8, 10.0),
        'num_leaves': trial.suggest_int('num_leaves', 2, 256),
        'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),
        'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.4, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
    }

    num_round = 10000
    FOLD_NUM = 5

    # random_state only has an effect together with shuffle=True; current
    # scikit-learn rejects a seed on an unshuffled KFold.
    kf = KFold(n_splits=FOLD_NUM, shuffle=True, random_state=42)
    scores = []

    for i, (tdx, vdx) in enumerate(kf.split(X, y)):
        print(f'Fold : {i}')
        # kf.split yields positions, so select rows positionally.
        X_train, X_valid = X.iloc[tdx], X.iloc[vdx]
        y_train, y_valid = y_values.iloc[tdx], y_values.iloc[vdx]
        lgb_train = lgb.Dataset(X_train, y_train)
        lgb_valid = lgb.Dataset(X_valid, y_valid)
        # Early stopping is passed as a callback; the early_stopping_rounds
        # keyword of lgb.train is not available in LightGBM 4.
        model = lgb.train(params, lgb_train, num_boost_round=num_round,
                          valid_names=["train", "valid"], valid_sets=[lgb_train, lgb_valid],
                          callbacks=[lgb.early_stopping(stopping_rounds=10)])
        va_pred = model.predict(X_valid)
        score_ = -mean_squared_error(y_valid.values, va_pred)  # room for improvement
        print(score_)
        scores.append(score_)

    return np.mean(scores)

# Run the hyper-parameter search; objective() returns a negated MSE, hence "maximize".
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

# Report the best trial.
print('Best trial:')
trial = study.best_trial

print(f'  Value: {trial.value}')

print('  Params: ')
for name, best_value in trial.params.items():
    print(f'    "{name}": {best_value},')
# Start from the fixed LightGBM settings and overlay the parameters Optuna found.
params = dict(
    task='train',
    boosting_type='gbdt',
    objective='regression',
    metric={'l2'},
    verbosity=-1,
    seed=42,
)
params.update(trial.params)
params
# Retrain with the tuned parameters using 5-fold CV and average the per-fold
# test predictions into the submission.
models = []
FOLD_NUM = 5
# random_state only has an effect together with shuffle=True; current
# scikit-learn rejects a seed on an unshuffled KFold.
kf = KFold(n_splits=FOLD_NUM, shuffle=True, random_state=42)
scores = []
feature_importance_df = pd.DataFrame()

pred_cv = np.zeros(len(test.index))
num_round = 10000

# The model is trained on X, which has no "id" column, so drop it from test once.
test_features = test.drop("id", axis=1)

for i, (tdx, vdx) in enumerate(kf.split(X, y)):
    print(f'Fold : {i}')
    # kf.split yields positions, so select rows positionally.
    X_train, X_valid = X.iloc[tdx], X.iloc[vdx]
    y_train, y_valid = y_values.iloc[tdx], y_values.iloc[vdx]
    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_valid = lgb.Dataset(X_valid, y_valid)
    # Early stopping is passed as a callback; the early_stopping_rounds
    # keyword of lgb.train is not available in LightGBM 4.
    model = lgb.train(params, lgb_train, num_boost_round=num_round,
                      valid_names=["train", "valid"], valid_sets=[lgb_train, lgb_valid],
                      callbacks=[lgb.early_stopping(stopping_rounds=10)])
    va_pred = model.predict(X_valid)
    score_ = mean_squared_error(y_valid.values, va_pred)
    print(score_)
    scores.append(score_)
    models.append(model)

    # Each fold contributes 1/FOLD_NUM of the final test prediction.
    submission = model.predict(test_features, num_iteration=model.best_iteration)
    pred_cv += submission / FOLD_NUM

print(np.mean(scores))
iddf = test[["id"]]
# assign() attaches the predictions by position, independent of test's index.
submission_df = iddf.assign(y=pred_cv)
submission_df.columns = ["id", "y"]
submission_df.to_csv("submission.csv", index=False)
print("end")

終了

!ls
!pwd
diffg_data.csv				       sample_data.gsheet
salary_estimation_prediction_bk20191206.ipynb  submission.csv
salary_estimation_prediction.ipynb	       test_data.csv
sample_data.csv				       train_data.csv
/content/drive/My Drive/予測コンペ/ProbSpace/給与推定コンペ

おまけ

# Horizontal bar chart of the non-zero feature importances of the most recently
# trained `model`. The feature names come from X, the frame the model was
# trained on (no `X_values` variable is defined anywhere in this notebook).
attr2 = {k: v for k, v in zip(X.columns, model.feature_importance()) if v > 0}
# Ascending order, so the most important feature ends up at the top of the chart.
attr2 = sorted(attr2.items(), key=lambda x: x[1], reverse=False)
x1, y1 = zip(*attr2)
i1 = range(len(x1))
plt.figure(num=None, figsize=(9, 7), dpi=100, facecolor='w', edgecolor='k')
plt.barh(i1, y1)
plt.title("LGBM")
plt.yticks(i1, x1)
plt.show()
from matplotlib import pyplot as plt

# Helper that draws an observed-vs-predicted ("yy") plot.
def yyplot(y_obs, y_pred):
    """Scatter observed against predicted values with a y = x reference line.

    Args:
        y_obs: Observed target values; any array-like (ndarray, Series, list).
        y_pred: Predicted values; any array-like with as many elements as
            ``y_obs``.

    Returns:
        The matplotlib figure that was created (it is also shown).
    """
    # np.asarray(...).ravel() accepts any array-like, not only ndarrays that
    # offer .flatten().
    y_obs = np.asarray(y_obs).ravel()
    y_pred = np.asarray(y_pred).ravel()

    yvalues = np.concatenate([y_obs, y_pred])
    ymin, ymax, yrange = np.amin(yvalues), np.amax(yvalues), np.ptp(yvalues)
    # Pad both axes by 1% of the value range so extreme points are not on the border.
    lower = ymin - yrange * 0.01
    upper = ymax + yrange * 0.01

    fig = plt.figure(figsize=(8, 8))
    plt.scatter(y_obs, y_pred)
    plt.plot([lower, upper], [lower, upper])
    plt.xlim(lower, upper)
    plt.ylim(lower, upper)
    plt.xlabel('y_observed', fontsize=24)
    plt.ylabel('y_predicted', fontsize=24)
    plt.title('Observed-Predicted Plot', fontsize=24)
    plt.tick_params(labelsize=16)
    plt.show()

    return fig


# Plot the validation predictions left over from the last trained fold.
fig = yyplot(y_valid.values, va_pred)

EDA

# age vs. salary, one colour per position value
train_try = train.copy()

feature = "position"

# One scatter layer per position value 0..3. Each layer gets a label, because
# plt.legend() has nothing to show when every label is empty.
for value in range(4):
    subset = train_try[train_try[feature] == value]
    plt.scatter(subset["age"], subset["salary"], label=f"{feature}={value}", alpha=.1)

fs = 22
plt.legend(fontsize=15, loc='lower right')
# The x axis shows age (the axis label and title previously said "commute").
plt.xlabel('age', fontsize=fs)
plt.ylabel('salary', fontsize=fs)
plt.title("4 types of 'age - salary'", fontsize=fs)
plt.tick_params(labelsize=fs)
No handles with labels found to put in legend.
# Scatter plot of salary against age; alpha=0.1 keeps overlapping points readable.
train.plot.scatter(x='age', y='salary', alpha=0.1)
<matplotlib.axes._subplots.AxesSubplot at 0x7f13aa415320>
# Scatter plot of salary against num_child; alpha=0.1 keeps overlapping points readable.
train.plot.scatter(x='num_child', y='salary', alpha=0.1)
<matplotlib.axes._subplots.AxesSubplot at 0x7f13aa20f940>
# Scatter plot of salary against service_length; alpha=0.1 keeps overlapping points readable.
train.plot.scatter(x='service_length', y='salary', alpha=0.1)
<matplotlib.axes._subplots.AxesSubplot at 0x7f13a81c8d30>
# Scatter plot of salary against study_time; alpha=0.1 keeps overlapping points readable.
train.plot.scatter(x='study_time', y='salary', alpha=0.1)
<matplotlib.axes._subplots.AxesSubplot at 0x7f13aa3a3c18>
# Reference: https://prob.space/competitions/salary-prediction/discussions/hiroki-Post95df6ed0b946a79c7d8b
train_test = train.copy()

# Commute adjustment, floored at 0:
#   city & partner == 1      -> commute / 2 - 1
#   non-city & partner == 1  -> commute - 1
#   city & partner == 0      -> commute - 1
# The non-city mask must be the negation of the city mask. Spelling it as
# (area != A) | (area != B) is true for every row and would adjust the city
# rows with a partner a second time.
is_city = train_test["area"].isin(["東京都", "大阪府"])
has_partner = train_test["partner"] == 1
no_partner = train_test["partner"] == 0
commute = train_test["commute"]
halved = (commute / 2 - 1).clip(lower=0)
reduced = (commute - 1).clip(lower=0)
adjusted = commute.mask(is_city & has_partner, halved)
adjusted = adjusted.mask((~is_city & has_partner) | (is_city & no_partner), reduced)
train_test["commute"] = adjusted
train_test.plot.scatter(x='commute', y='salary', alpha=0.1)
<matplotlib.axes._subplots.AxesSubplot at 0x7f13aa03ee48>
# Scatter plot of salary against overtime; alpha=0.1 keeps overlapping points readable.
train.plot.scatter(x='overtime', y='salary', alpha=0.1)
<matplotlib.axes._subplots.AxesSubplot at 0x7f13aa25ab38>

添付データ

  • salary_estimation_prediction.ipynb
    hirayuki

    あとから気づいたのですが le.fit(df[column]) のタイミングが非常によろしくないです。

    le.fit(X[column]) としてfor文の手前に持ってくるべきでした。すみません。

    Favicon
    new user
    コメントするには 新規登録 もしくは ログイン が必要です。