Author: (account since deleted)
# Mount Google Drive (Colab-only) so the competition data under MyDrive is reachable.
from google.colab import drive
drive.mount('/content/drive')
# set configuration
import os
class Config():
    """Project-wide constants: data/model/result locations and the global seed."""

    # Competition root inside the mounted Google Drive.
    root_path = '/content/drive/MyDrive/competition/ProbSpace/us_stock_price'
    input_path = os.path.join(root_path, 'input')
    model_path = os.path.join(root_path, 'model')
    result_path = os.path.join(root_path, 'result')
    # Single seed shared by python/numpy/lightgbm for reproducibility.
    seed = 42
# Create the output directories up front; exist_ok makes reruns idempotent.
# (Renamed the loop variable: `dir` shadowed the builtin of the same name.)
for out_dir in [Config.model_path, Config.result_path]:
    os.makedirs(out_dir, exist_ok=True)
import pandas as pd
import numpy as np
import random
import matplotlib.pylab as plt
import scipy.stats as stats
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GroupKFold
import lightgbm as lgb
# The bare 'max_columns' alias was removed in pandas >= 1.0; the full
# 'display.'-prefixed option key is required.
pd.set_option('display.max_columns', 50)
# Apply the 'bmh' matplotlib style sheet to all plots below.
plt.style.use('bmh')
def seed_everything(seed=2021):
    """Seed Python's RNG, NumPy's RNG and the hash-randomization env var."""
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
# Fix all RNG seeds once at start-up.
seed_everything(Config.seed)
# Load the price table (rows: dates, columns: ticker ids) and inspect it.
train_df = pd.read_csv(os.path.join(Config.input_path, 'train_data.csv'))
print(train_df.shape)
display(train_df)
train_df.info()
# Quick NaN audit before cleaning.
print('NaNの合計:', train_df.isnull().sum().sum())
print('NaNを含む行', train_df.index[train_df.isnull().any(axis=1)].to_list())
# Parse dates, drop any row containing NaN, and index the frame by date.
train_df['Date'] = pd.to_datetime(train_df['Date'])
train_df = train_df.dropna().set_index('Date')
display(train_df)
train_df.info()
Stock prices are said to have normally distributed changes in their log prices, so let's verify that.
First, look at the log prices themselves.
# Work in log space. Note this is log1p == log(1 + price), not log(price);
# it is consistently inverted with expm1 when the submission is built.
train_df = train_df.apply(np.log1p)
Plot them.
# Inspect three representative tickers in log space: level plot + histograms.
cols = ['VGSH', 'JEF', 'IVR']
train_df[cols].plot(figsize=(15,5))
train_df[cols].plot(
    subplots=True, kind='hist', bins=100,
    figsize=(15,10)
)
The log prices themselves deviate from a normal distribution.
def qqplot(dist):
    """Draw a normal Q-Q plot of *dist* in a fresh 5x5-inch figure."""
    plt.figure(figsize=(5, 5))
    stats.probplot(dist, dist='norm', plot=plt)
    plt.show()
# Q-Q plots: the log prices themselves deviate from normality.
for col in cols:
    qqplot(train_df[col])
Plot the change from one week earlier.
# 1-week log-price differences (log returns); the first row is padded with 0.
train_df[cols].diff(1).fillna(0).plot(figsize=(15,5))
train_df[cols].diff(1).fillna(0).plot(
    subplots=True, kind='hist', bins=100,
    figsize=(15,10)
)
Now much closer to a normal distribution.
# Q-Q plots of the 1-week differences: much closer to a normal distribution.
for col in cols:
    qqplot(train_df[col].diff(1).fillna(0))
Variance differs per ticker, so plot the changes divided by their standard deviation.
# Per-ticker variances differ, so scale each difference series by its own
# (sample) standard deviation before plotting.
weekly_changes = train_df[cols].diff(1).fillna(0)
scaled_changes = weekly_changes / weekly_changes.std()
scaled_changes.plot(figsize=(15, 5))
scaled_changes.plot(
    subplots=True, kind='hist', bins=100,
    figsize=(15, 10)
)
Mean stock price by year
# Average log price per calendar year (mean over dates, then over tickers).
yearly_mean = pd.Series(index=range(2012, 2020), dtype=np.float64)
for year in yearly_mean.index:
    yearly_mean[year] = train_df.loc[train_df.index.year == year].mean().mean()
yearly_mean.plot(figsize=(15, 10))
Mean stock price by month
# Average log price per calendar month.
monthly_mean = pd.Series(index=range(1, 13), dtype=np.float64)
for month in monthly_mean.index:
    monthly_mean[month] = train_df.loc[train_df.index.month == month].mean().mean()
monthly_mean.plot(figsize=(15, 10))
Mean stock price by week of year
# Average log price per ISO week of year.
weekly_mean = pd.Series(index=range(1, 53), dtype=np.float64)
for week in weekly_mean.index:
    weekly_mean[week] = train_df.loc[train_df.index.isocalendar().week == week].mean().mean()
weekly_mean.plot(figsize=(15, 10))
Load company_list.csv
# Load company metadata; rename 'Symbol' to 'id' to match train_df's tickers.
company_df = pd.read_csv(os.path.join(Config.input_path, 'company_list.csv')).rename(columns={'Symbol':'id'})
print(company_df.shape)
Display it
# Inspect the company table.
display(company_df)
company_df.info()
Tickers not present in company_df
# Tickers present in the price table but missing from the company table
# (column order of train_df is preserved).
known_ids = set(company_df['id'])
not_exist = [ticker for ticker in train_df.columns if ticker not in known_ids]
print(not_exist)
Add dummy rows for them for now
# DataFrame.append was removed in pandas 2.0 — build all dummy rows in a
# single concat instead. Metadata columns for these tickers stay NaN, exactly
# as the row-by-row append produced.
company_df = pd.concat(
    [company_df, pd.DataFrame({'id': not_exist})],
    ignore_index=True,
)
There are more rows than expected
# Keep only companies whose ticker actually appears in the price table.
company_df = company_df[company_df['id'].isin(train_df.columns)]
print(len(company_df))
Show the duplicated entries
# Show every ticker that appears more than once in the company table.
company_df[company_df.duplicated(subset='id', keep=False)].sort_values('id')
Duplicates come at most in pairs differing only in 'List', so split 'List' into two columns
# Duplicates come at most in pairs differing only in 'List', so spread 'List'
# into two columns (first and last value per id) and de-duplicate on id.
# For unique ids List1 == List2 because iloc[0] and iloc[-1] coincide.
company_df['List1'] = company_df[['id', 'List']].groupby('id').transform(lambda x: x.iloc[0])
company_df['List2'] = company_df[['id', 'List']].groupby('id').transform(lambda x: x.iloc[-1])
company_df = company_df.drop('List', axis=1).drop_duplicates(subset='id').reset_index(drop=True)
display(company_df)
company_df.info()
Number of tickers per Sector
# Assign the filled column back instead of fillna(inplace=True) on a column
# selection — the in-place form is deprecated under pandas copy-on-write and
# may not propagate to the parent frame.
company_df['Sector'] = company_df['Sector'].fillna('nothing')
company_df['Sector'].value_counts().plot(kind='bar', figsize=(15,10))
Mean stock price per Sector
# Mean log price over time, averaged across all tickers within each Sector.
tmp_df = pd.DataFrame(columns=company_df['Sector'].value_counts().index)
for sector in tmp_df.columns:
    tmp_df[sector] = train_df[company_df.loc[company_df['Sector'] == sector, 'id']].mean(axis=1)
tmp_df.plot(figsize=(15,10))
Number of tickers per Industry (top 10)
# Same copy-on-write-safe assignment as for 'Sector': no fillna(inplace=True)
# on a column selection.
company_df['Industry'] = company_df['Industry'].fillna('nothing')
company_df['Industry'].value_counts()[:10].plot(kind='bar', figsize=(15,10))
Mean stock price per Industry (top 10)
# Mean log price over time for the ten largest industries.
top_industries = company_df['Industry'].value_counts().index[:10]
tmp_df = pd.DataFrame(columns=top_industries)
for industry in tmp_df.columns:
    members = company_df.loc[company_df['Industry'] == industry, 'id']
    tmp_df[industry] = train_df[members].mean(axis=1)
tmp_df.plot(figsize=(15,10))
Number of tickers per List
# Same copy-on-write-safe assignment as for 'Sector'/'Industry'.
company_df['List1'] = company_df['List1'].fillna('nothing')
company_df['List1'].value_counts().plot(kind='bar', figsize=(15,10))
Mean stock price per List
# Mean log price over time per listing exchange (List1).
tmp_df = pd.DataFrame(columns=company_df['List1'].value_counts().index)
for listing in tmp_df.columns:
    members = company_df.loc[company_df['List1'] == listing, 'id']
    tmp_df[listing] = train_df[members].mean(axis=1)
tmp_df.plot(figsize=(15,10))
Append the prediction date and extract the date information
# Append an empty row for the week to predict, keep the dates in a separate
# Series, and switch train_df to a positional 0..N week index.
train_df.loc[pd.to_datetime('2019-11-24'), :] = np.nan
train_date = pd.Series(train_df.index)
train_df.reset_index(drop=True, inplace=True)
Reshape so each row holds one week of one ticker
# Transpose to (ticker x week), then melt to long format: one row per
# (ticker, week). 'Week' is the positional index into train_date.
# NOTE(review): 420 is assumed to be the row count including the appended
# prediction week — confirm against the loaded data.
train_df = train_df.T.reset_index().rename(columns={'index': 'id'})
train_df = pd.melt(
    train_df, id_vars='id', value_vars=[week for week in range(420)],
    var_name='Week', value_name='y'
)
display(train_df)
train_df.info()
Create the target variable
# Target engineering: model the standardized 1-week change of the log price.
# fillna(method='bfill') is deprecated (removed in pandas 3.0) — use .bfill().
train_df['y_prev'] = train_df[['id', 'y']].groupby('id')['y'].transform(lambda x: x.shift(1).bfill())
train_df['y_diff'] = train_df['y'] - train_df['y_prev']
# NOTE(review): despite the name, this is the per-ticker std of y itself, not
# of y_diff. It is only used as a scale factor — divided out here and
# multiplied back for the submission — so the pipeline is self-consistent.
train_df['y_diff_std'] = train_df[['id', 'y']].groupby('id')['y'].transform(lambda x: x.std())
train_df['y_diff_norm'] = train_df['y_diff'] / train_df['y_diff_std']
# Calendar features. Map Week -> date ONCE instead of four times (the
# original recomputed the same Series map for every feature).
week_dates = train_df['Week'].map(train_date)
train_df['Year'] = week_dates.dt.year
train_df['Month'] = week_dates.dt.month
train_df['Day'] = week_dates.dt.day
train_df['WeekOfYear'] = week_dates.dt.isocalendar().week.astype(int)
Merge company_df
# Attach company metadata to every (ticker, week) row.
train_df = pd.merge(train_df, company_df, on='id', how='left')
display(train_df)
train_df.info()
Label encoding
# Integer-encode each categorical company column (NaN mapped to 'nothing');
# a fresh LabelEncoder is fitted per column, exactly as before.
for cat_col in ['Sector', 'Industry', 'List1', 'List2']:
    train_df[f'enc_{cat_col}'] = LabelEncoder().fit_transform(train_df[cat_col].fillna('nothing'))
Lag features
def create_lags(df, group_col, val_col, lags):
    """Return a frame with one column per lag: val_col shifted within each group.

    Columns are named ``lag_{lag}_{val_col}`` and stay aligned to df's index.
    """
    grouped = df[[group_col, val_col]].groupby(group_col)[val_col]
    out = pd.DataFrame()
    for lag in lags:
        out[f'lag_{lag}_{val_col}'] = grouped.shift(lag)
    return out
# Lag features (1..4 weeks back) of the normalized weekly change, per ticker.
lag_df = create_lags(train_df, 'id', 'y_diff_norm', [1,2,3,4])
display(lag_df)
lag_df.info()
Rolling mean/std features
def create_rolls(df, group_col, val_col, lags, rolls):
    """Per-group rolling mean/std features of val_col.

    For each (lag, roll) pair, the series is shifted by ``lag`` within its
    group and aggregated over a ``roll``-sized window.

    Bug fix: the original column names omitted the window size, so every
    window in ``rolls`` overwrote the previous one and only the last window's
    features survived. The window size is now part of the column name
    (``rmean_{lag}_{roll}_{val_col}`` / ``rstd_{lag}_{roll}_{val_col}``).
    """
    grouped = df[[group_col, val_col]].groupby(group_col)[val_col]
    roll_df = pd.DataFrame()
    for lag in lags:
        for roll in rolls:
            roll_df[f'rmean_{lag}_{roll}_{val_col}'] = grouped.transform(
                lambda x: x.shift(lag).rolling(roll).mean())
            roll_df[f'rstd_{lag}_{roll}_{val_col}'] = grouped.transform(
                lambda x: x.shift(lag).rolling(roll).std())
    return roll_df
# Rolling statistics of the normalized change over several window lengths.
roll_df = create_rolls(train_df, 'id', 'y_diff_norm', [1], [4, 9, 13, 26, 52])
display(roll_df)
roll_df.info()
Merge lag_df/roll_df
# Column-wise concat is safe: lag_df/roll_df share train_df's row index.
train_df = pd.concat([train_df, lag_df, roll_df], axis=1)
display(train_df)
train_df.info()
Prepare for training
# Week 419 is the appended prediction week -> test set; earlier weeks train.
test_df = train_df.loc[train_df['Week'] == 419].reset_index(drop=True)
train_df = train_df.loc[train_df['Week'] < 419].reset_index(drop=True)
# Exclude targets, identifiers and the raw (un-encoded) categoricals.
useless_cols = [
    'y', 'y_prev', 'y_diff', 'y_diff_std', 'y_diff_norm',
    'Week', 'id', 'Name', 'Sector', 'Industry', 'List1', 'List2'
]
usable_cols = train_df.columns[~train_df.columns.isin(useless_cols)]
target_col = 'y_diff_norm'
x_train = train_df[usable_cols]
y_train = train_df[target_col]
x_test = test_df[usable_cols]
LightGBM parameters
# LightGBM hyper-parameters for the regression on the standardized change.
lgb_params = {
    'objective': 'regression',
    'importance_type': 'gain',
    'metric': 'rmse',
    'seed': Config.seed,
    'n_jobs': -1,
    'verbose': -1,
    'n_estimators': 500, 'learning_rate': 0.1,
    'boosting_type': 'gbdt',
    # Heavy row/column subsampling and a large min_child_samples —
    # presumably to damp overfitting on noisy returns (not tuned here).
    'subsample': 0.5, 'subsample_freq': 1,
    'colsample_bytree': 0.5,
    'num_leaves': 127, 'min_child_samples': 255,
    'max_bin': 100,
}
Train / predict
# Grouped CV by ticker id: every week of a ticker lands in the same fold, so
# each model is validated on tickers it never saw.
y_diff_std = train_df['y_diff_std']
groups = train_df['id']
y_oof = np.zeros(len(y_train))
y_preds = []
kf = GroupKFold(n_splits=5)
for fold, (tr_idx, vl_idx) in enumerate(kf.split(x_train, y_train, groups)):
    x_tr_fold = x_train.iloc[tr_idx]
    y_tr_fold = y_train.iloc[tr_idx]
    x_vl_fold = x_train.iloc[vl_idx]
    y_vl_fold = y_train.iloc[vl_idx]
    model = lgb.LGBMRegressor(**lgb_params)
    # fit(verbose=..., early_stopping_rounds=...) was removed in lightgbm 4.0;
    # the callback API below is equivalent and works from 3.3 onward.
    model.fit(
        x_tr_fold, y_tr_fold,
        eval_set=(x_vl_fold, y_vl_fold),
        eval_metric='rmse',
        callbacks=[lgb.early_stopping(100, verbose=False)],
    )
    y_oof[vl_idx] = model.predict(x_vl_fold)
    y_preds.append(model.predict(x_test))
    # Fold RMSE, rescaled to log-price units by the per-ticker std.
    print(
        f'fold {fold} score:',
        np.sqrt(np.mean(np.square((y_oof[vl_idx] - y_vl_fold) * y_diff_std[vl_idx])))
    )
# Bug fix: the original re-printed the LAST fold's slice as the "oof score";
# score the full out-of-fold vector against all targets instead.
print(
    'oof score:',
    np.sqrt(np.mean(np.square((y_oof - y_train.values) * y_diff_std.values)))
)
Generate the submission file
# Build the submission: average the fold predictions, undo the per-ticker std
# scaling, add back the previous week's log1p price, then invert log1p with
# expm1 to return to price space.
submission_df = pd.read_csv(os.path.join(Config.input_path, 'submission_template.csv'))
submission_df['y'] = np.expm1(
    np.mean(y_preds, axis=0) * test_df['y_diff_std'].values + test_df['y_prev'].values
)
submission_df.to_csv(os.path.join(Config.result_path, 'submission.csv'), index=False)