テクニカル分析を駆使して成長銘柄を予言しよう!
Deleted user
```python
# mount drive
from google.colab import drive
drive.mount('/content/drive')
```
```python
# set configuration
import os

class Config:
    root_path = '/content/drive/MyDrive/competition/ProbSpace/us_stock_price'
    input_path = os.path.join(root_path, 'input')
    model_path = os.path.join(root_path, 'model')
    result_path = os.path.join(root_path, 'result')
    seed = 42
```
```python
# create dirs (dir_path instead of dir, to avoid shadowing the builtin)
for dir_path in [Config.model_path, Config.result_path]:
    os.makedirs(dir_path, exist_ok=True)
```
```python
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
import scipy.stats as stats
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GroupKFold
import lightgbm as lgb
```
```python
pd.set_option('display.max_columns', 50)  # the bare 'max_columns' key errors on recent pandas
plt.style.use('bmh')
```
```python
def seed_everything(seed=2021):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

seed_everything(Config.seed)
```
```python
train_df = pd.read_csv(os.path.join(Config.input_path, 'train_data.csv'))
print(train_df.shape)
```
```python
display(train_df)
train_df.info()
```
```python
print('Total NaNs:', train_df.isnull().sum().sum())
print('Rows containing NaN:', train_df.index[train_df.isnull().any(axis=1)].to_list())
```
```python
train_df['Date'] = pd.to_datetime(train_df['Date'])
train_df = train_df.dropna().set_index('Date')
display(train_df)
train_df.info()
```
Changes in log stock prices are said to follow a normal distribution, so let's verify that.

First, look at the log prices themselves.
```python
train_df = train_df.apply(np.log1p)
```
Plot them.
```python
cols = ['VGSH', 'JEF', 'IVR']
train_df[cols].plot(figsize=(15, 5))
```
```python
train_df[cols].plot(
    subplots=True, kind='hist', bins=100, figsize=(15, 10)
)
```
The log prices themselves deviate from a normal distribution; the Q-Q plots confirm this.
```python
def qqplot(dist):
    plt.figure(figsize=(5, 5))
    stats.probplot(dist, dist='norm', plot=plt)
    plt.show()

for col in cols:
    qqplot(train_df[col])
```
Plot the change from one week earlier.
```python
train_df[cols].diff(1).fillna(0).plot(figsize=(15, 5))
```
```python
train_df[cols].diff(1).fillna(0).plot(
    subplots=True, kind='hist', bins=100, figsize=(15, 10)
)
```
The distributions are much closer to normal.
```python
for col in cols:
    qqplot(train_df[col].diff(1).fillna(0))
```
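The Q-Q plots are suggestive but qualitative. As an extra, hedged check (not in the original notebook), scipy's D'Agostino-Pearson test quantifies how normal the weekly changes look; the 0.05 threshold below is an arbitrary illustrative choice:

```python
# Sketch: quantify normality of the weekly log-price changes for the three
# sample tickers. stats.normaltest is scipy's D'Agostino-Pearson test.
for col in cols:
    diff = train_df[col].diff(1).dropna()
    stat, p = stats.normaltest(diff)
    verdict = 'roughly normal' if p > 0.05 else 'normality rejected'
    print(f'{col}: statistic={stat:.2f}, p-value={p:.4f} -> {verdict}')
```

Financial returns are usually heavy-tailed, so a rejection here would not be surprising; the plots above only claim the diffs are *closer* to normal than the raw log prices.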
The variance differs from ticker to ticker, so plot the changes divided by their standard deviation.
```python
train_df[cols].diff(1).fillna(0).apply(lambda x: x / x.std()).plot(
    figsize=(15, 5)
)
```
```python
train_df[cols].diff(1).fillna(0).apply(lambda x: x / x.std()).plot(
    subplots=True, kind='hist', bins=100, figsize=(15, 10)
)
```
Average (log) stock price by year
```python
df = pd.Series(index=range(2012, 2019 + 1), dtype=np.float64)
for y in df.index:
    df[y] = train_df.loc[train_df.index.year == y].mean().mean()
df.plot(figsize=(15, 10))
```
Average (log) stock price by month
```python
df = pd.Series(index=range(1, 12 + 1), dtype=np.float64)
for m in df.index:
    df[m] = train_df.loc[train_df.index.month == m].mean().mean()
df.plot(figsize=(15, 10))
```
Average (log) stock price by week of year
```python
df = pd.Series(index=range(1, 52 + 1), dtype=np.float64)
for w in df.index:
    df[w] = train_df.loc[train_df.index.isocalendar().week == w].mean().mean()
df.plot(figsize=(15, 10))
```
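The three loops above can also be written as groupby aggregations; a sketch of the equivalent (same numbers: first average the weeks within each period per ticker, then average across tickers):

```python
# Sketch (not in the original): the same period averages without explicit loops.
yearly = train_df.groupby(train_df.index.year).mean().mean(axis=1)
monthly = train_df.groupby(train_df.index.month).mean().mean(axis=1)
weekly = train_df.groupby(train_df.index.isocalendar().week).mean().mean(axis=1)
yearly.plot(figsize=(15, 5))
```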
Load company_list.csv.
```python
company_df = pd.read_csv(
    os.path.join(Config.input_path, 'company_list.csv')
).rename(columns={'Symbol': 'id'})
print(company_df.shape)
```
Display it.
```python
display(company_df)
company_df.info()
```
Tickers not present in company_df:
```python
not_exist = list(train_df.columns[~train_df.columns.isin(company_df['id'])])
print(not_exist)
```
Add dummy rows for these for now.
```python
# DataFrame.append was removed in pandas 2.0, so build the dummies with pd.concat
company_df = pd.concat(
    [company_df, pd.DataFrame({'id': not_exist})], ignore_index=True
)
```
The list has far more rows than we need, so keep only the ids that appear in train_df.
```python
company_df = company_df[company_df['id'].isin(train_df.columns)]
print(len(company_df))
```
Show the duplicated entries.
```python
company_df[company_df.duplicated(subset='id', keep=False)].sort_values('id')
```
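Before collapsing the duplicates, a small sketch (not in the original) that asserts the assumption programmatically: every id appears at most twice, and duplicated rows differ only in the List column.

```python
# Sketch: verify the duplicate structure before collapsing it.
dup_counts = company_df['id'].value_counts()
assert dup_counts.max() <= 2, 'some id appears more than twice'
other_cols = [c for c in company_df.columns if c not in ('id', 'List')]
# within each id, every column except List should hold a single value
varies = company_df.groupby('id')[other_cols].nunique(dropna=False)
assert (varies <= 1).all().all(), 'duplicates differ outside the List column'
```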
Each id is duplicated at most twice and only the List column differs, so split List into two columns.
```python
company_df['List1'] = company_df[['id', 'List']].groupby('id').transform(lambda x: x.iloc[0])
company_df['List2'] = company_df[['id', 'List']].groupby('id').transform(lambda x: x.iloc[-1])
company_df = company_df.drop('List', axis=1).drop_duplicates(subset='id').reset_index(drop=True)
display(company_df)
company_df.info()
```
Number of tickers per Sector
```python
# assignment instead of chained inplace fillna, which recent pandas discourages
company_df['Sector'] = company_df['Sector'].fillna('nothing')
company_df['Sector'].value_counts().plot(kind='bar', figsize=(15, 10))
```
Average (log) stock price per Sector
```python
tmp_df = pd.DataFrame(columns=company_df['Sector'].value_counts().index)
for sector in tmp_df.columns:
    tmp_df[sector] = train_df[company_df.loc[company_df['Sector'] == sector, 'id']].mean(axis=1)
tmp_df.plot(figsize=(15, 10))
```
Number of tickers per Industry (top 10)
```python
company_df['Industry'] = company_df['Industry'].fillna('nothing')
company_df['Industry'].value_counts()[:10].plot(kind='bar', figsize=(15, 10))
```
Average (log) stock price per Industry (top 10)
```python
tmp_df = pd.DataFrame(columns=company_df['Industry'].value_counts().index[:10])
for industry in tmp_df.columns:
    tmp_df[industry] = train_df[company_df.loc[company_df['Industry'] == industry, 'id']].mean(axis=1)
tmp_df.plot(figsize=(15, 10))
```
Number of tickers per List
```python
company_df['List1'] = company_df['List1'].fillna('nothing')
company_df['List1'].value_counts().plot(kind='bar', figsize=(15, 10))
```
Average (log) stock price per List
```python
tmp_df = pd.DataFrame(columns=company_df['List1'].value_counts().index)
for lst in tmp_df.columns:
    tmp_df[lst] = train_df[company_df.loc[company_df['List1'] == lst, 'id']].mean(axis=1)
tmp_df.plot(figsize=(15, 10))
```
Append the prediction date as an empty row and keep the date information aside.
```python
train_df.loc[pd.to_datetime('2019-11-24'), :] = np.nan
train_date = pd.Series(train_df.index)
train_df.reset_index(drop=True, inplace=True)
```
Reshape so that each row holds one week of one ticker.
```python
train_df = train_df.T.reset_index().rename(columns={'index': 'id'})
train_df = pd.melt(
    train_df,
    id_vars='id',
    value_vars=list(range(420)),
    var_name='Week',
    value_name='y'
)
display(train_df)
train_df.info()
```
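As a quick sanity check on the reshape (not in the original), pivoting back to the wide layout should recover the original shape of 420 weeks by the number of tickers:

```python
# Sketch: round-trip the melt to confirm nothing was lost or duplicated.
wide = train_df.pivot(index='Week', columns='id', values='y')
print(wide.shape)  # expected: (420, <number of tickers>)
```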
Create the target variable.
```python
train_df['y_prev'] = train_df.groupby('id')['y'].transform(lambda x: x.shift(1).bfill())
train_df['y_diff'] = train_df['y'] - train_df['y_prev']
# normalize by the std of the weekly change, matching the EDA above
# (the original grouped on 'y' here, i.e. the std of the log price itself)
train_df['y_diff_std'] = train_df.groupby('id')['y_diff'].transform('std')
train_df['y_diff_norm'] = train_df['y_diff'] / train_df['y_diff_std']
```
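Since the submission step will invert this transform (y = y_prev + y_diff_norm * y_diff_std), a hedged sketch to confirm the round trip; it assumes no ticker has a zero-variance diff series, which would make y_diff_norm NaN:

```python
# Sketch (not in the original): the target transform should be exactly
# invertible wherever y is known.
recon = train_df['y_prev'] + train_df['y_diff_norm'] * train_df['y_diff_std']
mask = train_df['y'].notna()  # the appended prediction week has y == NaN
assert np.allclose(recon[mask], train_df.loc[mask, 'y'])
```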
```python
dates = train_df['Week'].map(train_date)  # map the week index once, not four times
train_df['Year'] = dates.dt.year
train_df['Month'] = dates.dt.month
train_df['Day'] = dates.dt.day
train_df['WeekOfYear'] = dates.dt.isocalendar().week.astype(int)
```
Merge in company_df.
```python
train_df = pd.merge(train_df, company_df, on='id', how='left')
display(train_df)
train_df.info()
```
Label encoding.
```python
train_df['enc_Sector'] = LabelEncoder().fit_transform(train_df['Sector'].fillna('nothing'))
train_df['enc_Industry'] = LabelEncoder().fit_transform(train_df['Industry'].fillna('nothing'))
train_df['enc_List1'] = LabelEncoder().fit_transform(train_df['List1'].fillna('nothing'))
train_df['enc_List2'] = LabelEncoder().fit_transform(train_df['List2'].fillna('nothing'))
```
Lag features.
```python
def create_lags(df, group_col, val_col, lags):
    lag_df = pd.DataFrame()
    for lag in lags:
        lag_df[f'lag_{lag}_{val_col}'] = df.groupby(group_col)[val_col].transform(
            lambda x: x.shift(lag)
        )
    return lag_df

lag_df = create_lags(train_df, 'id', 'y_diff_norm', [1, 2, 3, 4])
display(lag_df)
lag_df.info()
```
Rolling mean/std features.
```python
def create_rolls(df, group_col, val_col, lags, rolls):
    roll_df = pd.DataFrame()
    for lag in lags:
        for roll in rolls:
            # include the window size in the column name; the original reused the
            # same name for every window, so only the last window survived
            roll_df[f'rmean_{lag}_{roll}_{val_col}'] = df.groupby(group_col)[val_col].transform(
                lambda x: x.shift(lag).rolling(roll).mean()
            )
            roll_df[f'rstd_{lag}_{roll}_{val_col}'] = df.groupby(group_col)[val_col].transform(
                lambda x: x.shift(lag).rolling(roll).std()
            )
    return roll_df

roll_df = create_rolls(train_df, 'id', 'y_diff_norm', [1], [4, 9, 13, 26, 52])
display(roll_df)
roll_df.info()
```
Merge lag_df/roll_df.
```python
train_df = pd.concat([train_df, lag_df, roll_df], axis=1)
display(train_df)
train_df.info()
```
Prepare the training data.
```python
test_df = train_df.loc[train_df['Week'] == 419].reset_index(drop=True)
train_df = train_df.loc[train_df['Week'] < 419].reset_index(drop=True)

useless_cols = [
    'y', 'y_prev', 'y_diff', 'y_diff_std', 'y_diff_norm',
    'Week', 'id', 'Name', 'Sector', 'Industry', 'List1', 'List2'
]
usable_cols = train_df.columns[~train_df.columns.isin(useless_cols)]
target_col = 'y_diff_norm'

x_train = train_df[usable_cols]
y_train = train_df[target_col]
x_test = test_df[usable_cols]
```
LightGBM parameters.
```python
lgb_params = {
    'objective': 'regression',
    'importance_type': 'gain',
    'metric': 'rmse',
    'seed': Config.seed,
    'n_jobs': -1,
    'verbose': -1,
    'n_estimators': 500,
    'learning_rate': 0.1,
    'boosting_type': 'gbdt',
    'subsample': 0.5,
    'subsample_freq': 1,
    'colsample_bytree': 0.5,
    'num_leaves': 127,
    'min_child_samples': 255,
    'max_bin': 100,
}
```
Train / predict.
```python
y_diff_std = train_df['y_diff_std']
groups = train_df['id']
y_oof = np.zeros(len(y_train))
y_preds = []

kf = GroupKFold(n_splits=5)
for fold, (tr_idx, vl_idx) in enumerate(kf.split(x_train, y_train, groups)):
    x_tr_fold = x_train.iloc[tr_idx]
    y_tr_fold = y_train.iloc[tr_idx]
    x_vl_fold = x_train.iloc[vl_idx]
    y_vl_fold = y_train.iloc[vl_idx]

    model = lgb.LGBMRegressor(**lgb_params)
    # lightgbm >= 4.0 removed the verbose / early_stopping_rounds fit arguments,
    # so pass callbacks instead
    model.fit(
        x_tr_fold, y_tr_fold,
        eval_set=[(x_vl_fold, y_vl_fold)],
        eval_metric='rmse',
        callbacks=[lgb.early_stopping(100), lgb.log_evaluation(0)],
    )
    y_oof[vl_idx] = model.predict(x_vl_fold)
    y_preds.append(model.predict(x_test))
    # fold RMSE, rescaled back to the log-price scale
    print(
        f'fold {fold} score:',
        np.sqrt(np.mean(np.square((y_oof[vl_idx] - y_vl_fold.values) * y_diff_std.iloc[vl_idx].values)))
    )

# score over the full out-of-fold predictions
# (the original reused only the last fold's indices here)
print(
    'oof score:',
    np.sqrt(np.mean(np.square((y_oof - y_train.values) * y_diff_std.values)))
)
```
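GroupKFold with the ticker as the group should place all rows of a given id on one side of every split. A sketch (not in the original) that double-checks no id leaks across the train/validation boundary; GroupKFold is deterministic, so it reproduces the splits used above:

```python
# Sketch: assert the folds are disjoint at the ticker level.
kf_check = GroupKFold(n_splits=5)
for tr_idx, vl_idx in kf_check.split(x_train, y_train, groups):
    overlap = set(groups.iloc[tr_idx]) & set(groups.iloc[vl_idx])
    assert not overlap, f'ids leaked across the split: {sorted(overlap)[:5]}'
```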
Generate the submission file.
```python
submission_df = pd.read_csv(os.path.join(Config.input_path, 'submission_template.csv'))
submission_df['y'] = np.expm1(
    np.mean(y_preds, axis=0) * test_df['y_diff_std'].values + test_df['y_prev'].values
)
submission_df.to_csv(os.path.join(Config.result_path, 'submission.csv'), index=False)
```
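Before uploading, a small sanity-check sketch (not in the original). It assumes, as the cell above already does, that the template rows are in the same ticker order as test_df, and that no last observed price is zero:

```python
# Sketch: no NaNs in the submission, and predictions should sit near the
# last observed prices (an extreme ratio would hint at a scale/sign mistake).
assert submission_df['y'].notna().all(), 'submission contains NaN predictions'
last_price = np.expm1(test_df['y_prev'].values)  # last observed raw price
ratio = submission_df['y'].values / last_price
print('prediction / last price: min={:.3f}, median={:.3f}, max={:.3f}'.format(
    ratio.min(), np.median(ratio), ratio.max()))
```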