hirayuki
[Environment] Google Colaboratory
from google.colab import drive
drive.mount('/content/drive/')
Mounted at /content/drive/
# Change this path to match your own environment
%cd /content/drive/My\ Drive/予測コンペ/ProbSpace/YouTube動画視聴回数予測
!ls
/content/drive/My Drive/予測コンペ/ProbSpace/YouTube動画視聴回数予測
lightgbmparams.txt  submission_lgb.csv  test_data.csv  train_data.csv  youtube_estimation_prediction.ipynb
import featuretools as ft
import lightgbm as lgb
#import optuna
import numpy as np
import sklearn.datasets
import sklearn.metrics
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
import xgboost as xgb
import re
import seaborn as sns
from tensorflow import keras
import keras.layers as L
from datetime import datetime, timezone, timedelta
from keras.models import Model
from sklearn.decomposition import PCA
from keras import losses
from sklearn import preprocessing
from sklearn.metrics import log_loss
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_log_error
import unicodedata
!pip install japanize-matplotlib
import japanize_matplotlib
!pip install jeraconv
from jeraconv import jeraconv
Successfully installed japanize-matplotlib-1.1.1
Successfully installed jeraconv-0.2.1
train = pd.read_csv("train_data.csv")
print("train shape is " + str(train.shape))
train.head(10)
train shape is (19720, 17)
 | id | video_id | title | publishedAt | channelId | channelTitle | categoryId | collection_date | tags | likes | dislikes | comment_count | thumbnail_link | comments_disabled | ratings_disabled | description | y |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
0 | 1 | GDtyztIThRQ | [12] BGM Inazuma Eleven 3 - ~ライオコツト ダンジョン~ | 2011-01-09T05:50:33.000Z | UCQaNYC3dNvH8FqrEyK7hTJw | DjangoShiny | 20 | 20.01.02 | Inazuma|Eleven|Super|Once|bgm|ost|イナズマイレブン|Kyo... | 114 | 0 | 7 | https://i.ytimg.com/vi/GDtyztIThRQ/default.jpg | False | False | ~ライオコツト ダンジョン~Inazuma Eleven 3 BGM Complete (R... | 29229 |
1 | 2 | m4H9s3GtTlQ | ねごと - メルシールー [Official Music Video] | 2012-07-23T03:00:09.000Z | UChMWDi-HBm5aS3jyRSaAWUA | ねごと Official Channel | 10 | 20.08.02 | ねごと|ネゴト|メルシールー|Re:myend|リマインド|Lightdentity|ライデ... | 2885 | 50 | 111 | https://i.ytimg.com/vi/m4H9s3GtTlQ/default.jpg | False | False | http://www.negoto.com/全員平成生まれ、蒼山幸子(Vo&Key)、沙田瑞... | 730280 |
2 | 3 | z19zYZuLuEU | VF3tb 闇よだれvsちび太 (SEGA) | 2007-07-26T13:54:09.000Z | UCBdcyoZSt5HBLd_n6we-xIg | siropai | 24 | 20.14.01 | VF3|VF4|VF5|ちび太|闇よだれ|chibita|virtuafighter|seg... | 133 | 17 | 14 | https://i.ytimg.com/vi/z19zYZuLuEU/default.jpg | False | False | Beat-tribe cup finalhttp://ameblo.jp/siropai/ | 80667 |
3 | 4 | pmcIOsL7s98 | free frosty weekend! | 2005-05-15T02:38:43.000Z | UC7K5am1UAQEsCRhzXpi9i1g | Jones4Carrie | 22 | 19.22.12 | frosty | 287 | 51 | 173 | https://i.ytimg.com/vi/pmcIOsL7s98/default.jpg | False | False | I look so bad but look at me! | 34826 |
4 | 5 | ZuQgsTcuM-4 | トップ・オブ・ザ・ワールド | 2007-09-09T09:52:47.000Z | UCTW1um4R-QWa8iIfITGvlZQ | Tatsuya Maruyama | 10 | 20.08.01 | ギター|guitar|南澤大介|トップオブザワールド|トップ|オブ|ワールド|カーペンターズ... | 178 | 6 | 17 | https://i.ytimg.com/vi/ZuQgsTcuM-4/default.jpg | False | False | ソロギターのしらべより「トップオブザワールド」です。クラシックギターで弾いてます。Offic... | 172727 |
5 | 6 | GivuDeAGhyk | ゲンム や スナイプ たちとのグリーティング 💛 仮面ライダーエグゼイドスペシャルショー に... | 2017-01-11T00:34:20.000Z | UCWy5UcrxbfXg5IW47_ntAXQ | はれママ キッズTV | 24 | 20.09.02 | ゲンム|スナイプ|グリーティング|仮面ライダーエグゼイド|ジュウオウジャー|魔法学校の制服|... | 0 | 0 | 53 | https://i.ytimg.com/vi/GivuDeAGhyk/default.jpg | False | True | 先日のよみうりランドで行われた「 仮面ライダーエグゼイドスペシャルショー」の時のグリーティン... | 1358158 |
6 | 7 | yiYr2-6LtcU | Juice=Juice『「ひとりで生きられそう」って それってねえ、褒めているの?』(Pro... | 2019-05-24T08:00:11.000Z | UC6FadPgGviUcq6VQ0CEJqdQ | JuiceJuice | 10 | 20.09.02 | Juice=Juice|JuiceJuice|ジュースジュース|ジュース|ハロー!プロジェク... | 36905 | 394 | 4066 | https://i.ytimg.com/vi/yiYr2-6LtcU/default.jpg | False | False | 2019年6月5日発売のJuice=Juice 12thシングル『「ひとりで生きられそう」っ... | 2881014 |
7 | 8 | TUPHOUN2T30 | Yersiz7-5/8 | 2007-09-01T21:24:46.000Z | UC1zw3DnHyfaA88T8NSn0Upw | AcemCadi | 15 | 20.08.01 | 5 | 5 | 3 | 0 | https://i.ytimg.com/vi/TUPHOUN2T30/default.jpg | False | False | 5 | 12711 |
8 | 9 | kRCi9nxy-Uc | ドリフト専用 GT-R開発ストーリーⅡ ~進化するモンスターマシン 【本編】|TOYO TIRES | 2017-06-27T10:55:01.000Z | UCkW0S2pnXBY2R03jM5C-77Q | TOYO TIRES JAPAN | 2 | 20.09.02 | Drift|Drifting|Nissan GT-R|ドリフト|Team TOYO TIRE... | 4638 | 300 | 439 | https://i.ytimg.com/vi/kRCi9nxy-Uc/default.jpg | False | False | 競技のためのドリフト走行のみを見据え、開発されたTeam TOYO TIRES DRIFTの... | 1003949 |
9 | 10 | G6s8HF1WsJY | BUMP OF CHICKEN「話がしたいよ」 | 2018-10-14T15:00:02.000Z | UCOfESRUR5duQ2hMnTQ4oqhA | BUMP OF CHICKEN | 10 | 20.09.02 | BUMP OF CHICKEN|億男 | 80206 | 1545 | 11012 | https://i.ytimg.com/vi/G6s8HF1WsJY/default.jpg | False | False | BUMP OF CHICKEN「話がしたいよ」※映画『億男』主題歌2018.10.15 (m... | 13039631 |
test = pd.read_csv("test_data.csv")
iddf = test[["id"]]
print("test shape is " + str(test.shape))
test.head(1)
test shape is (29582, 16)
 | id | video_id | title | publishedAt | channelId | channelTitle | categoryId | collection_date | tags | likes | dislikes | comment_count | thumbnail_link | comments_disabled | ratings_disabled | description |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
0 | 1 | xU8UcB6RbLE | Frightened Rabbit - The Greys | 2007-09-26T11:00:07.000Z | UCOQ_j8Qg4-p0lGKBpXYENbg | Fatcat Records | 10 | 20.08.01 | Fatcat|Records|Frightened|Rabbit|The|Greys | 471 | 38 | 61 | https://i.ytimg.com/vi/xU8UcB6RbLE/default.jpg | False | False | Director: Fraser CampbellDate:2007Taken from F... |
train = pd.read_csv("train_data.csv")
test = pd.read_csv("test_data.csv")
train["y"] = np.log(train["y"])
mean_ = train[["categoryId", "y"]].groupby("categoryId").mean().reset_index().rename({"y":"mean"}, axis=1)
max_ = train[["categoryId", "y"]].groupby("categoryId").max().reset_index().rename({"y":"max"}, axis=1)
min_ = train[["categoryId", "y"]].groupby("categoryId").min().reset_index().rename({"y":"min"}, axis=1)
std_ = train[["categoryId", "y"]].groupby("categoryId").std().reset_index().rename({"y":"std"}, axis=1)
count_ = train[["categoryId", "y"]].groupby("categoryId").count().reset_index().rename({"y":"count"}, axis=1)
q1_ = train[["categoryId", "y"]].groupby("categoryId").quantile(0.1).reset_index().rename({"y":"q1"}, axis=1)
q25_ = train[["categoryId", "y"]].groupby("categoryId").quantile(0.25).reset_index().rename({"y":"q25"}, axis=1)
q5_ = train[["categoryId", "y"]].groupby("categoryId").quantile(0.5).reset_index().rename({"y":"q5"}, axis=1)
q75_ = train[["categoryId", "y"]].groupby("categoryId").quantile(0.75).reset_index().rename({"y":"q75"}, axis=1)
q9_ = train[["categoryId", "y"]].groupby("categoryId").quantile(0.9).reset_index().rename({"y":"q9"}, axis=1)
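For reference, the ten groupby calls above can be collapsed into a single named aggregation that yields the same per-categoryId statistics (a sketch only; the per-column merges inside create_features below are left as they are):
# Equivalent one-shot aggregation (sketch, same columns as the ten frames above)
cat_stats = (train.groupby("categoryId")["y"]
             .agg(mean="mean", max="max", min="min", std="std", count="count",
                  q1=lambda s: s.quantile(0.1), q25=lambda s: s.quantile(0.25),
                  q5=lambda s: s.quantile(0.5), q75=lambda s: s.quantile(0.75),
                  q9=lambda s: s.quantile(0.9))
             .reset_index())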
def is_japanese(string):
for ch in string:
try:
name = unicodedata.name(ch)
if "CJK UNIFIED" in name \
or "HIRAGANA" in name \
or "KATAKANA" in name:
return True
except:
continue
return False
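A quick sanity check of the helper, using titles from the sample rows shown above:
# is_japanese flags any string containing CJK, hiragana, or katakana characters
assert is_japanese("ねごと - メルシールー [Official Music Video]")   # katakana present -> True
assert not is_japanese("Frightened Rabbit - The Greys")              # ASCII only -> False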
y = train["y"]
del train["y"]
df = pd.concat([train, test])
df["tags"].fillna("[none]", inplace=True)
tagdic = dict(pd.Series("|".join(list(df["tags"])).split("|")).value_counts().sort_values())
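tagdic maps every tag (counted across train and test) to how often it appears; the tags_point feature computed below just sums those counts over a video's tags. A tiny illustration with made-up tags and counts:
example_tagdic = {"bgm": 120, "guitar": 45}                      # made-up counts, illustration only
example_tags = "bgm|guitar"
print(sum(example_tagdic[t] for t in example_tags.split("|")))   # -> 165, which is what tags_point computes per row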
def bool_to_int(df):
df["comments_disabled"] = df["comments_disabled"].astype(np.int16)
df["ratings_disabled"] = df["ratings_disabled"].astype(np.int16)
return df
def create_features(df):
# like dislike comment
#df["likes2"] = df["likes"]**2
df["loglikes"] = np.log(df["likes"]+1)
#df["dislikes2"] = df["dislikes"]**2
df["logdislikes"] = np.log(df["dislikes"]+1)
df["logcomment_count"] = np.log(df["comment_count"]+1)
df["sqrtlikes"] = np.sqrt(df["likes"])
df["like_dislike_ratio"] = df["likes"]/(df["dislikes"]+1)
df["comments_like_ratio"] = df["comment_count"]/(df["likes"]+1)
df["comments_dislike_ratio"] = df["comment_count"]/(df["dislikes"]+1)
# likes comments diable
df["likes_com"] = df["likes"] * df["comments_disabled"]
df["dislikes_com"] = df["dislikes"] * df["comments_disabled"]
df["comments_likes"] = df["comment_count"] * df["ratings_disabled"]
# tags
df["num_tags"] = df["tags"].astype(str).apply(lambda x: len(x.split("|")))
df["length_tags"] = df["tags"].astype(str).apply(lambda x: len(x))
df["tags_point"] = df["tags"].apply(lambda tags: sum([tagdic[tag] for tag in tags.split("|")]))
df["count_en_tag"] = df["tags"].apply(lambda x: sum([bool(re.search(r'[a-zA-Z0-9]', x_)) for x_ in x.split("|")]))
df["count_ja_tag"] = df["tags"].apply(lambda x: sum([is_japanese(x_) for x_ in x.split("|")]))
# publishedAt
df["publishedAt"] = pd.to_datetime(df["publishedAt"], utc=True)
df["publishedAt_year"] = df["publishedAt"].apply(lambda x: x.year)
df["publishedAt_month"] = df["publishedAt"].apply(lambda x: x.month)
df["publishedAt_day"] = df["publishedAt"].apply(lambda x: x.day)
df["publishedAt_hour"] = df["publishedAt"].apply(lambda x: x.hour)
df["publishedAt_minute"] = df["publishedAt"].apply(lambda x: x.minute)
#df["publishedAt_second"] = df["publishedAt"].apply(lambda x: x.second)
df["publishedAt_dayofweek"] = df["publishedAt"].apply(lambda x: x.dayofweek)
# collection_date
#df["collection_date_year"] = df["collection_date"].apply(lambda x: int(x[0:2]))
df["collection_date_month"] = df["collection_date"].apply(lambda x: int(x[3:5]))
df["collection_date_day"] = df["collection_date"].apply(lambda x: int(x[6:8]))
df["collection_date"] = pd.to_datetime("20"+df["collection_date"], format="%Y.%d.%m", utc=True)
# delta
df["delta"] = (df["collection_date"] - df["publishedAt"]).apply(lambda x: x.days)
df["logdelta"] = np.log(df["delta"])
df["sqrtdelta"] = np.sqrt(df["delta"])
df["published_delta"] = (df["publishedAt"] - df["publishedAt"].min()).apply(lambda x: x.days)
df["collection_delta"] = (df["collection_date"] - df["collection_date"].min()).apply(lambda x: x.days)
df["description"].fillna(" ", inplace=True)
df["ishttp_in_dis"] = df["description"].apply(lambda x: x.lower().count("http"))
df["len_description"] = df["description"].apply(lambda x: len(x))
df["title"].fillna(" ", inplace=True)
df["len_title"] = df["title"].apply(lambda x: len(x))
# is japanese
df["isJa_title"] = df["title"].apply(lambda x: is_japanese(x))
df["isJa_tags"] = df["tags"].apply(lambda x: is_japanese(x))
df["isJa_description"] = df["description"].apply(lambda x: is_japanese(x))
# is English only
#df["onEn_title"] = df["title"].apply(lambda x: x.encode('utf-8').isalnum())
df["onEn_tags"] = df["tags"].apply(lambda x: x.encode('utf-8').isalnum())
df["onEn_description"] = df["description"].apply(lambda x: x.encode('utf-8').isalnum())
# contains English
df["conEn_title"] = df["title"].apply(lambda x: len(re.findall(r'[a-zA-Z0-9]', x.lower())))
df["conEn_tags"] = df["tags"].apply(lambda x: len(re.findall(r'[a-zA-Z0-9]', x.lower())))
df["conEn_description"] = df["description"].apply(lambda x: len(re.findall(r'[a-zA-Z0-9]', x.lower())))
# Music
df["music_title"] = df["title"].apply(lambda x: "music" in x.lower())
df["music_tags"] = df["tags"].apply(lambda x: "music" in x.lower())
df["music_description"] = df["description"].apply(lambda x: "music" in x.lower())
# Official
df["isOff"] = df["title"].apply(lambda x: "fficial" in x.lower())
df["isOffChannell"] = df["channelTitle"].apply(lambda x: "fficial" in x.lower())
df["isOffJa"] = df["title"].apply(lambda x: "公式" in x.lower())
df["isOffChannellJa"] = df["channelTitle"].apply(lambda x: "公式" in x.lower())
# Music
df["cm_title"] = df["title"].apply(lambda x: "cm" in x.lower())
df["cm_tags"] = df["tags"].apply(lambda x: "cm" in x.lower())
df["cm_description"] = df["description"].apply(lambda x: "cm" in x.lower())
df = df.merge(mean_, how='left', on=["categoryId"])
df = df.merge(max_, how='left', on=["categoryId"])
df = df.merge(min_, how='left', on=["categoryId"])
df = df.merge(std_, how='left', on=["categoryId"])
#df = df.merge(count_, how='left', on=["categoryId"])
df = df.merge(q1_, how='left', on=["categoryId"])
df = df.merge(q25_, how='left', on=["categoryId"])
df = df.merge(q5_, how='left', on=["categoryId"])
df = df.merge(q75_, how='left', on=["categoryId"])
df = df.merge(q9_, how='left', on=["categoryId"])
# Frequency encoding (occurrence counts)
for col in ["categoryId", "channelTitle"]:
freq = df[col].value_counts()
df["freq_"+col] = df[col].map(freq)
return df
#df['categoryId'] = df['categoryId'].astype('category')
df = bool_to_int(df)
df = create_features(df)
del df["channelId"]
del df["video_id"]
del df["title"]
del df["description"]
del df["thumbnail_link"]
del df["channelTitle"]
del df["tags"]
del df["publishedAt"]
del df["collection_date"]
del df["id"]
scaler = StandardScaler()
scaler.fit(df)
df = pd.DataFrame(scaler.transform(df), columns=df.columns)
X = df.iloc[:len(y), :]
test = df.iloc[len(y):, :]
def rmsle(preds, data):
    # Custom LightGBM eval: RMSLE on the original view-count scale
    # (y is log-transformed, so exp() first to get back to counts)
    y_true = data.get_label()
    y_pred = preds
    y_pred[y_pred < 0] = 0
    y_true[y_true < 0] = 0
    score = np.sqrt(mean_squared_log_error(np.exp(y_true), np.exp(y_pred)))
    # name, result, is_higher_better
    return 'rmsle', score, False
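Note that this custom metric is not actually used below: the feval argument is left commented out, so early stopping runs on the built-in rmse of the log-transformed target. If you wanted to stop on RMSLE of the raw view counts instead, it could be passed like this (sketch only, not what produced the results below):
# gbc = lgb.train(light_params, lgb_train, num_boost_round=num_round,
#                 valid_names=["train", "valid"], valid_sets=[lgb_train, lgb_valid],
#                 feval=rmsle, early_stopping_rounds=100, verbose_eval=500)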
# Plug in the parameters found by the Optuna search (objective functions are at the end of the notebook)
light_params = {'task': 'train',
'boosting_type': 'gbdt',
'objective': 'regression',
'metric': 'rmse',
'verbosity': -1,
"seed":42,
'learning_rate': 0.01,}
best_params = {'lambda_l1': 0.019918875912078603, 'lambda_l2': 0.002616688073257713, 'num_leaves': 219, 'feature_fraction': 0.6641013611124621, 'bagging_fraction': 0.7024199018549259, 'bagging_freq': 5, 'min_child_samples': 5}
#best_params = {}
light_params.update(best_params)
xgb_params = {'learning_rate': 0.1,
'objective': 'reg:squarederror',
'eval_metric': 'rmse',
'seed': 42,
'tree_method': 'hist'}
best_params = {'learning_rate': 0.01665914389764044, 'lambda_l1': 4.406831762257336, 'num_leaves': 39}
#best_params = {}
xgb_params.update(best_params)
FOLD_NUM = 11
kf = KFold(n_splits=FOLD_NUM,
shuffle=True,
random_state=42)
scores = []
feature_importance_df = pd.DataFrame()
pred_cv = np.zeros(len(test.index))
num_round = 10000
for i, (tdx, vdx) in enumerate(kf.split(X, y)):
print(f'Fold : {i}')
######LGB
X_train, X_valid, y_train, y_valid = X.iloc[tdx], X.iloc[vdx], y.values[tdx], y.values[vdx]
# LGB
lgb_train = lgb.Dataset(X_train, y_train)
lgb_valid = lgb.Dataset(X_valid, y_valid)
gbc = lgb.train(light_params, lgb_train, num_boost_round=num_round,
valid_names=["train", "valid"], valid_sets=[lgb_train, lgb_valid],
#feval=rmsle,
early_stopping_rounds=100, verbose_eval=500)
if i ==0:
importance_df = pd.DataFrame(gbc.feature_importance(), index=X.columns, columns=['importance'])
else:
importance_df += pd.DataFrame(gbc.feature_importance(), index=X.columns, columns=['importance'])
gbc_va_pred = np.exp(gbc.predict(X_valid, num_iteration=gbc.best_iteration))
gbc_va_pred[gbc_va_pred<0] = 0
# XGB
xgb_dataset = xgb.DMatrix(X_train, label=y_train)
xgb_test_dataset = xgb.DMatrix(X_valid, label=y_valid)
xgbm = xgb.train(xgb_params, xgb_dataset, 10000, evals=[(xgb_dataset, 'train'),(xgb_test_dataset, 'eval')],
early_stopping_rounds=100, verbose_eval=500)
xgbm_va_pred = np.exp(xgbm.predict(xgb.DMatrix(X_valid)))
xgbm_va_pred[xgbm_va_pred<0] = 0
# ENS
# lists for keep results
lgb_xgb_rmsle = []
lgb_xgb_alphas = []
for alpha in np.linspace(0,1,101):
y_pred = alpha*gbc_va_pred + (1 - alpha)*xgbm_va_pred
rmsle_score = np.sqrt(mean_squared_log_error(np.exp(y_valid), y_pred))
lgb_xgb_rmsle.append(rmsle_score)
lgb_xgb_alphas.append(alpha)
lgb_xgb_rmsle = np.array(lgb_xgb_rmsle)
lgb_xgb_alphas = np.array(lgb_xgb_alphas)
lgb_xgb_best_alpha = lgb_xgb_alphas[np.argmin(lgb_xgb_rmsle)]
print('best_rmsle=', lgb_xgb_rmsle.min())
print('best_alpha=', lgb_xgb_best_alpha)
plt.plot(lgb_xgb_alphas, lgb_xgb_rmsle)
plt.title('f1_score for ensemble')
plt.xlabel('alpha')
plt.ylabel('f1_score')
score_ = lgb_xgb_rmsle.min()
scores.append(score_)
lgb_submission = np.exp(gbc.predict((test), num_iteration=gbc.best_iteration))
lgb_submission[lgb_submission<0] = 0
xgbm_submission = np.exp(xgbm.predict(xgb.DMatrix(test)))
xgbm_submission[xgbm_submission<0] = 0
submission = lgb_xgb_best_alpha*lgb_submission + (1 - lgb_xgb_best_alpha)*xgbm_submission
pred_cv += submission/FOLD_NUM
print("##########")
print(np.mean(scores))
Per-fold summary (LGB best iteration / valid rmse, XGB best iteration / eval rmse, then the LGB-XGB blend):
Fold 0: LGB 1802 / 0.779337, XGB 1605 / 0.781063, best_rmsle= 0.7723708735179381, best_alpha= 0.53
Fold 1: LGB 1683 / 0.793351, XGB 1384 / 0.803632, best_rmsle= 0.7901604684138034, best_alpha= 0.69
Fold 2: LGB 1260 / 0.76395, XGB 799 / 0.764784, best_rmsle= 0.7566571288792217, best_alpha= 0.52
Fold 3: LGB 2005 / 0.803895, XGB 1131 / 0.810911, best_rmsle= 0.7990827363059435, best_alpha= 0.66
Fold 4: LGB 967 / 0.75067, XGB 1322 / 0.751779, best_rmsle= 0.7444942482023515, best_alpha= 0.53
Fold 5: LGB 1383 / 0.724353, XGB 1184 / 0.745688, best_rmsle= 0.7234032824334943, best_alpha= 0.87
Fold 6: LGB 2420 / 0.754955, XGB 1092 / 0.783011, best_rmsle= 0.7534783804246369, best_alpha= 0.91
Fold 7: LGB 2801 / 0.760061, XGB 1140 / 0.786472, best_rmsle= 0.7575249298157162, best_alpha= 0.9
Fold 8: LGB 1414 / 0.760651, XGB 1372 / 0.781689, best_rmsle= 0.7593112793829914, best_alpha= 0.83
Fold 9: LGB 1565 / 0.727748, XGB 980 / 0.763706, best_rmsle= 0.7275074896979024, best_alpha= 1.0
Fold 10: LGB 2473 / 0.719501, XGB 1086 / 0.760279, best_rmsle= 0.7188409066997462, best_alpha= 1.0
##########
Mean CV RMSLE: 0.7548028839794315
# 0.765394813186907
pd.set_option('display.max_rows', None)
importance_df.sort_values("importance")
feature | importance |
---|---|
onEn_description | 806 |
onEn_tags | 1250 |
comments_likes | 2150 |
cm_tags | 2190 |
isOffJa | 2496 |
music_title | 2685 |
isOffChannell | 2805 |
isOff | 3193 |
isJa_tags | 3296 |
cm_title | 3534 |
music_tags | 3539 |
cm_description | 3580 |
ratings_disabled | 3847 |
comments_disabled | 4355 |
isOffChannellJa | 4815 |
dislikes_com | 5051 |
music_description | 5073 |
collection_date_day | 5210 |
likes_com | 8298 |
isJa_description | 8341 |
q25 | 9796 |
q5 | 9862 |
isJa_title | 10291 |
q75 | 11264 |
q9 | 12511 |
collection_delta | 14985 |
freq_categoryId | 19160 |
sqrtlikes | 24758 |
q1 | 25393 |
publishedAt_year | 25643 |
sqrtdelta | 27559 |
logcomment_count | 29825 |
max | 32681 |
min | 34121 |
std | 36496 |
mean | 37052 |
ishttp_in_dis | 44911 |
logdislikes | 45719 |
published_delta | 47415 |
collection_date_month | 49442 |
categoryId | 51309 |
loglikes | 53635 |
count_ja_tag | 55223 |
count_en_tag | 59189 |
publishedAt_dayofweek | 59426 |
logdelta | 61850 |
comment_count | 72629 |
num_tags | 74003 |
publishedAt_month | 75206 |
dislikes | 85178 |
comments_dislike_ratio | 86921 |
comments_like_ratio | 104612 |
conEn_tags | 104945 |
likes | 105238 |
publishedAt_hour | 107375 |
conEn_title | 109185 |
freq_channelTitle | 111399 |
length_tags | 114020 |
publishedAt_day | 115737 |
like_dislike_ratio | 116834 |
publishedAt_minute | 123132 |
conEn_description | 129786 |
len_title | 133314 |
tags_point | 145636 |
delta | 145769 |
len_description | 150619 |
light_submission_df = pd.concat([iddf, pd.DataFrame(pred_cv)], axis=1)
light_submission_df.columns = ["id", "y"]
light_submission_df.to_csv("submission_lgb.csv", index=False)
print("end")
end
from keras import losses
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.layers.convolutional import Conv1D, UpSampling1D
from keras.layers.pooling import MaxPool1D
from keras.layers.core import Dense, Activation, Dropout, Flatten
from keras.layers import BatchNormalization
from keras.layers.pooling import MaxPooling1D
from keras.callbacks import LearningRateScheduler
from sklearn.preprocessing import OneHotEncoder
from keras.optimizers import Adam
# Learning rate schedule
def step_decay(epoch):
x = 0.01
if epoch >= 120: x = 0.001
return x
lr_decay = LearningRateScheduler(step_decay)
# Plug in the parameters found by the Optuna search
def create_mlp(shape):
'''
Returns a Keras 1D-CNN regression model
'''
print(f"shape: {shape}")
model = Sequential()
model.add(Conv1D(32, 3, activation='relu', kernel_initializer='he_uniform', padding='same', input_shape=shape))
model.add(BatchNormalization())
model.add(Conv1D(32, 3, activation='relu', kernel_initializer='he_uniform', padding='same'))
model.add(BatchNormalization())
model.add(MaxPooling1D(2))
model.add(Dropout(0.2))
model.add(Conv1D(64, 3, activation='relu', kernel_initializer='he_uniform', padding='same'))
model.add(BatchNormalization())
model.add(Conv1D(64, 3, activation='relu', kernel_initializer='he_uniform', padding='same'))
model.add(BatchNormalization())
model.add(MaxPooling1D(2))
model.add(Dropout(0.3))
model.add(Conv1D(128, 3, activation='relu', kernel_initializer='he_uniform', padding='same'))
model.add(BatchNormalization())
model.add(Conv1D(128, 3, activation='relu', kernel_initializer='he_uniform', padding='same'))
model.add(BatchNormalization())
model.add(MaxPooling1D(2))
model.add(Dropout(0.4))
model.add(Flatten())
model.add(Dense(128, activation='relu', kernel_initializer='he_uniform'))
model.add(BatchNormalization())
model.add(Dropout(0.5))
model.add(Dense(1))
return model
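The Conv1D stack expects 3-D input, so the standardized feature matrix has to be fed in as a one-channel sequence; that is what the np.reshape calls in the loop below do:
# Shape convention assumed by the model above:
#   X_train.values                                         -> (n_samples, n_features)
#   np.reshape(X_train.values, (-1, X_train.shape[1], 1))  -> (n_samples, n_features, 1)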
FOLD_NUM = 4
kf = KFold(n_splits=FOLD_NUM,
shuffle=True,
random_state=42)
scores = []
feature_importance_df = pd.DataFrame()
pred_cv = np.zeros(len(test.index))
for i, (tdx, vdx) in enumerate(kf.split(X, y)):
print(f'Fold : {i}')
X_train, X_valid, y_train, y_valid = X.iloc[tdx], X.iloc[vdx], y.values[tdx], y.values[vdx]
mlp = create_mlp((X_train.values.shape[1], 1))
optimizer = Adam(lr=0.001)
mlp.compile(optimizer=optimizer, loss=losses.mean_squared_error)
mlp.fit(x=np.reshape(X_train.values, (-1, X_train.shape[1], 1)), y=y_train.reshape(len(y_train),1),
epochs=150, batch_size=493,
validation_data=(np.reshape(X_valid.values, (-1, X_valid.shape[1], 1)), y_valid),
callbacks=[lr_decay])#, verbose=0)
mlp_pred = mlp.predict(np.reshape(X_valid.values, (-1, X_train.shape[1], 1)))
plt.plot(mlp.history.history['loss'][3:], 'r', label='loss', alpha=0.7)
plt.plot(mlp.history.history['val_loss'][3:], label='val_loss', alpha=0.7)
plt.show()
rmsle_score = np.sqrt(mean_squared_log_error(np.exp(y_valid), np.exp(mlp_pred)))
print(rmsle_score)
break
# memo
#Epoch 230/400
#14790/14790 [==============================] - 4s 301us/step - loss: 1.9379 - val_loss: 0.7369
np.sqrt(mean_squared_log_error(np.exp(y_valid), np.exp(mlp_pred)))
!pip install optuna
import optuna
def objective(trial):
params = {
'task': 'train',
'boosting_type': 'gbdt',
'objective': 'regression',
'metric': 'rmse',
'verbosity': -1,
"seed":42,
"learning_rate":trial.suggest_loguniform('learning_rate', 0.005, 0.03),
'lambda_l1': trial.suggest_loguniform('lambda_l1', 1e-8, 10.0),
'lambda_l2': trial.suggest_loguniform('lambda_l2', 1e-8, 10.0),
'num_leaves': trial.suggest_int('num_leaves', 2, 256),
'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),
'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.4, 1.0),
'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
}
FOLD_NUM = 4
kf = KFold(n_splits=FOLD_NUM,
#shuffle=True,
random_state=42)
scores = []
feature_importance_df = pd.DataFrame()
pred_cv = np.zeros(len(test.index))
num_round = 10000
for i, (tdx, vdx) in enumerate(kf.split(X, y)):
print(f'Fold : {i}')
X_train, X_valid, y_train, y_valid = X.iloc[tdx], X.iloc[vdx], y.values[tdx], y.values[vdx]
lgb_train = lgb.Dataset(X_train, y_train)
lgb_valid = lgb.Dataset(X_valid, y_valid)
model = lgb.train(params, lgb_train, num_boost_round=num_round,
valid_names=["train", "valid"], valid_sets=[lgb_train, lgb_valid],
early_stopping_rounds=10, verbose_eval=10000)
va_pred = model.predict(X_valid)
score_ = np.sqrt(mean_squared_error(y_valid, va_pred))
scores.append(score_)
return np.mean(scores)
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=100)
# Check the results
print('Best trial:')
light_trial = study.best_trial
print(' Value: {}'.format(light_trial.value))
print(' Params: ')
with open("lightgbmparams.txt", "w") as file:
for key, value in light_trial.params.items():
print(' "{}": {},'.format(key, value))
file.write('"{}": {},'.format(key, value))
#0.7894605792171627
def objective(trial):
params = {
'objective': 'reg:squarederror',
'eval_metric': 'rmse',
'seed': 42,
'tree_method': 'hist',
"learning_rate":trial.suggest_loguniform('learning_rate', 0.005, 0.03),
'lambda_': trial.suggest_loguniform('lambda_l1', 1e-8, 10.0),
#'lambda_l2': trial.suggest_loguniform('lambda_l2', 1e-8, 10.0),
'num_leaves': trial.suggest_int('num_leaves', 2, 256),
#'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),
#'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.4, 1.0),
#'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
#'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
}
FOLD_NUM = 4
kf = KFold(n_splits=FOLD_NUM,
#shuffle=True,
random_state=42)
scores = []
feature_importance_df = pd.DataFrame()
pred_cv = np.zeros(len(test.index))
num_round = 10000
for i, (tdx, vdx) in enumerate(kf.split(X, y)):
print(f'Fold : {i}')
X_train, X_valid, y_train, y_valid = X.iloc[tdx], X.iloc[vdx], y.values[tdx], y.values[vdx]
# XGB
xgb_dataset = xgb.DMatrix(X_train, label=y_train)
xgb_test_dataset = xgb.DMatrix(X_valid, label=y_valid)
xgbm = xgb.train(params, xgb_dataset, 10000, evals=[(xgb_dataset, 'train'),(xgb_test_dataset, 'eval')],
early_stopping_rounds=100, verbose_eval=5000)
xgbm_va_pred = xgbm.predict(xgb.DMatrix(X_valid))
xgbm_va_pred[xgbm_va_pred<0] = 0
score_ = np.sqrt(mean_squared_error(y_valid, xgbm_va_pred))
scores.append(score_)
return np.mean(scores)
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=100)
# Check the results
print('Best trial:')
light_trial = study.best_trial
print(' Value: {}'.format(light_trial.value))
print(' Params: ')
with open("lightgbmparams.txt", "w") as file:
for key, value in light_trial.params.items():
print(' "{}": {},'.format(key, value))
file.write('"{}": {},'.format(key, value))
# without tuned params: 0.7897379694106698
hirayuki
Sorry for posting this in such a messy state, with things like the f1_score plot labels and the # Music comment on the CM features... both of those are wrong.
That's because I reused code from other competitions.
Those sorts of features are what set my solution apart from the others: the hypothesis that videos uploaded in English-speaking regions get more views worked well.
I also learned about log-transforming the target variable from chizuchizu's post:
https://prob.space/competitions/youtube-view-count/discussions/chizuchizu-Post65af4bcca79bc71bb1b9
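For context on why this helps: RMSE on the log-transformed target is almost the same thing as RMSLE on the raw view counts (they differ only by the +1 inside log1p), so training on log(y) and exponentiating the predictions, as the notebook above does, lines the models up with the competition metric. A tiny numeric illustration with made-up values:
y_true = np.array([100., 1000., 10000.])               # made-up view counts
y_pred_log = np.log(np.array([120., 900., 15000.]))    # made-up predictions on the log scale
print(np.sqrt(mean_squared_error(np.log(y_true), y_pred_log)))       # RMSE on log(y)
print(np.sqrt(mean_squared_log_error(y_true, np.exp(y_pred_log))))   # RMSLE on y, almost identical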
yuki810
Thank you, this was very helpful!