Simple Pytorch

import os
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold


# Input CSV locations.
TRAIN_DATA_PATH = "./data/train_data.csv"
TEST_DATA_PATH = "./data/test_data.csv"
# Weapon detail data kindly shared by e-toppo:
# https://prob.space/competitions/game_winner/discussions/e-toppo-Post0082a60376ef134af3a4
WEAPON_DATA_PATH = "./data/weapon.csv"


# Player slots: four on team A followed by four on team B.
players = ["A1", "A2", "A3", "A4", "B1", "B2", "B3", "B4"]

# Categorical columns: match-level columns first, then one weapon and one
# rank column per player slot (weapons for all slots precede ranks).
cat_features = (
    ["period", "game-ver", "lobby-mode", "lobby", "mode", "stage"]
    + [f"{slot}-weapon" for slot in players]
    + [f"{slot}-rank" for slot in players]
)
# Numeric columns: one level column per player slot.
num_features = [f"{slot}-level" for slot in players]

# Names of the label column and the row identifier column.
target_col = "y"
target_id = "id"


# Read the train, test and weapon-detail tables in one pass.
train, test, weapon_detail_df = (
    pd.read_csv(csv_path)
    for csv_path in (TRAIN_DATA_PATH, TEST_DATA_PATH, WEAPON_DATA_PATH)
)



# Columns excluded from the model; they are removed both from the
# categorical feature list and from the feature frame.
unused_cols = ["period", "lobby", "game-ver"]
for unused in unused_cols:
    cat_features.remove(unused)

# Stack train and test features (ids and label stripped) so that every
# later transformation is applied to both sets at once.
train_features = train.drop(columns=[target_id, target_col])
test_features = test.drop(columns=[target_id])
data = pd.concat([train_features, test_features]).drop(columns=unused_cols)

# Training labels as a 1-D tensor, and test ids kept for the submission file.
y = torch.tensor(train[target_col].values).flatten()
test_id = test[target_id]

# Drop the per-locale name columns and the "reskin"/"splatnet" columns.
dropped_weapon_cols = [
    '[de-DE]', '[en-GB]', '[en-US]', '[es-ES]',
    '[es-MX]', '[fr-CA]', '[fr-FR]', '[it-IT]', '[ja-JP]', '[nl-NL]',
    '[ru-RU]', "reskin", "splatnet",
]
weapon_detail_df = weapon_detail_df.drop(columns=dropped_weapon_cols)
# Merge the two category columns into a single "category" column; pop()
# removes the source columns while handing back their values.
weapon_detail_df["category"] = (
    weapon_detail_df.pop("category1") + weapon_detail_df.pop("category2")
)

# Names of the per-player weapon columns, in player order.
weapons = [f"{player}-weapon" for player in players]

# Fill missing per-player values so that every column can be encoded later.
for player in players:
    rank_col = f"{player}-rank"
    weapon_col = f"{player}-weapon"
    level_col = f"{player}-level"

    # A missing rank becomes its own "-1" category.
    data[rank_col] = data[rank_col].fillna("-1")
    # Replace missing weapons with the most frequent weapon in that slot.
    most_common_weapon = data[weapon_col].value_counts().index[0]
    data[weapon_col] = data[weapon_col].fillna(most_common_weapon)
    # Level is a numeric feature (it is cast to float later), so the
    # sentinel must be the number -1; filling with the string "-1" would
    # turn the whole column into object dtype.
    data[level_col] = data[level_col].fillna(-1)


# Replace every "<player>-weapon" column with that weapon's detail columns,
# each prefixed with the player slot (the first three characters, e.g. "A1-").
weapon_lookup = weapon_detail_df.set_index("key")
for weapon_col in weapons:
    slot_prefix = weapon_col[:3]
    detail = data[[weapon_col]].join(weapon_lookup, on=weapon_col).drop(columns=weapon_col)
    prefixed_names = [slot_prefix + detail_name for detail_name in detail.columns]
    detail.columns = prefixed_names

    # The detail columns become categorical features in place of the raw weapon.
    cat_features.extend(prefixed_names)
    data = pd.concat([data, detail], axis=1).drop(columns=weapon_col)
    cat_features.remove(weapon_col)
    

# Integer-encode each categorical column and record its cardinality.
# NOTE(review): pandas gives missing values the code -1; if any of these
# columns can still contain NaN at this point, that code would be an
# invalid embedding index - confirm against the data.
cat_szs = []
for cat in cat_features:
    as_category = data[cat].astype('category')
    cat_szs.append(len(as_category.cat.categories))
    data[cat] = as_category.cat.codes.values


# One (cardinality, embedding width) pair per categorical column; the width
# is half the cardinality rounded up, capped at 50.
emb_szs = []
for size in cat_szs:
    emb_szs.append((size, min(50, (size + 1) // 2)))
    
# The first len(train) rows of `data` are the training rows; the rest are test rows.
n_train_rows = len(train.index)
X, X_test = data[:n_train_rows], data[n_train_rows:]


def _stack_features(frame):
    """Return (numeric float tensor, categorical int64 tensor) built from `frame`."""
    numeric = np.stack([frame[col].astype(np.float16).values for col in num_features], 1)
    categorical = np.stack([frame[col].values for col in cat_features], 1)
    return (
        torch.tensor(numeric, dtype=torch.float),
        torch.tensor(categorical, dtype=torch.int64),
    )


nums, cats = _stack_features(X)
print(f"nums shape: {nums.shape}")
print(f"cats shape: {cats.shape}")
nums_test, cats_test = _stack_features(X_test)



class TabularModel(nn.Module):
    """Feed-forward network over embedded categorical and continuous features.

    Each categorical column gets its own embedding table. The embeddings are
    concatenated, passed through dropout, joined with the batch-normalised
    continuous features and fed to a stack of
    Linear -> ReLU -> BatchNorm1d -> Dropout blocks followed by a final
    Linear output layer.
    """

    def __init__(self, emb_szs, n_cont, out_sz, layers, p=0.5):
        """Build the network.

        Args:
            emb_szs: iterable of ``(num_categories, embedding_dim)`` pairs,
                one per categorical column, in column order.
            n_cont: number of continuous input features.
            out_sz: number of output units.
            layers: sizes of the hidden layers, in order. May be empty, in
                which case the concatenated inputs feed the output layer
                directly.
            p: dropout probability used after the embeddings and after every
                hidden block.
        """
        super().__init__()
        self.embeds = nn.ModuleList([nn.Embedding(ni, nf) for ni, nf in emb_szs])
        self.emb_drop = nn.Dropout(p)
        self.bn_cont = nn.BatchNorm1d(n_cont)

        # Width of the concatenated (embeddings + continuous) input vector.
        n_in = sum(nf for _, nf in emb_szs) + n_cont

        layerlist = []
        for width in layers:
            layerlist.extend([
                nn.Linear(n_in, width),
                nn.ReLU(inplace=True),
                nn.BatchNorm1d(width),
                nn.Dropout(p),
            ])
            n_in = width
        # n_in now holds the width of whatever feeds the output layer, which
        # also covers the case of no hidden layers (layers[-1] would raise).
        layerlist.append(nn.Linear(n_in, out_sz))

        self.layers = nn.Sequential(*layerlist)

    def forward(self, x_cat, x_cont):
        """Compute the network output for one batch.

        Args:
            x_cat: integer tensor whose column ``i`` holds the category codes
                for the ``i``-th embedding table.
            x_cont: tensor of continuous features, passed through
                ``BatchNorm1d(n_cont)``.

        Returns:
            The output of the final linear layer.
        """
        embeddings = [embed(x_cat[:, col]) for col, embed in enumerate(self.embeds)]
        x = self.emb_drop(torch.cat(embeddings, 1))

        x_cont = self.bn_cont(x_cont)
        x = torch.cat([x, x_cont], 1)
        return self.layers(x)

# Fix the seed so that the weight initialisation is reproducible.
torch.manual_seed(33)
# Two-class classifier with hidden layers of 200 and 100 units.
model = TabularModel(
    emb_szs=emb_szs,
    n_cont=nums.shape[1],
    out_sz=2,
    layers=[200, 100],
    p=0.3,
)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)


# random_state only takes effect when shuffling is enabled; recent
# scikit-learn versions raise ValueError if it is set while shuffle=False.
kf = KFold(n_splits=4, shuffle=True, random_state=42)


# Cross-validated training, evaluation and test-set prediction.
#
# NOTE(review): `model` and `optimizer` are created once before this loop, so
# when is_short_run is False every later fold keeps training weights that
# have already seen that fold's validation rows, and `y_test` ends up holding
# only the last fold's predictions. Confirm whether that is intended.
is_short_run = True  # stop after the first fold
for train_index, valid_index in kf.split(X, y):
    cat_train, cat_valid = cats[train_index], cats[valid_index]
    num_train, num_valid = nums[train_index], nums[valid_index]
    y_train, y_valid = y[train_index], y[valid_index]
    print(f"cat_train: {len(cat_train)}, cat_valid {len(cat_valid)}, num_train {len(num_train)}, num_valid {len(num_valid)}")
    print(f"y_train: {len(y_train)}, y_valid: {len(y_valid)}")

    start_time = time.time()

    epochs = 10
    # Plain Python floats: storing the loss tensors themselves would keep
    # every epoch's autograd graph alive and cannot be plotted directly.
    losses = []
    valid_losses = []

    # Full-batch training: one optimizer step per epoch (epochs are 1-based).
    for epoch in range(1, epochs + 1):
        model.train()  # enable dropout and batch-statistics BatchNorm
        y_pred = model(cat_train, num_train)
        loss = criterion(y_pred, y_train)
        losses.append(loss.item())

        if epoch % 10 == 1:
            print(f'epoch: {epoch:3}  loss: {loss.item():10.8f}')

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Evaluate in eval mode so dropout is disabled and BatchNorm uses
        # (and does not update) its running statistics.
        model.eval()
        with torch.no_grad():
            y_val = model(cat_valid, num_valid)
            valid_loss = criterion(y_val, y_valid)
            valid_losses.append(valid_loss.item())
            if epoch % 10 == 1:
                print(f'epoch: {epoch:3}  valid loss: {valid_loss.item():10.8f}')

    print(f'epoch: {epoch:3}  loss: {loss.item():10.8f}') # print the last line
    print(f'\nDuration: {time.time() - start_time:.0f} seconds') # print the time elapsed

    plt.plot(range(epochs), losses)
    plt.plot(range(epochs), valid_losses)
    plt.ylabel('Cross Entropy Loss')
    plt.xlabel('epoch')

    # Final validation metrics and test predictions, all in eval mode.
    model.eval()
    with torch.no_grad():
        y_val = model(cat_valid, num_valid)
        loss = criterion(y_val, y_valid)
        # accuracy_score expects (y_true, y_pred).
        accuracy = accuracy_score(y_valid.numpy(), y_val.argmax(dim=1).numpy())
        # Predicted class per test row, as a NumPy array for the submission.
        y_test = model(cats_test, nums_test).argmax(dim=1).numpy()
    print(f'CE Loss: {loss.item():.8f}')
    print(f'accuracy: {accuracy:.8f}')

    if is_short_run:
        break
    
# Write the test predictions as an (id, y) CSV, creating the output
# directory first so that to_csv does not fail when it is missing.
submission_path = "submission/submission_single_pytorch.csv"
os.makedirs(os.path.dirname(submission_path), exist_ok=True)
submission_df = pd.DataFrame({target_id: test_id.values, target_col: y_test})
submission_df.to_csv(submission_path, index=False)
nums shape: torch.Size([66125, 8])
cats shape: torch.Size([66125, 43])
cat_train: 49593, cat_valid 16532, num_train 49593, num_valid 16532
y_train: 49593, y_valid: 16532
epoch:   1  loss: 0.79562783
epoch:  10  loss: 0.71883851

Duration: 131 seconds
CE Loss: 0.72617549
accuracy: 0.51227922

添付データ

  • simple_pytorch.ipynb?X-Amz-Expires=10800&X-Amz-Date=20241123T081255Z&X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAIP7GCBGMWPMZ42PQ
  • Favicon
    new user
    コメントするには 新規登録 もしくは ログイン が必要です。