Simple EDA

Introduction

I tried some simple visualizations based on the provided data. I hope they serve as a useful reference.

Importing libraries and loading the data

import collections
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import string
from pprint import pprint

import nltk
from nltk import stem
from nltk.corpus import stopwords

import warnings
warnings.filterwarnings("ignore")
nltk.download("stopwords")
nltk.download("punkt")
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/yshr10ic/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/yshr10ic/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
True
train_df = pd.read_csv("data/train_data.csv")
test_df = pd.read_csv("data/test_data.csv")
all_df = pd.concat([train_df, test_df], axis=0)  # stack train and test rows for the duplicate check below

Basic information

print(train_df.shape)
print(test_df.shape)
(4974, 6)
(6393, 5)
display(train_df.info())
display(test_df.info())
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4974 entries, 0 to 4973
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        4974 non-null   int64 
 1   title     4974 non-null   object
 2   year      4974 non-null   int64 
 3   abstract  4974 non-null   object
 4   keywords  4494 non-null   object
 5   y         4974 non-null   int64 
dtypes: int64(3), object(3)
memory usage: 233.3+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6393 entries, 0 to 6392
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        6393 non-null   int64 
 1   title     6393 non-null   object
 2   year      6393 non-null   int64 
 3   abstract  6393 non-null   object
 4   keywords  5618 non-null   object
dtypes: int64(2), object(3)
memory usage: 249.9+ KB
None

Only the keywords column has missing values.
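
As a quick check, the per-column missing counts can be printed directly (a minimal sketch):

# Missing values per column; only "keywords" should be non-zero
print(train_df.isnull().sum())
print(test_df.isnull().sum())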

Checking for duplicates

print(f"titleの重複数: {len(all_df[all_df.duplicated(subset=['title'])])}")
print(f"abstractの重複数: {len(all_df[all_df.duplicated(subset=['abstract'])])}")
titleの重複数: 0
abstractの重複数: 0

title

# Number of words
train_df["title_num_words"] = train_df["title"].apply(lambda x: len(x.split()))
test_df["title_num_words"] = test_df["title"].apply(lambda x: len(x.split()))
plt.hist(train_df["title_num_words"], density=True, alpha=0.6, label="train")
plt.hist(test_df["title_num_words"], density=True, alpha=0.4, label="test")

plt.legend()
plt.show();
plt.hist(train_df.loc[train_df["y"]==1, "title_num_words"], density=True, alpha=0.6, label="y=1")
plt.hist(train_df.loc[train_df["y"]==0, "title_num_words"], density=True, alpha=0.4, label="y=0")

plt.legend()
plt.show();
# Word frequency
def remove_stopword(x):
    stopwords_list = set(stopwords.words("english") + list(string.punctuation))
    return [y for y in x if y not in stopwords_list]


def show_most_common_words(train_df, test_df, col, top_k=20):
    # Tokenize and remove stopwords
    for df in [train_df, test_df]:
        df[f"{col}_word_list"] = df[col].apply(lambda x: nltk.word_tokenize(x.lower()))
        df[f"{col}_word_list"] = df[f"{col}_word_list"].apply(lambda x: remove_stopword(x))

    stemmer = stem.PorterStemmer()
    word_df = None

    # train: all
    counter_tr = collections.Counter([stemmer.stem(item) for sublist in train_df[f"{col}_word_list"] for item in sublist])
    word_df = pd.DataFrame(counter_tr.most_common(top_k))
    word_df.columns = ["[tr all]words", "[tr all]count"]

    # y = 1
    counter_tr = collections.Counter([
        stemmer.stem(item) for sublist in train_df.loc[train_df["y"]==1, f"{col}_word_list"] for item in sublist
    ])
    tmp_df = pd.DataFrame(counter_tr.most_common(top_k))
    tmp_df.columns = ["[tr y=1]words", "[tr y=1]count"]
    word_df = pd.concat([word_df, tmp_df], axis=1)

    # y = 0
    counter_tr = collections.Counter([
        stemmer.stem(item) for sublist in train_df.loc[train_df["y"]==0, f"{col}_word_list"] for item in sublist
    ])
    tmp_df = pd.DataFrame(counter_tr.most_common(top_k))
    tmp_df.columns = ["[tr y=0]words", "[tr y=0]count"]
    word_df = pd.concat([word_df, tmp_df], axis=1)

    # test: all
    counter_te = collections.Counter([stemmer.stem(item) for sublist in test_df[f"{col}_word_list"] for item in sublist])
    tmp_df = pd.DataFrame(counter_te.most_common(top_k))
    tmp_df.columns = ["[te all]words", "[te all]count"]
    word_df = pd.concat([word_df, tmp_df], axis=1)

    return word_df
show_most_common_words(train_df, test_df, "title", top_k=20)
[tr all]words [tr all]count [tr y=1]words [tr y=1]count [tr y=0]words [tr y=0]count [te all]words [te all]count
0 learn 1540 learn 480 learn 1060 learn 2136
1 network 1166 network 361 network 805 network 1016
2 neural 884 neural 298 neural 586 neural 879
3 deep 663 deep 186 deep 477 model 661
4 gener 572 gener 174 gener 398 gener 606
5 model 473 model 140 model 333 deep 509
6 adversari 402 adversari 118 adversari 284 via 464
7 via 327 via 97 via 230 represent 441
8 represent 308 represent 90 represent 218 graph 416
9 train 289 train 86 train 203 reinforc 382
10 reinforc 280 reinforc 83 reinforc 197 optim 357
11 graph 260 graph 72 graph 188 adversari 351
12 use 240 optim 69 use 182 robust 324
13 optim 207 use 58 optim 138 train 310
14 robust 186 robust 58 convolut 133 data 293
15 convolut 179 gradient 53 robust 128 use 238
16 adapt 166 effici 49 adapt 119 effici 237
17 gradient 158 adapt 47 embed 115 adapt 226
18 embed 148 polici 47 data 110 transform 222
19 data 148 convolut 46 gradient 105 improv 205

Looking at the top 20 most frequent words, there does not seem to be much difference between accepted (y=1) and rejected (y=0) papers.
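
To put a rough number on this, the overlap between the top-20 word lists of the two classes can be checked using the columns returned by show_most_common_words (a minimal sketch):

# Overlap between the top-20 stemmed title words of accepted and rejected papers
word_df = show_most_common_words(train_df, test_df, "title", top_k=20)
shared = set(word_df["[tr y=1]words"]) & set(word_df["[tr y=0]words"])
print(f"{len(shared)} of 20 words are shared: {sorted(shared)}")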

year

print("-"*10 + " train " + "-"*10)

for year in [2018, 2019, 2020]:
    tmp_df = train_df.loc[train_df["year"] == year]
    y0 = len(tmp_df.loc[tmp_df['y']==0])
    y1 = len(tmp_df.loc[tmp_df['y']==1])
    text = f"{year} [all] {len(tmp_df):4d}({len(tmp_df)*100/len(train_df):.1f}%)"
    text += f" [y=0] {y0:4d}({y0*100/(y0+y1):.1f}%)"
    text += f" [y=1] {y1:4d}({y0*100/(y0+y1):.1f}%)"
    print(text)

print()
print("-"*10 + " test  " + "-"*10)

for year in [2021, 2022]:
    tmp_df = test_df.loc[test_df["year"] == year]
    print(f"{year} [all] {len(tmp_df):4d}({len(tmp_df)*100/len(test_df):.1f}%)")
---------- train ----------
2018 [all]  822(16.5%) [y=0]  486(59.1%) [y=1]  336(40.9%)
2019 [all] 1566(31.5%) [y=0] 1064(67.9%) [y=1]  502(32.1%)
2020 [all] 2586(52.0%) [y=0] 1901(73.5%) [y=1]  685(26.5%)

---------- test  ----------
2021 [all] 3003(47.0%)
2022 [all] 3390(53.0%)

The number of papers grows every year, while the acceptance rate keeps declining.
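
The same trend can be read off directly with a groupby (a minimal sketch):

# Papers per year and acceptance rate (mean of y) in the training data
print(train_df.groupby("year")["y"].agg(["count", "mean"]))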

abstract

# Number of words
train_df["abstract_num_words"] = train_df["abstract"].apply(lambda x: len(x.split()))
test_df["abstract_num_words"] = test_df["abstract"].apply(lambda x: len(x.split()))
plt.hist(train_df["abstract_num_words"], density=True, alpha=0.6, label="train")
plt.hist(test_df["abstract_num_words"], density=True, alpha=0.4, label="test")

plt.legend()
plt.show();
plt.hist(train_df.loc[train_df["y"]==1, "abstract_num_words"], density=True, alpha=0.6, label="y=1")
plt.hist(train_df.loc[train_df["y"]==0, "abstract_num_words"], density=True, alpha=0.4, label="y=0")

plt.legend()
plt.show();
# Word frequency
show_most_common_words(train_df, test_df, "abstract", top_k=20)
[tr all]words [tr all]count [tr y=1]words [tr y=1]count [tr y=0]words [tr y=0]count [te all]words [te all]count
0 learn 8684 learn 2728 learn 5956 learn 11665
1 model 8224 model 2402 model 5822 model 11450
2 network 6743 network 2142 network 4601 method 7154
3 train 5497 train 1716 train 3781 train 7131
4 gener 5007 use 1466 gener 3558 network 6311
5 use 4915 gener 1449 use 3449 gener 6071
6 method 4797 method 1397 method 3400 propos 6058
7 propos 4479 neural 1221 propos 3280 data 6033
8 task 3812 propos 1199 data 2781 use 5848
9 neural 3798 task 1187 task 2625 task 5123
10 data 3721 show 1058 neural 2577 perform 5071
11 show 3369 perform 961 perform 2350 show 4412
12 perform 3311 data 940 show 2311 neural 4053
13 approach 2716 algorithm 862 approach 1898 approach 3487
14 deep 2564 approach 818 deep 1789 algorithm 3369
15 result 2483 deep 775 result 1750 problem 3259
16 algorithm 2474 problem 759 problem 1680 represent 3249
17 problem 2439 result 733 represent 1640 result 3241
18 represent 2306 optim 677 algorithm 1612 dataset 3183
19 imag 2230 represent 666 imag 1606 optim 3099

As with the titles, the top 20 most frequent words show little difference between accepted and rejected papers.

keywords

for df in [train_df, test_df]:
    df["keywords"] = df["keywords"].fillna("").astype(str)
    df["keywords_count"] = df["keywords"].str.split(", ").agg(len)
    df.loc[df["keywords"]=="", "keywords_count"] = 0
plt.hist(train_df["keywords_count"], density=True, alpha=0.6, label="train")
plt.hist(test_df["keywords_count"], density=True, alpha=0.4, label="test")

plt.legend()
plt.show();
plt.hist(train_df.loc[train_df["y"]==1, "keywords_count"], density=True, alpha=0.6, label="y=1")
plt.hist(train_df.loc[train_df["y"]==0, "keywords_count"], density=True, alpha=0.4, label="y=0")

plt.legend()
plt.show();
all_df = pd.concat([train_df, test_df], axis=0).reset_index(drop=True)
print(all_df.shape)
(11367, 11)
# Lowercase everything and replace hyphens with spaces
all_df["keywords"] = all_df["keywords"].str.lower().str.replace("-", " ")

# List of plural words to normalize to singular
plural_words = [
    "models", "networks", "embeddings", "graphs", "gans", "rnns", "parameters",
    "functions", "representations", "methods", "images", "tests", "algorithms", "names",
    "records", "attributes", "coders", "recommendations", "orders", "gradients", "tasks",
    "machines", "operations", "examples"
]

keyword_list = []
for idx in all_df.index:
    if len(all_df.loc[idx, "keywords"]) == 0:
        continue
    tmp_list = all_df.loc[idx, "keywords"].split(", ")
    for plural_word in plural_words:
        tmp_list = [word.replace(plural_word, plural_word[:-1]) for word in tmp_list]
    keyword_list += tmp_list

print(len(keyword_list))
39230
counter = collections.Counter(keyword_list)
# Number of distinct keywords
print(len(counter))
12945
pprint(counter.most_common(20))
[('deep learning', 1129),
 ('reinforcement learning', 922),
 ('representation learning', 420),
 ('graph neural network', 350),
 ('neural network', 332),
 ('generative model', 297),
 ('meta learning', 283),
 ('generalization', 240),
 ('unsupervised learning', 228),
 ('robustness', 218),
 ('generative adversarial network', 216),
 ('gan', 213),
 ('optimization', 207),
 ('natural language processing', 206),
 ('transfer learning', 202),
 ('self supervised learning', 194),
 ('deep reinforcement learning', 187),
 ('interpretability', 184),
 ('adversarial example', 182),
 ('computer vision', 176)]

Many of the keywords are generic tags such as deep learning and reinforcement learning.
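
To see how dominant these generic tags are, one can check what share of all keyword occurrences the top 20 account for (a minimal sketch using the counter built above):

# Share of all keyword occurrences covered by the 20 most common keywords
top20_total = sum(count for _, count in counter.most_common(20))
print(f"{top20_total / len(keyword_list):.3f}")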

# Check whether the top keywords change from year to year
keyword_list_year = {}
counter_year = {}

years = [2018, 2019, 2020, 2021, 2022]

for year in years:
    tmp_df = all_df.loc[all_df["year"] == year].reset_index(drop=True)

    tmp_keyword_list = []
    for idx in tmp_df.index:
        if len(tmp_df.loc[idx, "keywords"]) == 0:
            continue
        tmp_list = tmp_df.loc[idx, "keywords"].split(", ")
        for plural_word in plural_words:
            tmp_list = [word.replace(plural_word, plural_word[:-1]) for word in tmp_list]
        tmp_keyword_list += tmp_list

    keyword_list_year[year] = tmp_keyword_list
    counter_year[year] = collections.Counter(tmp_keyword_list)

    print(f"{year} len(keyword_list_year)={len(keyword_list_year[year]):5d} len(counter_year)={len(counter_year[year]):5d}")
2018 len(keyword_list_year)= 2984 len(counter_year)= 1555
2019 len(keyword_list_year)= 5790 len(counter_year)= 2827
2020 len(keyword_list_year)= 8832 len(counter_year)= 4044
2021 len(keyword_list_year)=10348 len(counter_year)= 4682
2022 len(keyword_list_year)=11276 len(counter_year)= 5343
keywords_df = None
for year in years:
    if keywords_df is None:
        keywords_df = pd.DataFrame(counter_year[year].most_common(20))
        keywords_df.columns = [f"[{year}]keyword", f"[{year}]count"]
    else:
        tmp_df = pd.DataFrame(counter_year[year].most_common(20))
        tmp_df.columns = [f"[{year}]keyword", f"[{year}]count"]
        keywords_df = pd.concat([keywords_df, tmp_df], axis=1)

display(keywords_df)
[2018]keyword [2018]count [2019]keyword [2019]count [2020]keyword [2020]count [2021]keyword [2021]count [2022]keyword [2022]count
0 deep learning 146 deep learning 226 deep learning 300 deep learning 256 reinforcement learning 235
1 reinforcement learning 81 reinforcement learning 151 reinforcement learning 237 reinforcement learning 218 deep learning 201
2 neural network 52 generative model 67 representation learning 101 representation learning 130 graph neural network 130
3 generative adversarial network 37 neural network 63 graph neural network 78 graph neural network 116 representation learning 120
4 gan 36 generative adversarial network 55 generative model 75 meta learning 90 self supervised learning 93
5 generative model 34 unsupervised learning 54 neural network 73 robustness 79 federated learning 81
6 unsupervised learning 32 optimization 53 meta learning 71 neural network 75 robustness 71
7 recurrent neural network 28 representation learning 51 natural language processing 60 self supervised learning 70 neural network 69
8 rnn 24 gan 50 adversarial example 55 generalization 64 generalization 66
9 optimization 23 meta learning 50 generalization 54 unsupervised learning 62 transformer 66
10 deep reinforcement learning 23 adversarial example 44 gan 54 generative model 61 contrastive learning 62
11 natural language processing 23 convolutional neural network 41 transfer learning 53 interpretability 52 generative model 60
12 adversarial example 23 generalization 38 deep reinforcement learning 50 natural language processing 51 meta learning 59
13 representation learning 18 natural language processing 34 generative adversarial network 49 transfer learning 51 computer vision 56
14 generalization 18 machine learning 31 optimization 48 few shot learning 50 continual learning 53
15 convolutional neural network 17 variational inference 31 robustness 47 contrastive learning 50 transfer learning 51
16 transfer learning 16 transfer learning 31 unsupervised learning 47 gan 49 interpretability 49
17 lstm 15 deep reinforcement learning 29 interpretability 45 deep reinforcement learning 48 machine learning 48
18 variational inference 15 recurrent neural network 28 computer vision 41 generative adversarial network 48 adversarial training 46
19 computer vision 15 interpretability 27 semi supervised learning 38 neural architecture search 43 adversarial robustness 43

In 2018, classic keywords such as GAN, RNN, and LSTM ranked near the top, whereas in 2022 keywords such as GNN, federated learning, and transformer have moved up.
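
Individual keywords can also be tracked across years using the counter_year dictionary built above (a minimal sketch; the keywords below are just examples):

# Yearly counts for a few example keywords (missing keys count as 0)
for kw in ["gan", "lstm", "transformer", "federated learning"]:
    print(kw, [counter_year[year][kw] for year in years])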

Distribution of the target

display(train_df["y"].value_counts() / len(train_df))
0    0.693808
1    0.306192
Name: y, dtype: float64

About 30% of the papers are accepted. Submitting all-zero predictions gives a Public LB score of 0.7016, so the acceptance rate in the test data appears to be roughly the same.
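
Assuming the Public LB metric is accuracy, the all-zero score equals the share of y=0 in the public test split, so the implied test acceptance rate can be estimated as follows (a minimal sketch; 0.7016 is the score quoted above):

# Implied acceptance rate in the test data from the all-zero baseline score
public_lb_all_zero = 0.7016
print(f"implied test acceptance rate: {1 - public_lb_all_zero:.4f}")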

Attached data

  • EDA.ipynb