yshr10ic
I put together some simple visualizations of the provided data. I hope they are useful as a reference.
import collections
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import string
from pprint import pprint
import nltk
from nltk import stem
from nltk.corpus import stopwords
import warnings
warnings.filterwarnings("ignore")
nltk.download("stopwords")
nltk.download("punkt")
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/yshr10ic/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/yshr10ic/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
True
train_df = pd.read_csv("data/train_data.csv")
test_df = pd.read_csv("data/test_data.csv")
all_df = pd.concat([train_df, test_df], axis=0).reset_index(drop=True)
print(train_df.shape)
print(test_df.shape)
(4974, 6)
(6393, 5)
train_df.info()
test_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4974 entries, 0 to 4973
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype
---  ------    --------------  -----
 0   id        4974 non-null   int64
 1   title     4974 non-null   object
 2   year      4974 non-null   int64
 3   abstract  4974 non-null   object
 4   keywords  4494 non-null   object
 5   y         4974 non-null   int64
dtypes: int64(3), object(3)
memory usage: 233.3+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6393 entries, 0 to 6392
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype
---  ------    --------------  -----
 0   id        6393 non-null   int64
 1   title     6393 non-null   object
 2   year      6393 non-null   int64
 3   abstract  6393 non-null   object
 4   keywords  5618 non-null   object
dtypes: int64(2), object(3)
memory usage: 249.9+ KB
Only the keywords column has missing values.
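The same fact can be confirmed directly; a minimal sketch using the dataframes loaded above:

# Missing values per column; only keywords should be non-zero
print(train_df.isnull().sum())
print(test_df.isnull().sum())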
print(f"titleの重複数: {len(all_df[all_df.duplicated(subset=['title'])])}")
print(f"abstractの重複数: {len(all_df[all_df.duplicated(subset=['abstract'])])}")
Duplicate titles: 0
Duplicate abstracts: 0
# Number of words in each title
train_df["title_num_words"] = train_df["title"].apply(lambda x: len(x.split()))
test_df["title_num_words"] = test_df["title"].apply(lambda x: len(x.split()))
plt.hist(train_df["title_num_words"], density=True, alpha=0.6, label="train")
plt.hist(test_df["title_num_words"], density=True, alpha=0.4, label="test")
plt.legend()
plt.show();
plt.hist(train_df.loc[train_df["y"]==1, "title_num_words"], density=True, alpha=0.6, label="y=1")
plt.hist(train_df.loc[train_df["y"]==0, "title_num_words"], density=True, alpha=0.4, label="y=0")
plt.legend()
plt.show();
# Word frequency
def remove_stopword(x):
    stopwords_list = set(stopwords.words("english") + list(string.punctuation))
    return [y for y in x if y not in stopwords_list]
def show_most_common_words(train_df, test_df, col, top_k=20):
    # Tokenize and remove stopwords
    for df in [train_df, test_df]:
        df[f"{col}_word_list"] = df[col].apply(lambda x: nltk.word_tokenize(x.lower()))
        df[f"{col}_word_list"] = df[f"{col}_word_list"].apply(lambda x: remove_stopword(x))
    stemmer = stem.PorterStemmer()
    word_df = None
    # train, all
    counter_tr = collections.Counter([stemmer.stem(item) for sublist in train_df[f"{col}_word_list"] for item in sublist])
    word_df = pd.DataFrame(counter_tr.most_common(top_k))
    word_df.columns = ["[tr all]words", "[tr all]count"]
    # train, y = 1
    counter_tr = collections.Counter([
        stemmer.stem(item) for sublist in train_df.loc[train_df["y"]==1, f"{col}_word_list"] for item in sublist
    ])
    tmp_df = pd.DataFrame(counter_tr.most_common(top_k))
    tmp_df.columns = ["[tr y=1]words", "[tr y=1]count"]
    word_df = pd.concat([word_df, tmp_df], axis=1)
    # train, y = 0
    counter_tr = collections.Counter([
        stemmer.stem(item) for sublist in train_df.loc[train_df["y"]==0, f"{col}_word_list"] for item in sublist
    ])
    tmp_df = pd.DataFrame(counter_tr.most_common(top_k))
    tmp_df.columns = ["[tr y=0]words", "[tr y=0]count"]
    word_df = pd.concat([word_df, tmp_df], axis=1)
    # test, all
    counter_te = collections.Counter([stemmer.stem(item) for sublist in test_df[f"{col}_word_list"] for item in sublist])
    tmp_df = pd.DataFrame(counter_te.most_common(top_k))
    tmp_df.columns = ["[te all]words", "[te all]count"]
    word_df = pd.concat([word_df, tmp_df], axis=1)
    return word_df
show_most_common_words(train_df, test_df, "title", top_k=20)
| | [tr all]words | [tr all]count | [tr y=1]words | [tr y=1]count | [tr y=0]words | [tr y=0]count | [te all]words | [te all]count |
|---|---|---|---|---|---|---|---|---|
| 0 | learn | 1540 | learn | 480 | learn | 1060 | learn | 2136 |
| 1 | network | 1166 | network | 361 | network | 805 | network | 1016 |
| 2 | neural | 884 | neural | 298 | neural | 586 | neural | 879 |
| 3 | deep | 663 | deep | 186 | deep | 477 | model | 661 |
| 4 | gener | 572 | gener | 174 | gener | 398 | gener | 606 |
| 5 | model | 473 | model | 140 | model | 333 | deep | 509 |
| 6 | adversari | 402 | adversari | 118 | adversari | 284 | via | 464 |
| 7 | via | 327 | via | 97 | via | 230 | represent | 441 |
| 8 | represent | 308 | represent | 90 | represent | 218 | graph | 416 |
| 9 | train | 289 | train | 86 | train | 203 | reinforc | 382 |
| 10 | reinforc | 280 | reinforc | 83 | reinforc | 197 | optim | 357 |
| 11 | graph | 260 | graph | 72 | graph | 188 | adversari | 351 |
| 12 | use | 240 | optim | 69 | use | 182 | robust | 324 |
| 13 | optim | 207 | use | 58 | optim | 138 | train | 310 |
| 14 | robust | 186 | robust | 58 | convolut | 133 | data | 293 |
| 15 | convolut | 179 | gradient | 53 | robust | 128 | use | 238 |
| 16 | adapt | 166 | effici | 49 | adapt | 119 | effici | 237 |
| 17 | gradient | 158 | adapt | 47 | embed | 115 | adapt | 226 |
| 18 | embed | 148 | polici | 47 | data | 110 | transform | 222 |
| 19 | data | 148 | convolut | 46 | gradient | 105 | improv | 205 |
Looking at the top 20 most frequent words, there seems to be little difference in frequent words between accepted and rejected papers.
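To put a number on that impression, the following sketch counts how many of the top-20 stemmed words the two classes share (the helper name top_word_overlap is my own; it reuses the title_word_list column that show_most_common_words adds to train_df):

stemmer = stem.PorterStemmer()

def top_word_overlap(col, top_k=20):
    # Build the top-k stemmed-word set for each class, then count the overlap
    tops = {}
    for label in [0, 1]:
        counter = collections.Counter(
            stemmer.stem(word)
            for word_list in train_df.loc[train_df["y"] == label, f"{col}_word_list"]
            for word in word_list
        )
        tops[label] = {word for word, _ in counter.most_common(top_k)}
    return len(tops[0] & tops[1])

print(top_word_overlap("title"))  # shared words out of 20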
print("-"*10 + " train " + "-"*10)
for year in [2018, 2019, 2020]:
tmp_df = train_df.loc[train_df["year"] == year]
y0 = len(tmp_df.loc[tmp_df['y']==0])
y1 = len(tmp_df.loc[tmp_df['y']==1])
text = f"{year} [all] {len(tmp_df):4d}({len(tmp_df)*100/len(train_df):.1f}%)"
text += f" [y=0] {y0:4d}({y0*100/(y0+y1):.1f}%)"
text += f" [y=1] {y1:4d}({y0*100/(y0+y1):.1f}%)"
print(text)
print()
print("-"*10 + " test " + "-"*10)
for year in [2021, 2022]:
tmp_df = test_df.loc[test_df["year"] == year]
print(f"{year} [all] {len(tmp_df):4d}({len(tmp_df)*100/len(test_df):.1f}%)")
---------- train ----------
2018 [all]  822(16.5%) [y=0]  486(59.1%) [y=1]  336(40.9%)
2019 [all] 1566(31.5%) [y=0] 1064(67.9%) [y=1]  502(32.1%)
2020 [all] 2586(52.0%) [y=0] 1901(73.5%) [y=1]  685(26.5%)

---------- test ----------
2021 [all] 3003(47.0%)
2022 [all] 3390(53.0%)
The number of papers grows year over year, while the acceptance rate declines.
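The declining acceptance rate can also be plotted directly; a minimal sketch over the train set:

# Mean of y per year = acceptance rate per year
train_df.groupby("year")["y"].mean().plot.bar()
plt.ylabel("acceptance rate")
plt.show()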
# Number of words in each abstract
train_df["abstract_num_words"] = train_df["abstract"].apply(lambda x: len(x.split()))
test_df["abstract_num_words"] = test_df["abstract"].apply(lambda x: len(x.split()))
plt.hist(train_df["abstract_num_words"], density=True, alpha=0.6, label="train")
plt.hist(test_df["abstract_num_words"], density=True, alpha=0.4, label="test")
plt.legend()
plt.show();
plt.hist(train_df.loc[train_df["y"]==1, "abstract_num_words"], density=True, alpha=0.6, label="y=1")
plt.hist(train_df.loc[train_df["y"]==0, "abstract_num_words"], density=True, alpha=0.4, label="y=0")
plt.legend()
plt.show();
# Word frequency
show_most_common_words(train_df, test_df, "abstract", top_k=20)
| | [tr all]words | [tr all]count | [tr y=1]words | [tr y=1]count | [tr y=0]words | [tr y=0]count | [te all]words | [te all]count |
|---|---|---|---|---|---|---|---|---|
| 0 | learn | 8684 | learn | 2728 | learn | 5956 | learn | 11665 |
| 1 | model | 8224 | model | 2402 | model | 5822 | model | 11450 |
| 2 | network | 6743 | network | 2142 | network | 4601 | method | 7154 |
| 3 | train | 5497 | train | 1716 | train | 3781 | train | 7131 |
| 4 | gener | 5007 | use | 1466 | gener | 3558 | network | 6311 |
| 5 | use | 4915 | gener | 1449 | use | 3449 | gener | 6071 |
| 6 | method | 4797 | method | 1397 | method | 3400 | propos | 6058 |
| 7 | propos | 4479 | neural | 1221 | propos | 3280 | data | 6033 |
| 8 | task | 3812 | propos | 1199 | data | 2781 | use | 5848 |
| 9 | neural | 3798 | task | 1187 | task | 2625 | task | 5123 |
| 10 | data | 3721 | show | 1058 | neural | 2577 | perform | 5071 |
| 11 | show | 3369 | perform | 961 | perform | 2350 | show | 4412 |
| 12 | perform | 3311 | data | 940 | show | 2311 | neural | 4053 |
| 13 | approach | 2716 | algorithm | 862 | approach | 1898 | approach | 3487 |
| 14 | deep | 2564 | approach | 818 | deep | 1789 | algorithm | 3369 |
| 15 | result | 2483 | deep | 775 | result | 1750 | problem | 3259 |
| 16 | algorithm | 2474 | problem | 759 | problem | 1680 | represent | 3249 |
| 17 | problem | 2439 | result | 733 | represent | 1640 | result | 3241 |
| 18 | represent | 2306 | optim | 677 | algorithm | 1612 | dataset | 3183 |
| 19 | imag | 2230 | represent | 666 | imag | 1606 | optim | 3099 |
Looking at the top 20 most frequent words, here too there seems to be little difference in frequent words between accepted and rejected papers.
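The overlap sketch from the title section applies unchanged, since show_most_common_words has also created abstract_word_list:

print(top_word_overlap("abstract"))  # shared words out of 20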
for df in [train_df, test_df]:
    df["keywords"] = df["keywords"].fillna("").astype(str)
    df["keywords_count"] = df["keywords"].str.split(", ").agg(len)
    df.loc[df["keywords"]=="", "keywords_count"] = 0
plt.hist(train_df["keywords_count"], density=True, alpha=0.6, label="train")
plt.hist(test_df["keywords_count"], density=True, alpha=0.4, label="test")
plt.legend()
plt.show();
plt.hist(train_df.loc[train_df["y"]==1, "keywords_count"], density=True, alpha=0.6, label="y=1")
plt.hist(train_df.loc[train_df["y"]==0, "keywords_count"], density=True, alpha=0.4, label="y=0")
plt.legend()
plt.show();
all_df = pd.concat([train_df, test_df], axis=0).reset_index(drop=True)
print(all_df.shape)
(11367, 11)
# Lowercase everything and replace hyphens with spaces
all_df["keywords"] = all_df["keywords"].str.lower().str.replace("-", " ")
# Plural words to normalize to singular
plural_words = [
    "models", "networks", "embeddings", "graphs", "gans", "rnns", "parameters",
    "functions", "representations", "methods", "images", "tests", "algorithms", "names",
    "records", "attributes", "coders", "recommendations", "orders", "gradients", "tasks",
    "machines", "operations", "examples"
]
keyword_list = []
for idx in all_df.index:
    if len(all_df.loc[idx, "keywords"]) == 0:
        continue
    tmp_list = all_df.loc[idx, "keywords"].split(", ")
    # Note: str.replace matches substrings, so e.g. "graph neural networks" -> "graph neural network"
    for plural_word in plural_words:
        tmp_list = [word.replace(plural_word, plural_word[:-1]) for word in tmp_list]
    keyword_list += tmp_list
print(len(keyword_list))
39230
counter = collections.Counter(keyword_list)
# Number of distinct keywords
print(len(counter))
12945
pprint(counter.most_common(20))
[('deep learning', 1129),
 ('reinforcement learning', 922),
 ('representation learning', 420),
 ('graph neural network', 350),
 ('neural network', 332),
 ('generative model', 297),
 ('meta learning', 283),
 ('generalization', 240),
 ('unsupervised learning', 228),
 ('robustness', 218),
 ('generative adversarial network', 216),
 ('gan', 213),
 ('optimization', 207),
 ('natural language processing', 206),
 ('transfer learning', 202),
 ('self supervised learning', 194),
 ('deep reinforcement learning', 187),
 ('interpretability', 184),
 ('adversarial example', 182),
 ('computer vision', 176)]
Generic tags such as deep learning and reinforcement learning dominate the list.
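To quantify how dominant these generic tags are, a small sketch computes the share of all keyword occurrences covered by the top 20:

# Share of all keyword occurrences covered by the 20 most common keywords
top20_total = sum(count for _, count in counter.most_common(20))
print(f"{top20_total / sum(counter.values()):.1%}")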
# Check whether the keywords change from year to year
keyword_list_year = {}
counter_year = {}
years = [2018, 2019, 2020, 2021, 2022]
for year in years:
    tmp_df = all_df.loc[all_df["year"] == year].reset_index(drop=True)
    tmp_keyword_list = []
    for idx in tmp_df.index:
        if len(tmp_df.loc[idx, "keywords"]) == 0:
            continue
        tmp_list = tmp_df.loc[idx, "keywords"].split(", ")
        for plural_word in plural_words:
            tmp_list = [word.replace(plural_word, plural_word[:-1]) for word in tmp_list]
        tmp_keyword_list += tmp_list
    keyword_list_year[year] = tmp_keyword_list
    counter_year[year] = collections.Counter(tmp_keyword_list)
    print(f"{year} len(keyword_list_year)={len(keyword_list_year[year]):5d} len(counter_year)={len(counter_year[year]):5d}")
2018 len(keyword_list_year)= 2984 len(counter_year)= 1555
2019 len(keyword_list_year)= 5790 len(counter_year)= 2827
2020 len(keyword_list_year)= 8832 len(counter_year)= 4044
2021 len(keyword_list_year)=10348 len(counter_year)= 4682
2022 len(keyword_list_year)=11276 len(counter_year)= 5343
keywords_df = None
for year in years:
    if keywords_df is None:
        keywords_df = pd.DataFrame(counter_year[year].most_common(20))
        keywords_df.columns = [f"[{year}]keyword", f"[{year}]count"]
    else:
        tmp_df = pd.DataFrame(counter_year[year].most_common(20))
        tmp_df.columns = [f"[{year}]keyword", f"[{year}]count"]
        keywords_df = pd.concat([keywords_df, tmp_df], axis=1)
display(keywords_df)
| | [2018]keyword | [2018]count | [2019]keyword | [2019]count | [2020]keyword | [2020]count | [2021]keyword | [2021]count | [2022]keyword | [2022]count |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | deep learning | 146 | deep learning | 226 | deep learning | 300 | deep learning | 256 | reinforcement learning | 235 |
| 1 | reinforcement learning | 81 | reinforcement learning | 151 | reinforcement learning | 237 | reinforcement learning | 218 | deep learning | 201 |
| 2 | neural network | 52 | generative model | 67 | representation learning | 101 | representation learning | 130 | graph neural network | 130 |
| 3 | generative adversarial network | 37 | neural network | 63 | graph neural network | 78 | graph neural network | 116 | representation learning | 120 |
| 4 | gan | 36 | generative adversarial network | 55 | generative model | 75 | meta learning | 90 | self supervised learning | 93 |
| 5 | generative model | 34 | unsupervised learning | 54 | neural network | 73 | robustness | 79 | federated learning | 81 |
| 6 | unsupervised learning | 32 | optimization | 53 | meta learning | 71 | neural network | 75 | robustness | 71 |
| 7 | recurrent neural network | 28 | representation learning | 51 | natural language processing | 60 | self supervised learning | 70 | neural network | 69 |
| 8 | rnn | 24 | gan | 50 | adversarial example | 55 | generalization | 64 | generalization | 66 |
| 9 | optimization | 23 | meta learning | 50 | generalization | 54 | unsupervised learning | 62 | transformer | 66 |
| 10 | deep reinforcement learning | 23 | adversarial example | 44 | gan | 54 | generative model | 61 | contrastive learning | 62 |
| 11 | natural language processing | 23 | convolutional neural network | 41 | transfer learning | 53 | interpretability | 52 | generative model | 60 |
| 12 | adversarial example | 23 | generalization | 38 | deep reinforcement learning | 50 | natural language processing | 51 | meta learning | 59 |
| 13 | representation learning | 18 | natural language processing | 34 | generative adversarial network | 49 | transfer learning | 51 | computer vision | 56 |
| 14 | generalization | 18 | machine learning | 31 | optimization | 48 | few shot learning | 50 | continual learning | 53 |
| 15 | convolutional neural network | 17 | variational inference | 31 | robustness | 47 | contrastive learning | 50 | transfer learning | 51 |
| 16 | transfer learning | 16 | transfer learning | 31 | unsupervised learning | 47 | gan | 49 | interpretability | 49 |
| 17 | lstm | 15 | deep reinforcement learning | 29 | interpretability | 45 | deep reinforcement learning | 48 | machine learning | 48 |
| 18 | variational inference | 15 | recurrent neural network | 28 | computer vision | 41 | generative adversarial network | 48 | adversarial training | 46 |
| 19 | computer vision | 15 | interpretability | 27 | semi supervised learning | 38 | neural architecture search | 43 | adversarial robustness | 43 |
In 2018, classic keywords such as GAN, RNN, and LSTM ranked near the top, whereas in 2022 the top spots go to keywords such as GNN, federated learning, and transformer.
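Individual keywords are easy to trace across years with the per-year counters built above (the keyword selection here is my own):

# Raw counts per year; a Counter returns 0 for keywords absent in that year
for kw in ["gan", "lstm", "transformer", "graph neural network", "federated learning"]:
    print(f"{kw:30s}", [counter_year[year][kw] for year in years])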
display(train_df["y"].value_counts() / len(train_df))
0    0.693808
1    0.306192
Name: y, dtype: float64
About 30% of the training papers are accepted. Submitting all-zero predictions yields a Public LB score of 0.7016, so the acceptance rate in the test data appears to be roughly the same.
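For comparison, the accuracy of that all-zero baseline on the train set; a minimal sketch:

# Fraction of y == 0 in train, to compare against the Public LB score of 0.7016
print((train_df["y"] == 0).mean())  # 0.693808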