skywalker
import warnings warnings.filterwarnings('ignore') import pandas as pd import matplotlib.pyplot as plt import japanize_matplotlib from matplotlib_venn import venn2 import os from tqdm import tqdm_notebook as tqdm
# Directory that holds the raw CSV inputs (trailing slash is required because
# the file names are appended by plain string concatenation).
INPUT_DIR = './raw/'

# Load both tables and report their (rows, columns) shapes.
Train = pd.read_csv(f'{INPUT_DIR}train_data.csv')
Test = pd.read_csv(f'{INPUT_DIR}test_data.csv')
print('Train:', Train.shape, ' Test:', Test.shape)
Train: (356344, 28) Test: (34844, 27)
def draw_venn(df1, df2):
    """Plot one Venn diagram per column comparing the unique values of two frames.

    The columns examined are those of whichever frame has fewer columns
    (``df1`` on a tie), restricted to names that also exist in the other
    frame, so a column present on only one side is skipped instead of
    raising ``KeyError``. Diagrams are laid out four per row, saved to
    ``venn.png`` in the current working directory, and then shown.

    Args:
        df1: First DataFrame; its circle is labelled 'Train'.
        df2: Second DataFrame; its circle is labelled 'Test'.

    Returns:
        The list of column names that were plotted, in the order they
        appear in the frame with fewer columns.
    """
    df1_col = df1.columns.tolist()
    df2_col = df2.columns.tolist()

    # Use the shorter column list as the base (df1 wins a tie) ...
    if len(df1_col) > len(df2_col):
        base_col, other_col = df2_col, df1_col
    else:
        base_col, other_col = df1_col, df2_col
    # ... but keep only names both frames actually have, since every plotted
    # column is looked up in df1 *and* df2 below.
    other_names = set(other_col)
    df_col = [col for col in base_col if col in other_names]

    plt.figure(figsize=(20, 20), facecolor='w')
    c = 4
    # Ceiling division: exactly enough rows, with no empty trailing row when
    # the number of columns is a multiple of c.
    r = (len(df_col) + c - 1) // c

    # Wrap the list itself (not enumerate) so tqdm can read its length and
    # show a real total instead of an indeterminate bar.
    for i, col in enumerate(tqdm(df_col)):
        plt.subplot(r, c, i + 1)
        s1 = set(df1[col].unique().tolist())
        s2 = set(df2[col].unique().tolist())
        venn2(subsets=[s1, s2], set_labels=['Train', 'Test'])
        plt.title(str(col), fontsize=14)

    plt.savefig('venn.png', bbox_inches='tight')
    plt.show()
    return df_col
# Draw the per-column Train/Test Venn diagrams; df_col receives the list of
# column names that draw_venn returns.
df_col = draw_venn(Train, Test)
HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))