What wording maximizes the value of research?
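Sharing a quick comparison of ways to load the competition data: convert the original line-delimited json once to csv and pickle, then measure read time and memory usage for pandas read_json / read_csv / read_pickle and for cudf.read_csv.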
import pandas as pd
import json
import cudf
from pathlib import Path
from tqdm import tqdm
import psutil
import time
import sys
import os
import math
from contextlib import contextmanager


@contextmanager
def timer(name: str):
    """Log the current process RSS (GiB), its delta, and elapsed time to stderr."""
    t0 = time.time()
    p = psutil.Process(os.getpid())
    m0 = p.memory_info().rss / 2. ** 30
    try:
        yield
    finally:
        m1 = p.memory_info().rss / 2. ** 30
        delta = m1 - m0
        sign = '+' if delta >= 0 else '-'
        delta = math.fabs(delta)
        print(f"[{m1:.1f}GB({sign}{delta:.1f}GB): {time.time() - t0:.3f}sec] {name}",
              file=sys.stderr)


def get_data_iter(fpath):
    # Yield the file one json line at a time to keep peak memory low.
    with open(fpath, 'r') as f:
        for line in f:
            yield line
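For reference, the context manager wraps any block and reports memory and wall time in one line; "some_file.csv" below is just a hypothetical placeholder, not part of the competition data:

with timer("read some csv"):
    df = pd.read_csv("some_file.csv")  # hypothetical path, for illustration only
# prints e.g. [10.3GB(+0.6GB): 5.123sec] read some csv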
DATA_PATH = Path("../input/Predicting_the_number_of_citations_to_a_paper/")
FEATURE_PATH = Path("../features/")
FEATURE_PATH.mkdir(parents=True, exist_ok=True)
columns = ['id', 'submitter', 'authors', 'title', 'comments', 'journal-ref',
           'doi', 'report-no', 'categories', 'license', 'abstract',
           'versions', 'update_date', 'authors_parsed', 'doi_cites']
TARGET = 'cites'

# train: parse line-delimited json, then save as both pickle and csv
tmp = []
train_iter = get_data_iter(DATA_PATH / "train_data.json")
for line in tqdm(train_iter, total=851_524):
    record = json.loads(line)  # renamed from `dict`, which shadowed the builtin
    tmp.append(list(record.values()))
train_df = pd.DataFrame(tmp, columns=columns + [TARGET])
train_df.to_pickle(FEATURE_PATH / "train.pickle")
train_df.to_csv(FEATURE_PATH / "train.csv", index=False)

# test: same, but without the target column
tmp = []
test_iter = get_data_iter(DATA_PATH / "test_data.json")
for line in tqdm(test_iter, total=59_084):
    tmp.append(list(json.loads(line).values()))
test_df = pd.DataFrame(tmp, columns=columns)
test_df.to_pickle(FEATURE_PATH / "test.pickle")
test_df.to_csv(FEATURE_PATH / "test.csv", index=False)
100%|██████████| 851524/851524 [00:18<00:00, 46858.68it/s]
100%|██████████| 59084/59084 [00:02<00:00, 22387.21it/s]
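With the csv and pickle copies in place, the same data can now be read back four ways and measured with the timer defined above: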
# json -> DataFrame
with timer("Read train/test json data"):
    train_jsdf = pd.read_json(DATA_PATH / "train_data.json", lines=True)
    test_jsdf = pd.read_json(DATA_PATH / "test_data.json", lines=True)
[10.3GB(+6.2GB): 23.859sec] Read train/test json data
# csv -> DataFrame
with timer("Read train/test csv data"):
    train_df = pd.read_csv(FEATURE_PATH / "train.csv")
    test_df = pd.read_csv(FEATURE_PATH / "test.csv")
[10.9GB(+0.6GB): 10.717sec] Read train/test csv data
# pickle -> DataFrame
with timer("Read train/test pickle data"):
    train_pkdf = pd.read_pickle(FEATURE_PATH / "train.pickle")
    test_pkdf = pd.read_pickle(FEATURE_PATH / "test.pickle")
[10.9GB(+0.0GB): 10.191sec] Read train/test pickle data
# csv -> DataFrame by cudf (read on GPU, then copy back to pandas)
with timer("Read train/test csv data by cudf"):
    train_cudf = cudf.read_csv(FEATURE_PATH / "train.csv").to_pandas()
    test_cudf = cudf.read_csv(FEATURE_PATH / "test.csv").to_pandas()
[13.9GB(+3.0GB): 6.307sec] Read train/test csv data by cudf
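To summarize the runs above: cudf.read_csv was the fastest at about 6.3 sec (with roughly 3GB of extra host memory after the to_pandas() copy), pandas read_csv and read_pickle both took roughly 10 sec, and pd.read_json was the slowest at about 24 sec. Note that the memory figures are RSS deltas from one back-to-back session, with the earlier DataFrames still resident, so treat them as rough indications rather than isolated per-format costs.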