論文の被引用数予測

研究価値を最大化させるワーディングとは

賞金: 100,000 参加ユーザー数: 182 約3年前に終了

train/test データ読み込み性能比較

json / pickle / csv 形式でのデータ読み込み性能比較

import pandas as pd
import json
import cudf

from pathlib import Path
from tqdm import tqdm

Utilities

import psutil
import time
import sys
import os
import math
from contextlib import contextmanager

@contextmanager
def timer(name: str):
    """Report elapsed time and resident-memory change of the wrapped block.

    When the ``with`` body finishes (normally or by raising), one line is
    written to stderr in the form
    ``[<rss>GB(<sign><delta>GB): <seconds>sec] <name>``, where ``<rss>`` is
    this process's resident set size at exit and ``<delta>`` is its change
    since entry, both in GiB (bytes / 2**30).

    Args:
        name: Label appended to the report line.
    """
    proc = psutil.Process(os.getpid())
    rss_before = proc.memory_info().rss / 2. ** 30
    # perf_counter() is monotonic, so the measured interval cannot be skewed
    # by wall-clock adjustments the way time.time() can.
    started = time.perf_counter()
    try:
        yield
    finally:
        elapsed = time.perf_counter() - started
        rss_after = proc.memory_info().rss / 2. ** 30
        delta = rss_after - rss_before
        # Print the sign separately so the magnitude is always shown unsigned.
        sign = '+' if delta >= 0 else '-'
        print(f"[{rss_after:.1f}GB({sign}{math.fabs(delta):.1f}GB): {elapsed:.3f}sec] {name}", file=sys.stderr)
        
def get_data_iter(fpath, encoding="utf-8"):
    """Lazily yield the lines of a text file, one at a time.

    The file is opened when iteration starts and closed when the generator
    is exhausted or discarded.

    Args:
        fpath: Path of the file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8
            so decoding does not depend on the platform's locale default.

    Yields:
        Each line as a ``str``, including its trailing newline if present.
    """
    with open(fpath, 'r', encoding=encoding) as f:
        yield from f

Define PATH

# Directory holding the competition's raw JSON-lines files.
DATA_PATH = Path("../input/Predicting_the_number_of_citations_to_a_paper/")

# Directory for the converted copies. Create it (and any missing parents)
# right away; an already-existing directory is not an error, so re-running
# this cell is harmless.
FEATURE_PATH = Path("../features/")
os.makedirs(FEATURE_PATH, exist_ok=True)

Convert json to pickle, csv

# Column labels for the converted frames, in the order the fields are taken
# from each JSON record.
columns = ['id', 'submitter', 'authors', 'title', 'comments', 'journal-ref',
           'doi', 'report-no', 'categories', 'license', 'abstract', 'versions',
           'update_date', 'authors_parsed', 'doi_cites']
# Label of the extra, last column that only the training frame gets.
TARGET = 'cites'


def _json_lines_to_frame(fpath, column_names, total=None):
    """Parse a JSON-lines file into a DataFrame with the given column labels.

    Each line is decoded with ``json.loads`` and its values are taken in the
    record's own key order; labels are assigned by position, not by key.
    NOTE(review): this assumes every record lists its fields in exactly the
    order of ``column_names`` -- confirm against the data.

    Args:
        fpath: Path of the JSON-lines file.
        column_names: Labels for the resulting columns, in positional order.
        total: Expected number of lines; only sizes tqdm's progress bar.

    Returns:
        A ``pandas.DataFrame`` with one row per input line.
    """
    # The per-line record is never bound to a module-level name, so the
    # builtin ``dict`` is no longer shadowed for the rest of the session.
    rows = [
        list(json.loads(line).values())
        for line in tqdm(get_data_iter(fpath), total=total)
    ]
    return pd.DataFrame(rows, columns=column_names)


# Training data: same fields as test plus the trailing TARGET column.
train_df = _json_lines_to_frame(
    DATA_PATH / "train_data.json", columns + [TARGET], total=851_524
)
train_df.to_pickle(FEATURE_PATH / "train.pickle")
train_df.to_csv(FEATURE_PATH / "train.csv", index=False)

# Test data: no target column.
test_df = _json_lines_to_frame(
    DATA_PATH / "test_data.json", columns, total=59_084
)
test_df.to_pickle(FEATURE_PATH / "test.pickle")
test_df.to_csv(FEATURE_PATH / "test.csv", index=False)
100%|██████████| 851524/851524 [00:18<00:00, 46858.68it/s]
100%|██████████| 59084/59084 [00:02<00:00, 22387.21it/s]

Read train/test data

# json -> DataFrame
# Benchmark case 1: let pandas parse the raw files directly. lines=True tells
# read_json that each line of the file is a separate JSON object. `timer`
# prints the elapsed time and memory change for this block to stderr.
with timer("Read train/test json data"):
    train_jsdf = pd.read_json(DATA_PATH / "train_data.json", lines=True)
    test_jsdf = pd.read_json(DATA_PATH / "test_data.json", lines=True)
[10.3GB(+6.2GB): 23.859sec] Read train/test json data
# csv -> DataFrame
# Benchmark case 2: read back the CSV copies stored under FEATURE_PATH.
# Note that this rebinds train_df / test_df, replacing the frames that were
# built during the conversion step.
with timer("Read train/test csv data"):
    train_df = pd.read_csv(FEATURE_PATH / "train.csv")
    test_df = pd.read_csv(FEATURE_PATH / "test.csv")
[10.9GB(+0.6GB): 10.717sec] Read train/test csv data
# pickle -> DataFrame
# Benchmark case 3: read back the pickle copies stored under FEATURE_PATH.
# NOTE(review): unpickling can execute arbitrary code, so only load pickle
# files from a trusted source such as this notebook's own output.
with timer("Read train/test pickle data"):
    train_pkdf = pd.read_pickle(FEATURE_PATH / "train.pickle")
    test_pkdf = pd.read_pickle(FEATURE_PATH / "test.pickle")
[10.9GB(+0.0GB): 10.191sec] Read train/test pickle data
# csv -> DataFrame by cudf
# Benchmark case 4: parse the same CSV copies with cudf, then convert each
# result with to_pandas() so the end product is a pandas DataFrame like the
# other cases. The conversion is inside the timed block, so it is counted.
with timer("Read train/test csv data by cudf"):
    train_cudf = cudf.read_csv(FEATURE_PATH / "train.csv").to_pandas()
    test_cudf = cudf.read_csv(FEATURE_PATH / "test.csv").to_pandas()
[13.9GB(+3.0GB): 6.307sec] Read train/test csv data by cudf

添付データ

  • perf_comp.ipynb
  • Favicon
    new user
    コメントするには 新規登録 もしくは ログイン が必要です。