退会したユーザー
import pandas as pd
import numpy as np
import os
import sys
import json
import pickle
from pathlib import Path
from tqdm import tqdm
# Directory the raw competition files (train_data.json / test_data.json) are read from.
DATA_PATH = Path("../input/Predicting_the_number_of_citations_to_a_paper/")
# Directory the pickled DataFrames are written to; created up front so the
# later pickle dumps do not fail on a missing folder.
FEATURE_PATH = Path("../features/")
FEATURE_PATH.mkdir(parents=True, exist_ok=True)
def get_data_iter(fpath):
    """Lazily yield the lines of a text file, one at a time.

    The file is opened only when iteration starts and is closed once the
    generator is exhausted (or garbage-collected), so arbitrarily large
    files can be streamed without loading them into memory.

    Args:
        fpath: Path of the text file to read.

    Yields:
        Each line of the file as ``str``, including its trailing newline
        when one is present.
    """
    # Decode explicitly as UTF-8: without it the platform's locale encoding
    # is used, which can raise or garble non-ASCII text on some systems.
    with open(fpath, 'r', encoding='utf-8') as f:
        yield from f
# Field names used as DataFrame column names for both the train and test data.
columns = ['id', 'submitter', 'authors', 'title', 'comments', 'journal-ref',
'doi', 'report-no', 'categories', 'license', 'abstract', 'versions',
'update_date', 'authors_parsed', 'doi_cites']
# Name of the additional column appended after `columns` for the training data only.
TARGET = 'cites'
# Load the training set: every line of train_data.json is one JSON object,
# which becomes one row of train_df.
train_columns = columns + [TARGET]
tmp = []
train_iter = get_data_iter(DATA_PATH / "train_data.json")
for line in tqdm(train_iter, total=851_524):
    # Named `record`, not `dict`, so the builtin is not shadowed for the
    # rest of the session.
    record = json.loads(line)
    # Pick values by key rather than by position, so each value lands in
    # its own column even if a record's key order differs or a key is
    # absent (an absent key yields None for that cell).
    tmp.append([record.get(col) for col in train_columns])
train_df = pd.DataFrame(tmp, columns=train_columns)
# Cache the parsed frame so later runs can skip the JSON parsing.
with open(FEATURE_PATH / "train.pickle", 'wb') as f:
    pickle.dump(train_df, f)
# Load the test set: every line of test_data.json is one JSON object,
# which becomes one row of test_df (no target column here).
tmp = []
test_iter = get_data_iter(DATA_PATH / "test_data.json")
for line in tqdm(test_iter, total=59_084):
    record = json.loads(line)
    # Pick values by key rather than by position, so each value lands in
    # its own column even if a record's key order differs or a key is
    # absent (an absent key yields None for that cell).
    tmp.append([record.get(col) for col in columns])
test_df = pd.DataFrame(tmp, columns=columns)
# Cache the parsed frame so later runs can skip the JSON parsing.
with open(FEATURE_PATH / "test.pickle", 'wb') as f:
    pickle.dump(test_df, f)
100%|██████████| 851524/851524 [00:33<00:00, 25126.47it/s]
100%|██████████| 59084/59084 [00:01<00:00, 46464.33it/s]
# Sanity check: show the training frame's (rows, columns) shape and its leading rows.
display(train_df.shape)
display(train_df.head())
(851524, 16)
| | id | submitter | authors | title | comments | journal-ref | doi | report-no | categories | license | abstract | versions | update_date | authors_parsed | doi_cites | cites |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | hep-ph/9902295 | Michael Kraemer | Mark E. Hayes (University College London) and ... | Heavy-Flavour Production at HERA | LaTeX, 21 pages, 13 Postscript figures. Summar... | J.Phys.G25:1477-1493,1999 | 10.1088/0954-3899/25/7/332 | CERN-TH/99-30, UCL/HEP 99-03 | hep-ph hep-ex | None | We review the theoretical and experimental s... | [{'version': 'v1', 'created': 'Wed, 10 Feb 199... | 2008-11-26 | [[Hayes, Mark E., , University College London]... | 1 | NaN |
1 | 1403.7138 | Aigen Li | Qi Li, S.L. Liang, Aigen Li (University of Mis... | Spectropolarimetric Constraints on the Nature ... | 5 pages, 2 figures; accepted for publication i... | None | 10.1093/mnrasl/slu021 | None | astro-ph.GA | http://arxiv.org/licenses/nonexclusive-distrib... | While it is well recognized that interstella... | [{'version': 'v1', 'created': 'Thu, 27 Mar 201... | 2015-06-19 | [[Li, Qi, , University of Missouri], [Liang, S... | 8 | 7.0 |
2 | 1405.5857 | Michael Mortonson | Michael J. Mortonson, Uro\v{s} Seljak | A joint analysis of Planck and BICEP2 B modes ... | 13 pages, 4 figures; submitted to JCAP; refere... | JCAP10(2014)035 | 10.1088/1475-7516/2014/10/035 | None | astro-ph.CO gr-qc hep-ph hep-th | http://arxiv.org/licenses/nonexclusive-distrib... | We analyze BICEP2 and Planck data using a mo... | [{'version': 'v1', 'created': 'Thu, 22 May 201... | 2014-10-17 | [[Mortonson, Michael J., ], [Seljak, Uroš, ]] | 122 | 188.0 |
3 | 1807.01034 | Evangelos Thomas Karamatskos | Evangelos T. Karamatskos, Sebastian Raabe, Ter... | Molecular movie of ultrafast coherent rotation... | 9 Figures | Nat Commun 10, 3364 (2019) | 10.1038/s41467-019-11122-y | None | physics.chem-ph physics.atom-ph quant-ph | http://arxiv.org/licenses/nonexclusive-distrib... | Recording molecular movies on ultrafast time... | [{'version': 'v1', 'created': 'Tue, 3 Jul 2018... | 2020-05-19 | [[Karamatskos, Evangelos T., ], [Raabe, Sebast... | 6 | 8.0 |
4 | 1905.05921 | Juanjuan Gu | Juanjuan Gu and Yun Jing | A Modified Mixed Domain Method for Modeling Ac... | None | None | 10.1121/10.0001454 | None | physics.med-ph physics.comp-ph | http://arxiv.org/licenses/nonexclusive-distrib... | In this paper, phase correction and amplitud... | [{'version': 'v1', 'created': 'Wed, 15 May 201... | 2020-07-15 | [[Gu, Juanjuan, ], [Jing, Yun, ]] | 0 | NaN |
# Sanity check: show the test frame's (rows, columns) shape and its leading rows.
display(test_df.shape)
display(test_df.head())
(59084, 15)
| | id | submitter | authors | title | comments | journal-ref | doi | report-no | categories | license | abstract | versions | update_date | authors_parsed | doi_cites |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1605.00995 | Simonetta Abenda | Simonetta Abenda | On a family of KP multi-line solitons associat... | 48 pages, 5 figures. Revised manuscript. Added... | J. Geom. Phys. 119 (2017), 112-138 | 10.1016/j.geomphys.2017.04.005 | None | math-ph math.MP | http://arxiv.org/licenses/nonexclusive-distrib... | We classify the soliton data in the totally ... | [{'version': 'v1', 'created': 'Tue, 3 May 2016... | 2019-06-27 | [[Abenda, Simonetta, ]] | 5 |
1 | 1206.6911 | Hanqing Zheng | L. Y. Dai, Meng Shi, Guang-Yi Tang, H. Q. Zheng | On the Nature of X(4260) | Refined analysis with new experimental data in... | Phys. Rev. D 92, 014020 (2015) | 10.1103/PhysRevD.92.014020 | None | hep-ph hep-ex | http://arxiv.org/licenses/nonexclusive-distrib... | We study the property of $X(4260)$ resonance... | [{'version': 'v1', 'created': 'Thu, 28 Jun 201... | 2015-07-22 | [[Dai, L. Y., ], [Shi, Meng, ], [Tang, Guang-Y... | 23 |
2 | cond-mat/0504055 | Haim Diamant | B. Lin, M. Meron, B. Cui, S. A. Rice, H. Diamant | From random walk to single-file diffusion | 4 pages, 4 figures | Phys Rev Lett 94, 216001 (2005) | 10.1103/PhysRevLett.94.216001 | None | cond-mat.soft cond-mat.mtrl-sci physics.chem-ph | None | We report an experimental study of diffusion... | [{'version': 'v1', 'created': 'Sun, 3 Apr 2005... | 2007-05-23 | [[Lin, B., ], [Meron, M., ], [Cui, B., ], [Ric... | 93 |
3 | astro-ph/9907297 | Tod E. Strohmayer | Tod E. Strohmayer | Spin Down of Pulsations in the Cooling Tail of... | 16 pages, AASTEX preprint with 7 embedded figu... | None | 10.1086/312258 | None | astro-ph | None | We report the discovery with the proportiona... | [{'version': 'v1', 'created': 'Wed, 21 Jul 199... | 2009-10-31 | [[Strohmayer, Tod E., ]] | 24 |
4 | 1104.5407 | Lie-Wen Chen | Lie-Wen Chen, Jian-Zhong Gu | Correlations between the nuclear breathing mod... | 9 pages, 6 figures. Discussions and references... | J.Phys.G39:035104,2012 | 10.1088/0954-3899/39/3/035104 | None | nucl-th astro-ph.SR nucl-ex | http://arxiv.org/licenses/nonexclusive-distrib... | Based on microscopic Hartree-Fock + random p... | [{'version': 'v1', 'created': 'Thu, 28 Apr 201... | 2012-03-27 | [[Chen, Lie-Wen, ], [Gu, Jian-Zhong, ]] | 12 |
szdr
読み込みは、pandasのread_jsonを利用すると簡単かもしれません。1行に1レコードずつJSONが並んだ形式なので、`pd.read_json(path, lines=True)` のように lines=True を指定します。 https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_json.html