24 lines
873 B
Python
24 lines
873 B
Python
|
import os
|
|||
|
import tempfile
|
|||
|
import zipfile
|
|||
|
|
|||
|
from gnnrec.kgrec.utils import iter_json
|
|||
|
|
|||
|
|
|||
|
def iter_lines(raw_path, data_type):
    """Iterate over the records of one OAG data type, archive by archive.

    Every entry of ``raw_path`` whose name starts with ``mag_{data_type}s`` is
    opened as a zip archive. Each file member of the archive is extracted into
    a temporary directory, handed to ``iter_json`` and deleted again once it
    has been fully consumed. Directory entries inside an archive are skipped.

    :param raw_path: str directory containing the raw zip files
    :param data_type: str data type, one of author, paper, venue, affiliation
    :return: Iterable[dict] the items produced by ``iter_json`` for each
        extracted file (per the original documentation: one dict per JSON
        line of each txt file)
    """
    prefix = f'mag_{data_type}s'
    with tempfile.TemporaryDirectory() as tmp:
        # os.listdir() returns names in arbitrary order; sorting makes the
        # order in which archives are visited reproducible across runs.
        for zip_file in sorted(os.listdir(raw_path)):
            if not zip_file.startswith(prefix):
                continue
            with zipfile.ZipFile(os.path.join(raw_path, zip_file)) as z:
                for member in z.infolist():
                    # A directory entry would be extracted as a directory,
                    # which can neither be read as a file nor passed to
                    # os.remove().
                    if member.is_dir():
                        continue
                    txt = member.filename
                    print(f'{zip_file}\\{txt}')
                    txt_file = z.extract(member, tmp)
                    yield from iter_json(txt_file)
                    # Delete eagerly so that only one extracted file exists
                    # on disk at a time; the temporary directory itself is
                    # removed when the generator finishes or is closed.
                    os.remove(txt_file)
|