GNNRecom/gnnrec/kgrec/data/preprocess/utils.py

24 lines
873 B
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import os
import tempfile
import zipfile
from gnnrec.kgrec.utils import iter_json
def iter_lines(raw_path, data_type):
    """Iterate over the records of every text file of one OAG data type.

    Every archive in ``raw_path`` whose file name starts with
    ``mag_{data_type}s`` is opened, its members are extracted one at a time
    into a temporary directory, and whatever ``iter_json`` yields for the
    extracted file is passed through to the caller. ``iter_json`` is defined
    elsewhere; it is presumably yielding one dict per JSON line of the file.

    :param raw_path: str, directory containing the raw zip files
    :param data_type: str, one of author, paper, venue, affiliation
    :return: Iterable[dict]
    """
    prefix = f'mag_{data_type}s'
    with tempfile.TemporaryDirectory() as tmp:
        for zip_file in os.listdir(raw_path):
            if not zip_file.startswith(prefix):
                continue
            with zipfile.ZipFile(os.path.join(raw_path, zip_file)) as z:
                for member in z.infolist():
                    # Directory entries hold no data: extracting one returns
                    # a directory path, which cannot be parsed as a file and
                    # makes os.remove() raise, so skip them.
                    if member.is_dir():
                        continue
                    print(f'{zip_file}\\{member.filename}')
                    txt_file = z.extract(member, tmp)
                    yield from iter_json(txt_file)
                    # Delete eagerly so at most one extracted file is on disk
                    # at a time; if the generator is abandoned mid-file, the
                    # TemporaryDirectory still removes the leftover on exit.
                    os.remove(txt_file)