GNNRecom/gnnrec/kgrec/data/preprocess/utils.py

import os
import tempfile
import zipfile

from gnnrec.kgrec.utils import iter_json


def iter_lines(raw_path, data_type):
    """依次迭代OAG数据集某种类型数据所有txt文件的每一行并将JSON解析为字典

    :param raw_path: str 原始zip文件所在目录
    :param data_type: str 数据类型，author, paper, venue, affiliation之一
    :return: Iterable[dict]
    """
    with tempfile.TemporaryDirectory() as tmp:
        for zip_file in os.listdir(raw_path):
            if zip_file.startswith(f'mag_{data_type}s'):
                with zipfile.ZipFile(os.path.join(raw_path, zip_file)) as z:
                    for txt in z.namelist():
                        print(f'{zip_file}\\{txt}')
                        txt_file = z.extract(txt, tmp)
                        yield from iter_json(txt_file)
                        os.remove(txt_file)