GNNRecom/gnnrec/kgrec/data/preprocess/ai2000_crawler.py

73 lines
2.5 KiB
Python

import json
import os
from collections import defaultdict
import scrapy
from itemadapter import ItemAdapter
class ScholarItem(scrapy.Item):
name = scrapy.Field()
org = scrapy.Field()
field = scrapy.Field()
rank = scrapy.Field()
class AI2000Spider(scrapy.Spider):
name = 'ai2000'
allowed_domains = ['aminer.cn']
custom_settings = {
'DEFAULT_REQUEST_HEADERS': {
'Content-Type': 'application/json',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'
},
'DOWNLOAD_DELAY': 20,
'ITEM_PIPELINES': {'ai2000_crawler.JsonWriterPipeline': 0}
}
def __init__(self, save_path, *args, **kwargs):
super().__init__(*args, **kwargs)
self.save_path = save_path
def start_requests(self):
return [scrapy.Request(
'https://apiv2.aminer.cn/magic?a=__mostinfluentialscholars.GetDomainList___',
callback=self.parse_domain_list, method='POST',
body='[{"action":"mostinfluentialscholars.GetDomainList","parameters":{"year":2019}}]'
)]
def parse_domain_list(self, response):
domains = json.loads(response.body)['data'][0]['item']
body_fmt = '[{"action":"ai2000v2.GetDomainTopScholars","parameters":{"year_filter":2020,"domain":"%s","top_n":100,"type":"AI 2000"}}]'
for domain in domains:
yield scrapy.Request(
'https://apiv2.aminer.cn/magic?a=__ai2000v2.GetDomainTopScholars___',
method='POST', body=body_fmt % domain['id'],
cb_kwargs={'domain_name': domain['name']}
)
def parse(self, response, **kwargs):
domain_name = kwargs['domain_name']
scholars = json.loads(response.body)['data'][0]['data']
for i, scholar in enumerate(scholars[:100]):
yield ScholarItem(
name=scholar['person']['name'], org=scholar['org_en'],
field=domain_name, rank=i
)
class JsonWriterPipeline:
def open_spider(self, spider):
self.scholar_rank = defaultdict(lambda: [None] * 100)
self.save_path = spider.save_path
def process_item(self, item, spider):
scholar = ItemAdapter(item).asdict()
self.scholar_rank[scholar.pop('field')][scholar.pop('rank')] = scholar
return item
def close_spider(self, spider):
with open(os.path.join(self.save_path), 'w', encoding='utf8') as f:
json.dump(self.scholar_rank, f, ensure_ascii=False)