import re
from urllib import request
from urllib import error


class Mooc:
    """Helper for reading course pages from mooc1.chaoxing.com.

    The URL templates below carry ``{{courseId}}``, ``{{knowledgeId}}`` and
    ``{{workId}}`` placeholders that the methods fill in with plain
    ``str.replace`` calls.
    """

    # Course landing page.
    urlInit = 'https://mooc1.chaoxing.com/course/{{courseId}}.html'
    # Detail page of a single knowledge node inside a course.
    urlK = 'https://mooc1.chaoxing.com/nodedetailcontroller/visitnodedetail?courseId={{courseId}}&knowledgeId={{knowledgeId}}'
    # Endpoint queried for the questions of one work item.
    workUrl = 'https://mooc1.chaoxing.com/api/selectWorkQuestion?workId={{workId}}&ut=null&classId=0&courseId={{courseId}}&utenc=null'
    # Request headers sent with every fetch.
    headers = {
        'User-Agent': r'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
                      r'Chrome/45.0.2454.85 Safari/537.36 115Browser/6.0.3'
    }

    def __returnWorkUrl(self, courseId: str, workId: str) -> str:
        """Return ``workUrl`` with its course and work placeholders filled in.

        Args:
            courseId: Text substituted for ``{{courseId}}``.
            workId: Text substituted for ``{{workId}}``.
        """
        url = self.workUrl.replace('{{courseId}}', courseId).replace(
            '{{workId}}', workId)
        return url

    def __getRequest(self, url: str) -> str:
        """Fetch *url* with the class headers and return the body as text.

        The response body is decoded as UTF-8.

        Raises:
            SystemExit: When ``urlopen`` raises ``URLError``; the reason is
                printed first and the exit status is 1.
        """
        req = request.Request(url, headers=Mooc.headers)
        try:
            # The context manager closes the response even if read() or
            # decode() fails, so the connection is never leaked.
            with request.urlopen(req) as response:
                return response.read().decode('utf-8')
        except error.URLError as e:
            print('courseId可能不存在哦!', e.reason)
            # Raise SystemExit directly: the exit() helper is injected by the
            # `site` module and may be missing. A failed fetch also reports a
            # non-zero status.
            raise SystemExit(1)

    def __getFristData(self, courseId: str):
        """Look up the first knowledgeId referenced on the course page.

        Args:
            courseId: Text substituted for ``{{courseId}}`` in ``urlInit``.
        """
        # Build the initial course URL and fetch the page that should contain
        # the first knowledge link.
        url = self.urlInit.replace('{{courseId}}', courseId)
        htmls = self.__getRequest(url)
        # Commented-out earlier copy of the search pattern:
        # re_rule = 'courseId='+courseId+'&knowledgeId=(.*)">'
        # NOTE(review): the remaining statements of this method follow on the
        # next source lines.
#
re_rule = 'courseId='+courseId+'&knowledgeId=(.*)">' url_frist = re.findall(re_rule, htmls) if len(url_frist) > 0: return url_frist[0] else: print('courseId错误!') def __returnTitle(self, courseId, knowledgeId): url = self.urlK.replace('{{courseId}}', courseId).replace( '{{knowledgeId}}', knowledgeId) htmls = self.__getRequest(url) re_rule = '":"work-(.*?)"' wordId = re.findall(re_rule, htmls) wordId = list(set(wordId)) # 先转集合,再转队列 去重复 title = [] for x in wordId: wordUrl = self.__returnWorkUrl(courseId, x) html_work = self.__getRequest(wordUrl) title_rule = '