抓取知乎首页动态加载的 URL 时,我使用了循环回调,但是在 run 状态下只执行了一遍,没有继续循环;并且 results 中没有 return 的数据。
def __init__(self):
    """Set up the paging state for the Zhihu home-feed crawl."""
    # Value sent as "start" in the POST params; -1 is used for the first request.
    self.start_num = -1
    # Every feed-item URL collected so far.
    self.question_url = []
    # POST form template; "params" is itself a JSON-encoded string.
    self.data = {"params": '{"offset":10,"start":-1}', "method": "next"}
    # Endpoint that is POSTed to for each further batch of feed items.
    self.next_url = 'https://www.zhihu.com/node/TopStory2FeedList'


def get_taskid(self, task):
    """Return a task id built from the URL *and* the POST payload.

    Every page request goes to the same ``next_url``. Per the pyspider
    documentation the default task id is the MD5 of the URL alone, so in
    run mode the follow-up requests are treated as duplicates and dropped,
    which is why the paging loop ran only once. Hashing the POST data as
    well gives each page its own task id.

    Args:
        task: the task dict built by ``self.crawl``.

    Returns:
        A hex MD5 digest identifying this URL + payload combination.
    """
    import hashlib

    fetch = task.get('fetch') or {}
    payload = json.dumps(fetch.get('data', ''), sort_keys=True)
    return hashlib.md5((task['url'] + payload).encode('utf-8')).hexdigest()


def on_start(self):
    """Schedule the first POST request for the home feed."""
    data = self.change(self.data, self.start_num)
    if data:
        self.crawl(url=self.next_url, method='POST', data=data,
                   callback=self.index_page)


def index_page(self, response):
    """Handle the first dynamically loaded page and schedule the next one.

    Note: some home-feed items are reposts (e.g. column articles) whose
    links follow a different path pattern; ``dpage`` handles both kinds.

    Returns:
        ``{'url': [...]}`` with the links collected so far, or ``None``
        when ``response`` is falsy. Only values returned by the callback
        itself are recorded as results.
    """
    if not response:
        return None
    # Store the links of the freshly loaded page.
    found = self.dpage(response, self.start_num, self.question_url)
    if found:
        # Advance the POST params so that more items get loaded.
        self._crawl_next()
    return {'url': self.question_url}


def detail_page(self, response):
    """Handle every subsequent feed page; re-schedules itself while links appear.

    The paging state travels through ``response.save`` rather than relying
    on instance attributes surviving between callbacks.

    Returns:
        ``{'url': [...]}`` with every link collected so far, or ``None``
        when ``response`` is falsy.
    """
    try:
        if not response:
            print('没有响应')
            return None
        self.start_num = response.save['start']
        self.question_url = response.save['question_url']
        # Extract and store the links of this page.
        found = self.dpage(response, self.start_num, self.question_url)
        if found:
            self._crawl_next()
    except Exception:
        # Best effort: a failure while reading the page is treated as the
        # end of the feed. ``Exception`` (not ``BaseException``) is caught
        # so that KeyboardInterrupt / SystemExit still propagate.
        print('到达页面底部,首页已经加载完毕')
    return {'url': self.question_url}


def _crawl_next(self):
    """Schedule the POST for the next feed page, passing the state via ``save``."""
    data = self.change(self.data, self.start_num)
    self.crawl(url=self.next_url, method="POST", data=data,
               callback=self.detail_page,
               save={'question_url': self.question_url,
                     'start': self.start_num})


def dpage(self, response, start_num, question_url):
    """Collect the feed links found in ``response`` and advance ``start_num``.

    Args:
        response: fetched page whose JSON body carries the HTML fragments.
        start_num: the "start" value that was used for this page.
        question_url: list that the discovered links are appended to.

    Returns:
        The number of links found on this page (0 when nothing matched).
    """
    self.start_num = start_num
    self.question_url = question_url
    links = self.deal(response)('h2.feed-title > a')
    if not links:
        print('匹配出错')
        return 0
    num = 0
    for each in links.items():
        if each.hasClass('question_link'):
            # Links with this class get the site root prepended.
            self.question_url.append('https://www.zhihu.com' + each.attr.href)
        else:
            self.question_url.append(each.attr.href)
        num += 1
    # "start" parameter for the next page to be loaded.
    # NOTE(review): the extra +1 is kept from the original; confirm it
    # against what the endpoint expects.
    self.start_num = self.start_num + num + 1
    for each in self.question_url:
        print(each)
    return num


def questionDeal(self, response):
    """Placeholder for scraping an individual question page (not implemented)."""
    pass


def deal(self, response):
    """Join the HTML fragments in ``response.json['msg']`` and parse them with ``pq``."""
    html = ''.join(response.json['msg'])
    return pq(html)


def change(self, data, start_num):
    """Return a copy of ``data`` whose JSON ``params`` string has a new ``start``.

    The input dict is left untouched, so the shared ``self.data`` template
    is not rewritten on every request.

    Args:
        data: POST form dict whose ``'params'`` value is a JSON string.
        start_num: value to store under ``"start"`` inside ``params``.

    Returns:
        A new dict with the updated ``'params'`` string, or ``None`` (after
        printing a message) when ``data`` is not a dict.
    """
    if not isinstance(data, dict):
        print('出错! 输入的 data 类型不是字典')
        return None
    # Decode the params string, patch "start", and encode it again.
    params = json.loads(data['params'])
    params['start'] = start_num
    updated = dict(data)
    updated['params'] = json.dumps(params)
    return updated
你调用的子函数的返回值,如果没有在 callback 中 return 出来,是不会被捕获到 results 里的。