Take a closer look: the default method is probably what is wrong here; it should be start_requests. After fixing a few other bugs, it should be able to crawl. The modified source code is below:
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
import scrapy
from scrapy.http import Request

from diandian.items import diandianitem


class myspider(scrapy.Spider):
    name = 'novel'
    allowed_domains = ['23wx.com']

    def start_requests(self):
        # request the first page of each of the ten categories
        for i in range(1, 11):
            url = 'http://www.23wx.com/class/{}_1.html'.format(i)
            yield Request(url, callback=self.getallurl)
    # get the URL of each category

    def getallurl(self, response):
        soup = BeautifulSoup(response.body, 'lxml')
        # the category id comes from the "first page" link, the page count from the "last page" link
        id = soup.select('.first')[0]['href'].split('_')[0].split('/')[-1]
        maxnumber = soup.select('.last')[0].text
        for j in range(1, int(maxnumber) + 1):
            url = 'http://www.23wx.com/class/{}_{}.html'.format(id, j)
            yield Request(url, callback=self.getdetail_url)
    # get the URLs of every page in each category

    def getdetail_url(self, response):
        for each in BeautifulSoup(response.body, 'lxml').find_all(bgcolor="#FFFFFF"):
            detailurl = each.find_all('td', class_='L')[0].find_all('a')[0]['href']
            yield Request(detailurl, callback=self.parse)
    # get the URL of each individual novel

    def parse(self, response):
        items = diandianitem()
        soup = BeautifulSoup(response.body, 'lxml')
        items['name'] = soup.select('#content dd h1')[0].text.split(' ')[0]
        t1 = soup.find_all('table', cellspacing="1")[0].find_all('tr')[0]
        items['category'] = t1.find_all('a')[0].text
        items['author'] = t1.find_all('td')[1].text.strip()
        items['condition'] = t1.find_all('td')[2].text.strip()
        t2 = soup.find_all('table', cellspacing="1")[0].find_all('tr')[1]
        items['save'] = t2.find_all('td')[0].text.strip()
        items['length'] = t2.find_all('td')[1].text.strip()
        items['last_updated'] = t2.find_all('td')[2].text.strip()
        yield items
    # get the detailed information of the novel
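The spider imports diandianitem from diandian.items, which is not shown in the question. A minimal sketch of what diandian/items.py would need to contain, assuming the field names are exactly the keys the spider fills in:

import scrapy

class diandianitem(scrapy.Item):
    # one Field per key assigned in the spider's parse method
    name = scrapy.Field()
    category = scrapy.Field()
    author = scrapy.Field()
    condition = scrapy.Field()
    save = scrapy.Field()
    length = scrapy.Field()
    last_updated = scrapy.Field()

With that in place you can run the spider from the project directory with scrapy crawl novel -o novels.json to write the scraped items to a JSON file.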
As soon as I saw that it was something from Baidu, I knew it would be hard to crawl. It has basically been given anti-crawling protection, so the normal approach definitely will not work. Analyzing it in detail looks like a difficult process. I wish you good luck.
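If you still want to attempt it, the usual first step against basic anti-crawling checks is to slow the spider down and send a browser-like User-Agent. A minimal sketch of the relevant Scrapy settings.py entries (the values here are only illustrative, not tuned for this particular site):

# settings.py
ROBOTSTXT_OBEY = False              # the target's robots.txt may disallow crawlers outright
DOWNLOAD_DELAY = 2                  # pause between requests so rate limiting does not ban you
CONCURRENT_REQUESTS_PER_DOMAIN = 1  # keep the request rate low
COOKIES_ENABLED = True              # some anti-bot checks set and then verify cookies
AUTOTHROTTLE_ENABLED = True         # back off automatically when the server slows down
USER_AGENT = ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
              'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0 Safari/537.36')

If it still fails after that, the page is probably rendered or protected by JavaScript, and you would need something heavier such as Selenium or Splash.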