python - how does a Scrapy spider accept user arguments?
高洛峰 2017-04-17 17:15:07

I need the spider to accept a user-specified start URL for the crawl, but I'm not sure what the right way to pass in the argument is. The code below fails with this error:

File "/usr/local/lib/python2.7/site-packages/scrapy/contrib/spiders/crawl.py", line 67, in _parse_response
    cb_res = callback(response, **cb_kwargs) or ()
exceptions.TypeError: 'str' object is not callable
import scrapy
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors import LinkExtractor
import os
class dmozspider(CrawlSpider):
    name = 'dmoz'
    def __init__(self,starturl='www.wooyun.org'):
        self.start_urls = ['http://'+starturl]
        self.allowed_domains = [starturl]
        self._rules = (
        Rule(LinkExtractor(allow=()), callback="parse_item", follow = True),)
    # start_urls=['http://www.wooyun.org']
    # allowed_domains=['www.wooyun.org']
    # rules=(
    #     Rule(LinkExtractor(allow=()), callback="parse_item", follow = True),)

    def parse_item(self, response):
        print response.url
        results = open('url.txt','a')
        results.write(response.url+ os.linesep)
        return True

Solved it; the version below works. The problem was that I redefined __init__ without calling super, so CrawlSpider's own __init__ (which compiles the rules and resolves the callback names into methods) never ran.

class dmozspider(CrawlSpider):
    name = 'dmoz'
    rules = (
        Rule(LinkExtractor(allow=()), callback="parse_item", follow=True),)
    # start_urls=['http://www.wooyun.org']
    def __init__(self, starturl='www.wooyun.org', *args, **kwargs):
        # Calling super() lets CrawlSpider compile the rules and turn the
        # "parse_item" callback name into a bound method; skipping it is
        # what caused the "'str' object is not callable" error.
        super(dmozspider, self).__init__(*args, **kwargs)
        self.start_urls = ['http://' + starturl]
        self.allowed_domains = [starturl]
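
For completeness, a minimal sketch of how the argument can actually be supplied at run time (the domain www.example.com and the settings dict below are placeholders, not part of the original post). Scrapy passes command-line options given with -a to the spider's __init__ as keyword arguments, and the same spider can also be driven from a standalone script with CrawlerProcess:

# From the command line, -a values arrive in __init__ as keyword arguments:
#   scrapy crawl dmoz -a starturl=www.example.com
#
# Or run the spider from a plain Python script:
from scrapy.crawler import CrawlerProcess

process = CrawlerProcess({'USER_AGENT': 'Mozilla/5.0 (compatible; dmozspider)'})
process.crawl(dmozspider, starturl='www.example.com')  # kwargs go to __init__
process.start()  # blocks until the crawl is finished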