I need the spider to accept the start URL to crawl from as a parameter, but I'm not sure what the right way to receive it is. With the code below, however, I get this error:
File "/usr/local/lib/python2.7/site-packages/scrapy/contrib/spiders/crawl.py", line 67, in _parse_response
cb_res = callback(response, **cb_kwargs) or ()
exceptions.TypeError: 'str' object is not callable
The spider:

import scrapy
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors import LinkExtractor
import os

class dmozspider(CrawlSpider):
    name = 'dmoz'

    def __init__(self, starturl='www.wooyun.org'):
        self.start_urls = ['http://' + starturl]
        self.allowed_domains = [starturl]
        self._rules = (
            Rule(LinkExtractor(allow=()), callback="parse_item", follow=True),)
        # start_urls = ['http://www.wooyun.org']
        # allowed_domains = ['www.wooyun.org']
        # rules = (
        #     Rule(LinkExtractor(allow=()), callback="parse_item", follow=True),)

    def parse_item(self, response):
        print response.url
        # append each crawled URL to url.txt; a callback should return
        # items/requests or None, so nothing is returned here
        with open('url.txt', 'a') as results:
            results.write(response.url + os.linesep)
Solved it: the version below works. It turned out I had redefined __init__ without calling super. CrawlSpider.__init__ runs _compile_rules(), which replaces the string callback "parse_item" in the rules with the actual bound method; bypassing it (and assigning self._rules directly) left the callback as a plain string, hence the TypeError above.
class dmozspider(CrawlSpider):
    name = 'dmoz'
    rules = (
        Rule(LinkExtractor(allow=()), callback="parse_item", follow=True),)
    # start_urls = ['http://www.wooyun.org']

    def __init__(self, starturl='www.wooyun.org', *args, **kwargs):
        super(dmozspider, self).__init__(*args, **kwargs)
        self.start_urls = ['http://' + starturl]
        self.allowed_domains = [starturl]
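For anyone else passing arguments this way: scrapy crawl forwards each -a name=value option to the spider's __init__ as a keyword argument, so (assuming the spider above lives in a Scrapy project) the crawl can be started with a line like:

scrapy crawl dmoz -a starturl=www.wooyun.org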