用 Python 模拟登录一个网站,一直遇到 404 问题,求指导!
代码
import scrapy
from scrapy.http import Request, FormRequest
from scrapy.selector import Selector


class StackSpiderSpider(scrapy.Spider):
    """Log in to Stack Overflow and save the page returned after login.

    Flow: fetch the login page, read the hidden ``fkey``/``ssrc`` inputs,
    submit the login form, then write the response body to ``1.html``.

    The credentials default to the values below and can be overridden from
    the command line, e.g.
    ``scrapy crawl stack_spider -a login_email=... -a login_password=...``.
    """

    name = "stack_spider"
    start_urls = ['https://stackoverflow.com/']

    # Extra headers sent with the login POST.
    # - No "host" entry: forcing it to the static-asset CDN host does not
    #   match the URL being requested; the correct Host is derived from
    #   the request URL.
    # - "br" is not advertised in Accept-Encoding: decoding brotli needs
    #   optional support that this Scrapy version does not ship with.
    headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate",
        "Accept-Language": "en-US,en;q=0.5",
        "Connection": "keep-alive",
        "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:54.0) Gecko/20100101 Firefox/54.0",
    }

    # Default credentials; prefer passing real ones with ``-a`` instead of
    # committing them to source control.
    login_email = "1045608243@qq.com"
    login_password = "12345"

    def start_requests(self):
        """Start the crawl at the login page instead of ``start_urls``.

        The ``cookiejar`` meta key makes the cookies middleware keep the
        session cookies for this login flow in their own jar.
        """
        return [Request("https://stackoverflow.com/users/login",
                        meta={'cookiejar': 1},
                        callback=self.post_login)]

    def post_login(self, response):
        """Submit the login form found on the login page.

        Args:
            response: the downloaded login page.

        Returns:
            A one-element list with the form-submission request; its
            callback is ``after_login``.
        """
        sel = Selector(response)
        # Hidden anti-forgery fields of the form. ``extract_first`` avoids
        # an IndexError when one of them is absent from the page.
        fkey = sel.xpath('//input[@name="fkey"]/@value').extract_first(default="")
        ssrc = sel.xpath('//input[@name="ssrc"]/@value').extract_first(default="")
        self.logger.debug("fkey=%s ssrc=%s", fkey, ssrc)

        return [FormRequest.from_response(
            response,
            # The page contains more than one <form>. Without a selector,
            # from_response() uses the first one (the site search form),
            # which produced a GET to /search and the 404. Pick the form
            # that actually holds the password input.
            formxpath='//form[.//input[@name="password"]]',
            # Reuse the cookie jar of the login-page request.
            meta={'cookiejar': response.meta['cookiejar']},
            headers=self.headers,
            formdata={
                "fkey": fkey,
                "ssrc": ssrc,
                "email": self.login_email,
                "password": self.login_password,
                "oauth_version": "",
                "oauth_server": "",
                "openid_username": "",
                "openid_identifier": "",
            },
            callback=self.after_login,
            dont_filter=True,
        )]

    def after_login(self, response):
        """Write the body of the post-login response to ``1.html``."""
        filename = "1.html"
        with open(filename, 'wb') as fp:
            fp.write(response.body)
调试信息
2017-04-18 11:19:23 [scrapy.utils.log] INFO: Scrapy 1.3.3 started (bot: text5)
2017-04-18 11:19:23 [scrapy.utils.log] INFO: Overridden settings: {'NEWSPIDER_MO
DULE': 'text5.spiders', 'SPIDER_MODULES': ['text5.spiders'], 'BOT_NAME': 'text5'
}
2017-04-18 11:19:23 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.logstats.LogStats',
'scrapy.extensions.telnet.TelnetConsole',
'scrapy.extensions.corestats.CoreStats']
2017-04-18 11:19:24 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
'scrapy.downloadermiddlewares.retry.RetryMiddleware',
'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
'scrapy.downloadermiddlewares.redirect.RedirectMiddleware',
'scrapy.downloadermiddlewares.cookies.CookiesMiddleware',
'scrapy.downloadermiddlewares.stats.DownloaderStats']
2017-04-18 11:19:24 [scrapy.middleware] INFO: Enabled spider middlewares:
['scrapy.spidermiddlewares.httperror.HttpErrorMiddleware',
'scrapy.spidermiddlewares.offsite.OffsiteMiddleware',
'scrapy.spidermiddlewares.referer.RefererMiddleware',
'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware',
'scrapy.spidermiddlewares.depth.DepthMiddleware']
2017-04-18 11:19:24 [scrapy.middleware] INFO: Enabled item pipelines:
[]
2017-04-18 11:19:24 [scrapy.core.engine] INFO: Spider opened
2017-04-18 11:19:24 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pag
es/min), scraped 0 items (at 0 items/min)
2017-04-18 11:19:24 [scrapy.extensions.telnet] DEBUG: Telnet console listening o
n 127.0.0.1:6023
2017-04-18 11:19:24 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://stack
overflow.com/users/login> (referer: None)
1145f3f2e28e56c298bc28a1a735254b
2017-04-18 11:19:25 [scrapy.core.engine] DEBUG: Crawled (404) <GET https://stack
overflow.com/search?q=&ssrc=&openid_username=&oauth_server=&oauth_version=&fkey=
1145f3f2e28e56c298bc28a1a735254b&password=wanglihong1993&email=1067863906%40qq.c
om&openid_identifier=> (referer: https://stackoverflow.com/use...
2017-04-18 11:19:25 [scrapy.spidermiddlewares.httperror] INFO: Ignoring response
<404 https://stackoverflow.com/sea ...
auth_version=&fkey=1145f3f2e28e56c298bc28a1a735254b&password=wanglihong1993&emai
l=1067863906%40qq.com&openid_identifier=>: HTTP status code is not handled or not allowed
2017-04-18 11:19:25 [scrapy.core.engine] INFO: Closing spider (finished)
2017-04-18 11:19:25 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 881,
'downloader/request_count': 2,
'downloader/request_method_count/GET': 2,
'downloader/response_bytes': 12631,
'downloader/response_count': 2,
'downloader/response_status_count/200': 1,
'downloader/response_status_count/404': 1,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2017, 4, 18, 3, 19, 25, 143000),
'log_count/DEBUG': 3,
'log_count/INFO': 8,
'request_depth_max': 1,
'response_received_count': 2,
'scheduler/dequeued': 2,
'scheduler/dequeued/memory': 2,
'scheduler/enqueued': 2,
'scheduler/enqueued/memory': 2,
'start_time': datetime.datetime(2017, 4, 18, 3, 19, 24, 146000)}
2017-04-18 11:19:25 [scrapy.core.engine] INFO: Spider closed (finished)
Abang, kata laluan awak bocor