python - scrapy爬取手机版微博weibo.cn模拟登录出现问题
阿神
阿神 2017-04-18 10:32:50
0
1
830

代码如下,不知道为什么一直不能成功登录

# -*- coding: utf-8 -*-
import scrapy
import re
import requests
#import urllib
from bs4 import BeautifulSoup

from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapy.loader import ItemLoader
from scrapy.loader.processors import MapCompose, Join
from scrapy.http import Request,FormRequest

from getweibo.items import InformationItem,TweetsItem

loginURL = "https://login.weibo.cn/login/"
#获得验证码等信息
def get_captchainfo(loginURL, timeout=10):
    """Fetch the login page and collect what is needed to fill in its form.

    The page is downloaded with ``requests`` (a blocking call), parsed with
    BeautifulSoup, the captcha image URL is printed, and the user is asked
    on stdin to type the captcha shown at that URL.

    Args:
        loginURL: address of the login page to download.
        timeout: seconds to wait for the server before giving up, so a
            stalled connection cannot block the caller forever.

    Returns:
        A tuple ``(captcha_input, password_name, vk, capId)``: the text the
        user typed, the ``name`` attribute of the password input, and the
        ``value`` attributes of the ``vk`` and ``capId`` inputs.

    Raises:
        ValueError: if the page lacks one of the expected form inputs or
            the captcha image.
        requests.exceptions.RequestException: if the download fails or
            times out.
    """
    html = requests.get(loginURL, timeout=timeout).content
    bs = BeautifulSoup(html, "lxml")

    def first_match(css):
        # bs.select() always returns a list; fail with a readable message
        # instead of a bare IndexError when the page layout is unexpected.
        found = bs.select(css)
        if not found:
            raise ValueError("login page has no element matching %r" % css)
        return found[0]

    password_name = first_match('input[type="password"]').get('name')
    vk = first_match('input[name="vk"]').get('value')
    capId = first_match('input[name="capId"]').get('value')

    captcha_tag = bs.find(
        "img", src=re.compile('http://weibo.cn/interface/f/ttt/captcha/'))
    if captcha_tag is None:
        raise ValueError("login page has no captcha image")
    captcha_img = captcha_tag.get('src')
    # Show the image address so the user can open it and read the captcha.
    print(captcha_img)

    # raw_input only exists on Python 2; fall back to input on Python 3.
    try:
        read_line = raw_input
    except NameError:
        read_line = input
    captcha_input = read_line("please input the captcha\n>")

    return (captcha_input, password_name, vk, capId)

class WeiboSpider(CrawlSpider):
    """Log in to weibo.cn, then crawl a profile page and its repost pages.

    Flow: ``start_requests`` downloads the login page, ``parse_login``
    submits the login form, and ``after_login`` schedules ``start_urls``.
    The first response for each start URL is handled by
    ``parse_start_url``; links matched by ``rules`` are followed, and
    repost pages are handled by ``parse_item``.

    Every request uses Scrapy's default cookie jar.  The ``cookiejar`` meta
    key is not sticky: a jar named only on the login requests is not used
    by requests that omit the key, so cookies set by the login response
    would never be sent while crawling.
    """

    name = 'weibo'
    allowed_domains = ['weibo.cn']
    # A single hard-coded profile for now; start_urls could later be read
    # from a file.
    start_urls = ['http://weibo.cn/dafendi']

    rules = (
        # Follow the links found inside the "pagelist" form.
        Rule(LinkExtractor(restrict_xpaths='//*[@id="pagelist"]/form/p/a')),
        # Hand every link whose href contains "repost" to parse_item.
        Rule(LinkExtractor(restrict_xpaths='//*[contains(@href,"repost")]'),
             callback='parse_item'),
    )

    # Headers sent with the login page request and the login form POST.
    headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "zh-CN,zh;q=0.8",
        "Connection": "keep-alive",
        "Content-Type":" application/x-www-form-urlencoded",
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36",
        "Referer": "https://login.weibo.cn/login/",
    }

    @staticmethod
    def _between_brackets(text):
        """Return the text inside the first ``[...]`` pair, or None.

        Returning None makes ``MapCompose`` drop the value, so link text
        without brackets is skipped instead of raising ``ValueError``.
        """
        found = re.search(r'\[([^\]]*)\]', text)
        return found.group(1) if found else None

    def start_requests(self):
        """Start on the login page instead of ``start_urls``."""
        return [
            Request(
                loginURL,
                headers=self.headers,
                callback=self.parse_login,
            )
        ]

    def parse_login(self, response):
        """Submit the first form of the login page with the credentials.

        ``get_captchainfo`` supplies the captcha typed by the user, the name
        of the password field and the ``vk`` / ``capId`` values.
        """
        self.logger.info('Preparing login')
        captcha = get_captchainfo(loginURL)
        self.logger.debug('captcha info: %r', captcha)

        return FormRequest.from_response(
            response,  # the login page fetched by start_requests
            method="POST",
            headers=self.headers,
            formdata={
                "mobile": "帐号",
                captcha[1]: "密码",
                "code": captcha[0],
                "remember": "on",
                # NOTE(review): this value is already percent-encoded and is
                # encoded again when the form is serialised -- confirm that
                # the server expects it in this form.
                "backurl": "http%3A%2F%2Fweibo.cn",
                "backtitle": u'手机新浪网',
                "tryCount": "",
                "vk": captcha[2],
                "capId": captcha[3],
                "submit": u'登录',
            },
            callback=self.after_login,
            dont_filter=True,
        )

    def after_login(self, response):
        """Schedule the start URLs once the login response has arrived.

        The requests carry no explicit callback, so they go through the
        spider's default callback, and no ``cookiejar`` key, so they share
        the jar that received the login cookies.
        """
        for url in self.start_urls:
            yield Request(url, dont_filter=True)

    def parse_start_url(self, response):
        """Build an ``InformationItem`` from the response to a start URL."""
        self.logger.debug('%r', response.xpath('/html').extract())

        l = ItemLoader(item=InformationItem(), response=response)

        # Load fields using XPath expressions.
        # The title minus its last three characters becomes the id
        # (presumably a fixed suffix after the account name -- confirm).
        l.add_xpath('id_', '//title/text()',
                    MapCompose(lambda title: title[0:len(title) - 3]))
        l.add_xpath('Info', '//span[contains(@class,"ctt")][2]/text()')
        # The counters are taken from between square brackets in the text.
        l.add_xpath('Num_Tweets', '//span[contains(@class,"tc")]/text()',
                    MapCompose(self._between_brackets))
        l.add_xpath('Num_Follows', '//a[contains(@href,"follow")]/text()',
                    MapCompose(self._between_brackets))
        l.add_xpath('Num_Fans', '//a[contains(@href,"fans")]/text()',
                    MapCompose(self._between_brackets))
        return l.load_item()

    def parse_item(self, response):
        """Build a ``TweetsItem`` holding the text of the "ctt" spans."""
        l = ItemLoader(item=TweetsItem(), response=response)
        l.add_xpath('Content', '//span[contains(@class,"ctt")]/text()')
        return l.load_item()

下边是settings.py的内容

# Do not consult robots.txt before crawling.
ROBOTSTXT_OBEY = False



HTTPERROR_ALLOWED_CODES = [302,]# pass 302 responses to spider callbacks instead of filtering them out as HTTP errors
REDIRECT_ENABLED = False # redirects are switched off, so a redirect response is not followed to its new address


# Delay between consecutive requests (seconds per the Scrapy settings reference).
DOWNLOAD_DELAY = 3

# Keep cookie handling on and log the Cookie / Set-Cookie headers exchanged.
COOKIES_ENABLED = True
COOKIES_DEBUG = True

下边是输出

2017-04-09 15:53:17 [scrapy] DEBUG: Sending cookies to: <POST https://login.weibo.cn/login/?rand=201282002&backURL=http%3A%2F%2Fweibo.cn&backTitle=%E6%89%8B%E6%9C%BA%E6%96%B0%E6%B5%AA%E7%BD%91&vt=4>
Cookie: _T_WM=6348fb8a523fe1bc486f14d1304cf0d2

2017-04-09 15:53:19 [scrapy] DEBUG: Received cookies from: <302 https://login.weibo.cn/login/?rand=201282002&backURL=http%3A%2F%2Fweibo.cn&backTitle=%E6%89%8B%E6%9C%BA%E6%96%B0%E6%B5%AA%E7%BD%91&vt=4>
Set-Cookie: WEIBOCN_FROM=deleted; expires=Thu, 01-Jan-1970 00:00:01 GMT; path=/; domain=.weibo.cn

Set-Cookie: SUB=_2A2517Zg9DeRhGeVG61ER8yrEwzyIHXVXETh1rDV6PUJbkdAKLRXgkW0wSZc8S6dp1d-NlyAraSqa-1-_0Q..; expires=Tue, 09-May-2017 07:53:17 GMT; path=/; domain=.weibo.cn; httponly

Set-Cookie: gsid_CTandWM=4uuCcdef1lRXUEnMtsgL1fXlgec; expires=Tue, 09-May-2017 07:53:19 GMT; path=/; domain=.weibo.cn; httponly

2017-04-09 15:53:19 [scrapy] DEBUG: Crawled (302) <POST https://login.weibo.cn/login/?rand=201282002&backURL=http%3A%2F%2Fweibo.cn&backTitle=%E6%89%8B%E6%9C%BA%E6%96%B0%E6%B5%AA%E7%BD%91&vt=4> (referer: https://login.weibo.cn/login/)
2017-04-09 15:53:20 [scrapy] DEBUG: Received cookies from: <200 http://weibo.cn/dafendi>
Set-Cookie: _T_WM=80e15f38a0dfb65ea7bbcd00ebcaf1c0; expires=Tue, 09-May-2017 07:53:19 GMT; path=/; domain=.weibo.cn; httponly

Set-Cookie: WEIBOCN_FROM=deleted; expires=Thu, 01-Jan-1970 00:00:01 GMT; path=/; domain=.weibo.cn

2017-04-09 15:53:20 [scrapy] DEBUG: Crawled (200) <GET http://weibo.cn/dafendi> (referer: https://login.weibo.cn/login/?rand=201282002&backURL=http%3A%2F%2Fweibo.cn&backTitle=%E6%89%8B%E6%9C%BA%E6%96%B0%E6%B5%AA%E7%BD%91&vt=4)
2017-04-09 15:53:20 [scrapy] DEBUG: Scraped from <200 http://weibo.cn/dafendi>
{'Info': [u'\u8ba4\u8bc1\uff1a\u77e5\u540d\u5e7d\u9ed8\u535a\u4e3b \u5fae\u535a\u7b7e\u7ea6\u81ea\u5a92\u4f53'],
 'Num_Fans': [u'2055326'],
 'Num_Follows': [u'891'],
 'Num_Tweets': [u'1958'],
 'id_': [u'\u7cbe\u5206\u541b']}
2017-04-09 15:53:20 [scrapy] DEBUG: Sending cookies to: <GET http://weibo.cn/repost/EDsDTFqfJ?rl=0&uid=2626948743>
Cookie: _T_WM=80e15f38a0dfb65ea7bbcd00ebcaf1c0

2017-04-09 15:53:20 [scrapy] DEBUG: Sending cookies to: <GET http://weibo.cn/repost/EDxAwrBrG?rl=0&uid=2626948743>
Cookie: _T_WM=80e15f38a0dfb65ea7bbcd00ebcaf1c0

2017-04-09 15:53:20 [scrapy] DEBUG: Sending cookies to: <GET http://weibo.cn/repost/EDBmajRBl?rl=0&uid=2626948743>
Cookie: _T_WM=80e15f38a0dfb65ea7bbcd00ebcaf1c0

2017-04-09 15:53:20 [scrapy] DEBUG: Sending cookies to: <GET http://weibo.cn/repost/CsN9LnQiG?rl=0&uid=2626948743>
Cookie: _T_WM=80e15f38a0dfb65ea7bbcd00ebcaf1c0

2017-04-09 15:53:24 [scrapy] DEBUG: Received cookies from: <200 http://weibo.cn/repost/EDsDTFqfJ?rl=0&uid=2626948743>
Set-Cookie: WEIBOCN_FROM=deleted; expires=Thu, 01-Jan-1970 00:00:01 GMT; path=/; domain=.weibo.cn

2017-04-09 15:53:24 [scrapy] DEBUG: Crawled (200) <GET http://weibo.cn/repost/EDsDTFqfJ?rl=0&uid=2626948743> (referer: http://weibo.cn/dafendi)
2017-04-09 15:53:24 [scrapy] DEBUG: Scraped from <200 http://weibo.cn/repost/EDsDTFqfJ?rl=0&uid=2626948743>
{'Content': [u':',
             u' \u5047\u5982\u4efb\u4f55\u4e8b\u90fd\u80fd\u6210\u4e3a\u804c\u4e1a\uff0c\u4f60\u4f1a\u9009\u62e9\u4ec0\u4e48\u4f5c\u4e3a\u804c\u4e1a\uff1f \u200b\u200b\u200b']}
2017-04-09 15:53:28 [scrapy] DEBUG: Received cookies from: <200 http://weibo.cn/repost/EDxAwrBrG?rl=0&uid=2626948743>
Set-Cookie: WEIBOCN_FROM=deleted; expires=Thu, 01-Jan-1970 00:00:01 GMT; path=/; domain=.weibo.cn

2017-04-09 15:53:28 [scrapy] DEBUG: Crawled (200) <GET http://weibo.cn/repost/EDxAwrBrG?rl=0&uid=2626948743> (referer: http://weibo.cn/dafendi)
2017-04-09 15:53:28 [scrapy] DEBUG: Scraped from <200 http://weibo.cn/repost/EDxAwrBrG?rl=0&uid=2626948743>
{'Content': [u'\u7279\u522b\u7684\u751f\u65e5\u793c\u7269\u3002 \u200b\u200b\u200b']}
2017-04-09 15:53:32 [scrapy] DEBUG: Received cookies from: <200 http://weibo.cn/repost/EDBmajRBl?rl=0&uid=2626948743>
Set-Cookie: WEIBOCN_FROM=deleted; expires=Thu, 01-Jan-1970 00:00:01 GMT; path=/; domain=.weibo.cn

2017-04-09 15:53:32 [scrapy] DEBUG: Crawled (200) <GET http://weibo.cn/repost/EDBmajRBl?rl=0&uid=2626948743> (referer: http://weibo.cn/dafendi)
2017-04-09 15:53:32 [scrapy] DEBUG: Scraped from <200 http://weibo.cn/repost/EDBmajRBl?rl=0&uid=2626948743>
{'Content': [u'\u7231\u7b11\u7684\u5973\u5b69\u5b50\uff0c\u8fd0\u6c14\u4e00\u5b9a\u4e0d\u4f1a\u592a\u597d\u2026\u2026',
             u' \u200b\u200b\u200b']}
2017-04-09 15:53:36 [scrapy] DEBUG: Received cookies from: <200 http://weibo.cn/repost/CsN9LnQiG?rl=0&uid=2626948743>
Set-Cookie: WEIBOCN_FROM=deleted; expires=Thu, 01-Jan-1970 00:00:01 GMT; path=/; domain=.weibo.cn

2017-04-09 15:53:36 [scrapy] DEBUG: Crawled (200) <GET http://weibo.cn/repost/CsN9LnQiG?rl=0&uid=2626948743> (referer: http://weibo.cn/dafendi)
2017-04-09 15:53:36 [scrapy] DEBUG: Scraped from <200 http://weibo.cn/repost/CsN9LnQiG?rl=0&uid=2626948743>
{'Content': [u':\u4e00\u4e2a\u957f\u5fae\u535a\u5408\u96c6\uff0c\u5927\u5bb6\u65e0\u804a\u53c8\u6ca1\u770b\u8fc7\u7684\u8bdd\u53ef\u4ee5\u770b\u770b[\u7f9e\u55d2\u55d2] \u200b\u200b\u200b']}
2017-04-09 15:53:36 [scrapy] INFO: Closing spider (finished)
2017-04-09 15:53:36 [scrapy] INFO: Stored json feed (5 items) in: wanghongmingdan.json
2017-04-09 15:53:36 [scrapy] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 3029,
 'downloader/request_count': 7,
 'downloader/request_method_count/GET': 6,
 'downloader/request_method_count/POST': 1,
 'downloader/response_bytes': 22746,
 'downloader/response_count': 7,
 'downloader/response_status_count/200': 6,
 'downloader/response_status_count/302': 1,
 'finish_reason': 'finished',
 'finish_time': datetime.datetime(2017, 4, 9, 7, 53, 36, 596076),
 'item_scraped_count': 5,
 'log_count/DEBUG': 27,
 'log_count/INFO': 8,
 'log_count/WARNING': 2,
 'request_depth_max': 3,
 'response_received_count': 7,
 'scheduler/dequeued': 7,
 'scheduler/dequeued/memory': 7,
 'scheduler/enqueued': 7,
 'scheduler/enqueued/memory': 7,
 'start_time': datetime.datetime(2017, 4, 9, 7, 53, 2, 180831)}
2017-04-09 15:53:36 [scrapy] INFO: Spider closed (finished)

2017-04-09 20:11:50 [scrapy] DEBUG: Redirecting (302) to <GET http://weibo.cn/crossDomain/?g=4uegcdef1d93rkj4S3ZomfXlgec&t=1491739909&m=9144&r=&u=http%3A%2F%2Fweibo.cn%3Fgsid%3D4uegcdef1d93rkj4S3ZomfXlgec%26PHPSESSID%3D%26vt%3D4&cross=1&st=ST-MzgwMzAzNDg4MA==-1491739909-tc-27ED8C8D7528C9185E75F7986B8050B7-1,ST-MzgwMzAzNDg4MA==-1491739909-tc-BED83CC16AC311D2BBA234E8F08BBD39-1> from <POST https://login.weibo.cn/login/?rand=842328789&backURL=http%3A%2F%2Fweibo.cn&backTitle=%E6%89%8B%E6%9C%BA%E6%96%B0%E6%B5%AA%E7%BD%91&vt=4>
2017-04-09 20:11:50 [scrapy] DEBUG: Redirecting (meta refresh) to <GET http://weibo.cn/> from <GET http://weibo.cn/crossDomain/?g=4uegcdef1d93rkj4S3ZomfXlgec&t=1491739909&m=9144&r=&u=http%3A%2F%2Fweibo.cn%3Fgsid%3D4uegcdef1d93rkj4S3ZomfXlgec%26PHPSESSID%3D%26vt%3D4&cross=1&st=ST-MzgwMzAzNDg4MA==-1491739909-tc-27ED8C8D7528C9185E75F7986B8050B7-1,ST-MzgwMzAzNDg4MA==-1491739909-tc-BED83CC16AC311D2BBA234E8F08BBD39-1>
阿神
阿神

闭关修行中......

모든 응답(1)
PHPzhong

시뮬레이션 로그인을 할 때 패킷 캡처 소프트웨어를 열어서 패킷을 캡처하고 디버깅하는 것이 좋습니다. 이를 통해 프로그램으로 대상 서버에 요청했을 때 반환되는 내용과 수동으로 서버에 요청했을 때 반환되는 내용에 차이가 있는지 알 수 있습니다. 저도 웨이보 데이터를 수집한 경험이 있는데 방금 귀하의 코드를 살펴보니 이전에 작성한 웨이보 시뮬레이션 로그인과 다소 다른 것을 발견했습니다. 방금 확인한 코드인데 여전히 작동합니다. 두 코드의 차이점을 다시 비교해보니 웨이보 WAP 버전을 캡처하셨음에도 불구하고 UA가 PC측 UA를 사용하고 있어서 인증코드가 뜨고 제출한 매개변수도 다른 것으로 나타났습니다. 코드의 오류는 에 수동으로 액세스해야 하는 단계 점프가 있다는 것입니다. 이를 보려면 패킷을 캡처할 수 있습니다. 이제 우리는 Weibo wap 측의 크롤링 방지에 더 많은 관심을 기울이기 시작한 것 같습니다. Weibo에 대한 시뮬레이션된 로그인을 더 잘 이해하려면 이 기사를 읽어보세요. 현재로서는 이 방법이 가능합니다

최신 다운로드
더>
웹 효과
웹사이트 소스 코드
웹사이트 자료
프론트엔드 템플릿
회사 소개 부인 성명 Sitemap
PHP 중국어 웹사이트:공공복지 온라인 PHP 교육,PHP 학습자의 빠른 성장을 도와주세요!