python - 抓取新浪微博返回页面内容排版混乱
高洛峰
高洛峰 2017-04-17 17:20:22
0
0
392

我在抓取新浪微博一个“科技”相关的人物微博时,已经完成登录,get页面返回的数据比较混乱,用beautifulsoup.prettify()问题依旧。还请大神们帮忙看看是不是我的处理方式有问题。
本人菜鸟一枚。

贴上代码:

# coding=utf-8

import sys
import urllib
import urllib2
import cookielib
import base64
import re
import json
import hashlib
import os
import rsa
import binascii
import time
import requests
import bs4
import redis
import pdb
import HTMLParser

# Python 2 only: re-import sys to restore setdefaultencoding(), so implicit
# str<->unicode conversions use UTF-8 instead of ASCII.
reload(sys)
sys.setdefaultencoding('utf-8')

# Redis connection used as work queues for crawled users/fans.
r=redis.Redis(host='localhost',port=6379,db=0)
# Reset both pools and seed each with a sentinel entry.
r.delete('user_pool')
r.lpush('user_pool','fuckyouasshole')
r.delete('fans_pool')
r.sadd('fans_pool','fuckyouasshole')

# Shared HTTP session (carries the login cookies across requests) and the
# output file for the fetched page.
# NOTE(review): the name `file` shadows the Python 2 builtin of the same name.
weiboSession=requests.Session()
file = open('test.txt','w')

# Query parameters for the SSO prelogin endpoint.
# NOTE(review): currently unused — get_servertime() hardcodes the full URL
# (and with client v1.4.18 rather than the v1.4.5 listed here).
parameters = {
'entry': 'weibo',
'callback': 'sinaSSOController.preloginCallBack',
'su': 'bGFpcmVuMjAwNg%3D%3D',
'rsakt': 'mod',
'checkpin': '1',
'client': 'ssologin.js(v1.4.5)',
'_': '1457327347813'
}
# POST body template for the SSO login request; the dynamic fields
# ('su', 'sp', 'servertime', 'nonce', 'rsakv') are filled in by login().
postdata = {
'entry': 'weibo',
'gateway': '1',
'from': '',
'savestate': '7',
'useticket': '1',
'pagerefer': 'http%3A%2F%2Flogin.sina.com.cn%2Fsso%2Flogout.php%3Fentry%3Dminiblog%26r%3Dhttp%253A%252F%252Fweibo.com%252Flogout.php%253Fbackurl%253D%25252F',
'vsnf': '1',
'su': '',
'service': 'miniblog',
'servertime': '',
'nonce': '',
'pwencode': 'rsa2',
'rsakv': '',
'sp': '',
'encoding': 'UTF-8',
'prelt': '147',
'url': 'http://www.weibo.com/ajaxlogin.php?framelogin=1&callback=parent.sinaSSOController.feedBackUrlCallBack&returntype=META'
}
# Browser-like headers; 'Accept-Encoding' deliberately omits gzip so the raw
# response body is easier to inspect.
headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux i686; rv:8.0) Gecko/20100101 Firefox/8.0',
'Accept-Encoding':'deflate, sdch'}
def get_servertime():

url = 'http://login.sina.com.cn/sso/prelogin.php?entry=weibo&callback=sinaSSOController.preloginCallBack&su=bGFpcmVuMjAwNg%3D%3D&rsakt=mod&checkpin=1&client=ssologin.js(v1.4.18)&_=1457327347813'
#data = urllib2.urlopen(url).read()
data=weiboSession.get(url).content
p = re.compile('\((.*)\)')
try:
    json_data = p.search(data).group(1)
    data = json.loads(json_data)
    servertime = str(data['servertime'])
    nonce = data['nonce']
    pubkey = data['pubkey']
    rsakv = data['rsakv']
    
    return servertime, nonce, pubkey, rsakv
except:
    print 'Get severtime error!'
    return None

def get_pwd(pwd, servertime, nonce, pubkey):
    """RSA-encrypt the password exactly the way Weibo's ssologin.js does.

    The plaintext is "<servertime>\t<nonce>\n<password>" (the layout comes
    from the site's JS encryption routine); the ciphertext is returned as a
    hex string for the 'sp' form field.
    """
    modulus = int(pubkey, 16)
    public_key = rsa.PublicKey(modulus, 65537)  # fixed exponent, per the JS
    plaintext = '%s\t%s\n%s' % (str(servertime), str(nonce), str(pwd))
    ciphertext = rsa.encrypt(plaintext, public_key)
    # Hex-encode the raw ciphertext bytes.
    return binascii.b2a_hex(ciphertext)

def get_user(username):
    """Encode the account name for the 'su' field: URL-quote, then base64."""
    quoted = urllib.quote(username)
    # base64.encodestring appends a trailing newline; drop it.
    return base64.encodestring(quoted)[:-1]

def login(username, pwd):

url = 'http://login.sina.com.cn/sso/login.php?client=ssologin.js(v1.4.5)'
try:
    servertime, nonce, pubkey, rsakv = get_servertime()
except:
    return
global postdata

postdata['nonce'] = nonce
postdata['rsakv'] = rsakv
postdata['su'] = get_user(username)
postdata['sp'] = get_pwd(pwd, servertime, nonce, pubkey)
postdata['servertime'] = time.time()
result=weiboSession.post(url,headers=headers,data=postdata)
text = result.content
p = re.compile('location\.replace\(\'(.*?)\'\)')
try:
    login_url = p.search(text).group(1)
    response=weiboSession.get(login_url,headers=headers)
    print u"登录成功!"    
except:
    print 'Login error!'

# Placeholder credentials -- replace with a real account and password.
login('@163.com','**')

for i in range(0,1):

while(True):
    try:
        if i==0:
            response=weiboSession.get('http://d.weibo.com/1087030002_2975_2009_0#',headers=headers)
        else:
            response=weiboSession.get('http://d.weibo.com/1087030002_2975_2009_0?page=%d#Pl_Core_F4RightUserList__4'%i,headers=headers)
        s=response.content.decode("string_escape")
        if re.search(r'<li class=(.*?)li>',s):
            print re.search(r'<li class=\"follow_item S_line2\">.*?<\\\/li>',s).group(0)
        soup_page=bs4.BeautifulSoup(s,"html.parser")#,
        file.write(soup_page.prettify())
        file.close()
        #pdb.set_trace()
        for userlist in soup_page.find('ul',class_='follow_list').find_all('li'):
            user_url=userlist.dl.dd.p.a['href']
            #print user_url
            r.lpush('user_pool',user_url)
        break
    except Exception,e:
        print Exception,':',e
        #time.sleep(10)
        pass

print r.rpop('user_pool')

file记录的内容排版非常混乱,导致用
soup_page.find('ul',class_='follow_list').find_all('li') 没法找到follow_list。提示:'NoneType' object has no attribute 'find_all'

高洛峰
高洛峰

拥有18年软件开发和IT教学经验。曾任多家上市公司技术总监、架构师、项目经理、高级软件工程师等职务。 网络人气名人讲师,...

répondre à tous(0)
Derniers téléchargements
Plus>
effets Web
Code source du site Web
Matériel du site Web
Modèle frontal
À propos de nous Clause de non-responsabilité Sitemap
Site Web PHP chinois:Formation PHP en ligne sur le bien-être public,Aidez les apprenants PHP à grandir rapidement!