改写了网上一个爬取ip代理并验证的代码,验证不是去登陆度娘,而是换了ip查询网站,原以为会用爬取的代理IP,实际上ip并没有变化,烦请大牛看下什么原因,附上运行结果:
{'https': u'183.221.50.139:8123'} 您的IP:[218.88.XX.XX] 来自:四川省成都市 电信操作系统:Unknown浏览器:Unknown 0.0python-requests/2.4.0 CPython/2.7.3 Windows/7
{'https': u'116.236.216.116:8080'} 您的IP:[218.88.XX.XX] 来自:四川省成都市 电信操作系统:Unknown浏览器:Unknown 0.0python-requests/2.4.0 CPython/2.7.3 Windows/7
{'https': u'183.221.160.44:8123'} 您的IP:[218.88.XX.XX] 来自:四川省成都市 电信操作系统:Unknown浏览器:Unknown 0.0python-requests/2.4.0 CPython/2.7.3 Windows/7
代码如下
import requests
from lxml import etree
from bs4 import BeautifulSoup as bs
import Queue
import threading
import time
import datetime
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
# write proxy
def writeproxy(porxyinfo):
writefile = file('porxyinfo.txt','a+')
writefile.write(porxyinfo)
writefile.write('\n')
writefile.close()
# return page code
def GetPageText(url):
r = requests.get(url)
return r.text
# return post urllist
def GetPostUrl(source):
posturllist = []
iplist = bs(source).find("table",{"id":"ip_list"}).findAll("tr")[1:]
for item in iplist:
getinfo = item.findAll("td")
ip = getinfo[1].get_text(strip='\r\n')
port = getinfo[2].get_text(strip='\r\n')
address = getinfo[3].get_text(strip='\r\n')
type = getinfo[5].get_text(strip='\r\n')
posturllist.append(type.lower()+'#'+ip+':'+port)
return posturllist
def Checkproxy(porxyinfo):
proxies = {}
if porxyinfo.split('#')[0] == 'http':
proxies['http'] = porxyinfo.split('#')[1]
else:
proxies['https'] = porxyinfo.split('#')[1]
r = requests.get("http://ip.chinaz.com/", proxies=proxies,timeout=3)
if r:
print proxies, bs(requests.get('http://ip.chinaz.com/').content).find("span",{"class":"info3"}).get_text(strip='\r\n')
# writeproxy(porxyinfo)
else:
print 'No'
def getproxyid():
start = time.time()
queue = Queue.Queue()
class ThreadUrl(threading.Thread):
"""Threaded Url Grab"""
def __init__(self, queue):
threading.Thread.__init__(self)
self.queue = queue
global mutex
def run(self):
while True:
porxyinfo = self.queue.get()
try:
mutex.acquire(5)
try:
Checkproxy(porxyinfo)
except:
time.sleep(0.15)
mutex.release()
self.queue.task_done()
continue
time.sleep(0.15)
mutex.release()
self.queue.task_done()
except Exception,e:
time.sleep(0.15)
self.queue.task_done()
pagenum =5
targets = ['http://www.xici.net.co/nn/%d'%page for page in range(1,pagenum+1)]
targets += ['http://www.xici.net.co/wn/%d'%page for page in range(1,pagenum+1)]
for proxyurl in targets:
try:
PageText = GetPageText(proxyurl)
except Exception,e:
print e
break
PostUrlList = GetPostUrl(PageText)
mutex = threading.Lock()
for i in range(5):
t = ThreadUrl(queue)
t.setDaemon(True)
try:
t.start()
except:
pass
for host in PostUrlList:
queue.put(host)
queue.join()
print "Elapsed Time: %s" % (time.time() - start)
if __name__ == '__main__':
getproxyid()
What are you doing? ? Use a proxy to request once, and if successful, then request again without using a proxy, and check the return data this time ? Shouldn't you use
bs(r.content)....
?PS: Use curl’s User-Agent to access http://ip.cn, without parsing, the results will be output directly.