# -*- encoding: utf8 -*-
import urllib
import urllib2
import re
# Fetch one page of the "hot" listing and print the text found inside every
# <h2>...</h2> element of the returned HTML.
page = 1
url = u'http://www.qiushibaike.com/hot/page/' + str(page)
user_agent = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.112 Safari/537.36'
# Send a browser-like User-Agent header with the request.
headers = {'User-Agent': user_agent}
try:
    request = urllib2.Request(url, headers=headers)
    response = urllib2.urlopen(request)
    try:
        # Decode once at the I/O boundary so the regex works on unicode text.
        content = response.read().decode('utf-8')
    finally:
        # Always release the connection, even if read/decode fails.
        response.close()
except urllib2.URLError as e:
    # Report whichever of the status code / reason the error object carries.
    if hasattr(e, "code"):
        print(e.code)
    if hasattr(e, "reason"):
        print(e.reason)
else:
    # re.S lets '.' match newlines, so an <h2> spanning several lines matches.
    pattern = re.compile(r'<h2>(.*?)</h2>', re.S)
    items = re.findall(pattern, content)
    # Printing the list itself shows the repr of each element, i.e. \uXXXX
    # escapes.  Printing each unicode string on its own lets print encode it
    # for the output stream, so the actual characters are displayed.
    for item in items:
        print(item)
一个简单的爬虫,但无法输出中文,我估计是正则表达式的匹配问题,但不知道具体问题在哪.输出的都是[u'\u5c0f\u674e\u98de\u55b5', u'xiaofengxing', u'\u4e00\u65e5\uff01\u5c31\u662f24\u5c0f\u65f6', u'\u975e\u8981\u8f93\u540d\u5b57', u'\u6f47\u6e58\u58a8\u5170', u'\u6211\u4e0d\u60f3\u8bf4\u6211\u8fd8\u7231\u4f60', u'(\u7cd7\u540d\u662d\u8457)~\u96f7\u75af', u'\u597d\u60f3*', u'\u5f97\u4e00\u4eba\u5fc3\u4e0d\u518d\u76f8\u4eb2', u'\u6881\u5c71\u59d3\u6b66\u540d\u5927\u90ce', u'\u53ea\u8981\u4e00\u534a\u7684\u4f60', u'\u83ab\u7eb3\u7684\u4e16\u754c', u'\u8499\u9762\u8d85\u4eba\u4ed8\u5c0f\u65ed', u'\u4f60\u548b\u8fd9\u4e48\u9017\u5462', u'\u6740\u732a\u7684\u6731\u5e08\u5085', u'\u533f\u540d\u7528\u6237', u'\u91d1\u811a\u5370', u'\u5341\u4e8c\u6708\u4e09\u5341\u4e00', u'\u533f\u540d\u7528\u6237', u'<\u7cd7\u72af\u76d1\u72f1>~\u5165\u5e93']
으아악
이것을 시도해 보세요
print json.dumps(items, ensure_ascii=False)
크롤러는 스타일을 어떻게 처리합니까