如何使用Python爬蟲來進行JS載入資料網頁的爬取-js教程-PHP中文網

首頁

web前端

js教程

如何使用Python爬蟲來進行JS載入資料網頁的爬取

php中世界最好的语言

Mar 06, 2018 am 11:39 AM

javascript python 使用

這次帶給大家如何使用Python爬蟲來進行JS載入資料網頁的爬取，使用Python爬蟲來進行JS載入資料網頁爬取的注意事項有哪些，以下就是實戰案例，一起來看一下。

例如簡書（見圖 Paste_Image.png）。我們來寫個程式，爬取簡書網站隨便一個作者的所有文章，再對其所有文章進行分詞統計。程式運行統計的結果見文章：《我統計了彭小六簡書360篇文章中使用的詞語》。需要的 Python 包（包名：作用）如下：selenium——用於和 phantomjs 合作模擬瀏覽器訪問網頁；lxml——用於對 html 頁面的解析、提取數據；jieba——用於對文章正文分詞；tld——解析 url，例如提取 domain。還需要下載 phantomjs，與 selenium 配合使用（見圖 Paste_Image.png）。

我們來寫個程式,爬取簡書網站隨便一個作者的所有文章,再對其所有文章進行分詞統計
程式運行統計的結果見文章:
我統計了彭小六簡書360篇文章中使用的詞語

需要的Python包

包名 作用

selenium 用於和phantomjs合作模擬瀏覽器存取網頁

lxml 用於對html頁面的解析,提取資料

jieba 用於對文章正文分詞

tld 解析url，例如提取domain

另外也需要下載phantomjs；selenium配合phantomjs的使用方式在程式碼中有體現。

下載位址: http://phantomjs.org/

下面的程式碼使用檔案而不是資料庫來儲存資料，所以程式碼量比較多，但其中核心的程式碼其實並不多。

直接上程式碼

# -*-coding:utf-8-*- 
import json 
import os, sys 
from random import randint 
from collections import Counter 
import jieba 
from lxml import etree 
from selenium import webdriver 
import time 
from tld import get_tld 
# Absolute directory containing this script; every data directory the crawler
# creates (post-list, post-html, post-data, result) is placed below it.
path = os.path.abspath(os.path.dirname(__file__))
class Spider():
    """Crawl all articles of one Jianshu author, parse them and count words.

    The work is split into four stages that hand data to each other through
    files stored below the module-level ``path`` directory:
    ``post_list_page`` -> ``posts_html`` -> ``page_parsing`` -> ``statistics``.
    All files are read and written as UTF-8.
    """

    # The number of scroll rounds is the author's post total divided by this
    # value; presumably the list page appends 9 posts per scroll -- confirm
    # against the site.
    POSTS_PER_SCROLL = 9

    def __init__(self, start_url):
        """Derive all storage locations from the author's list-page URL.

        Data is kept in plain files rather than a database, so every storage
        path has to be initialised here. Users of this program may replace
        the file storage with a database (a NoSQL store is suggested).

        :param start_url: the author's article list page, for example
            http://www.jianshu.com/u/65fd4e5d930d
        """
        self.start_url = start_url
        res = get_tld(self.start_url, as_object=True, fix_protocol=True)
        # NOTE(review): the meaning of ``res.tld`` appears to differ between
        # releases of the ``tld`` package (registered domain vs. bare suffix);
        # confirm this still yields the full host name, e.g. "www.jianshu.com".
        self.domain = "{}.{}".format(res.subdomain, res.tld)
        # Tolerate a trailing slash so the user id is never the empty string.
        self.user_id = self.start_url.rstrip("/").split("/")[-1]

        # HTML of the author's article list page.
        post_list_dir = '{}/post-list'.format(path)
        self.post_lists_html = '{}/post_list_{}.html'.format(post_list_dir, self.user_id)
        # URLs of all the author's articles, one per line.
        self.post_lists_urls = '{}/urls_{}.dat'.format(post_list_dir, self.user_id)
        # Raw HTML of every article.
        self.posts_html_dir = '{}/post-html/{}'.format(path, self.user_id)
        # Parsed content of every article.
        self.posts_data_dir = '{}/post-data/{}'.format(path, self.user_id)
        # Statistics output.
        self.result_dir = '{}/result'.format(path)
        self.executable_path = '{}/phantomjs-2.1.1-linux-x86_64/bin/phantomjs'.format(path)

        for directory in (self.posts_html_dir, self.posts_data_dir,
                          post_list_dir, self.result_dir):
            if not os.path.exists(directory):
                os.makedirs(directory)

        # Free proxy addresses found on the internet.
        self.ips = ['61.167.222.17:808', '58.212.121.72:8998',
                    '111.1.3.36:8000', '125.117.133.74:9000']

    def _assign_random_proxy(self, driver):
        """Pick a random entry of ``self.ips``, set it on *driver*, return it."""
        ip = self.ips[randint(0, len(self.ips) - 1)]
        # NOTE(review): this only sets a Python attribute on the driver object;
        # PhantomJS proxies are normally configured through ``service_args``
        # when the driver is created -- confirm the proxy is actually used.
        driver.http_proxy = ip
        return ip

    @staticmethod
    def _post_id_from_path(file_path):
        """Return the post id from a saved page named ``<title>_<id>.html``."""
        # splitext removes exactly the extension; str.strip(".html") would
        # also eat any leading/trailing 'h', 't', 'm', 'l' or '.' of the id.
        stem = os.path.splitext(os.path.basename(file_path))[0]
        return stem.split("_")[-1]

    @staticmethod
    def _accumulate(totals, word_counts):
        """Fold one document's ``(word, count)`` pairs into *totals*.

        Words shorter than two characters are ignored. For every other word,
        ``totals[word]["cnt"]`` grows by its count and
        ``totals[word]["post_cnt"]`` by one.

        :return: how many words of this document were counted.
        """
        counted = 0
        for word, cnt in word_counts:
            if len(word) < 2:
                continue
            entry = totals.setdefault(word, {"cnt": 0, "post_cnt": 0})
            entry["cnt"] += cnt
            entry["post_cnt"] += 1
            counted += 1
        return counted

    @staticmethod
    def _write_table(file_path, header, rows):
        """Write *header* and then every row of *rows*, one per line."""
        with open(file_path, "w", encoding="utf-8") as wf:
            wf.write(header + "\n")
            wf.writelines(row + "\n" for row in rows)

    def _write_word_tables(self, prefix, totals):
        """Write two rankings of *totals*: by use count and by article count."""
        rankings = (
            ("use-num", "cnt", "post_cnt", "| 分词 | 使用次数 | 使用的文章数量|"),
            ("post-num", "post_cnt", "cnt", "| 分词 | 使用的文章数量 | 使用次数 |"),
        )
        for suffix, primary, secondary, header in rankings:
            ranked = sorted(totals.items(), key=lambda d: d[1][primary], reverse=True)
            self._write_table(
                "{}/{}_statis_sum_{}_{}.dat".format(
                    self.result_dir, prefix, suffix, self.user_id),
                header,
                ("| {} | {} | {}|".format(word, stat[primary], stat[secondary])
                 for word, stat in ranked),
            )

    def post_list_page(self):
        """Fetch the article list page; save its HTML and all article URLs.

        Exits the process via ``sys.exit()`` when the post total cannot be
        located on the page.
        """
        obj = webdriver.PhantomJS(executable_path=self.executable_path)
        try:
            obj.set_page_load_timeout(30)
            obj.maximize_window()
            self._assign_random_proxy(obj)
            obj.get(self.start_url)

            # Total number of articles.
            sel = etree.HTML(obj.page_source)
            r = sel.xpath("//div[@class='main-top']//div[@class='info']//li[3]//p//text()")
            if not r:
                print("[Error] 提取文章总书的xpath不正确")
                sys.exit()
            crawl_post_n = int(r[0])

            # Keep scrolling to the bottom so the page loads further posts.
            # Integer division keeps the round count finite on Python 3.
            for _ in range(crawl_post_n // self.POSTS_PER_SCROLL):
                time.sleep(randint(2, 5))
                obj.execute_script("var q=document.body.scrollTop=100000")
            page_source = obj.page_source
        finally:
            # Always shut the headless browser down, even on errors/exit.
            obj.quit()

        # Save the list page HTML (could go to a database instead).
        with open(self.post_lists_html, "w", encoding="utf-8") as of:
            of.write(page_source)

        # Also extract and save every article link, one URL per line.
        results = etree.HTML(page_source).xpath(
            "//div[@id='list-container']//li//a[@class='title']/@href")
        with open(self.post_lists_urls, "w", encoding="utf-8") as of:
            for result in results:
                of.write("http://{}{}\n".format(self.domain, result.strip()))

    def posts_html(self):
        """Download the HTML of every URL listed by ``post_list_page``."""
        with open(self.post_lists_urls, encoding="utf-8") as of:
            urls = [line.strip() for line in of if line.strip()]

        obj = webdriver.PhantomJS(executable_path=self.executable_path)
        try:
            obj.set_page_load_timeout(10)
            obj.maximize_window()
            for url in urls:
                ip = self._assign_random_proxy(obj)
                print("代理ip:{}".format(ip))
                print("网页:{}".format(url))
                try:
                    obj.get(url)
                except Exception:
                    # Best effort: report and go on (Ctrl-C still interrupts).
                    print("Error:{}".format(url))
                post_id = url.split("/")[-1]
                # A load error may still leave usable content, so saving is
                # attempted anyway -- but only if the browser really is on
                # this post; otherwise page_source is the previous article.
                if post_id in obj.current_url:
                    # '/' in a title would be taken as a directory separator.
                    title = obj.title.replace("/", "_")
                    html_file = "{}/{}_{}.html".format(self.posts_html_dir, title, post_id)
                    with open(html_file, "w", encoding="utf-8") as of:
                        of.write(obj.page_source)
                time.sleep(randint(1, 5))
        finally:
            obj.quit()

    def page_parsing(self):
        """Parse every saved article page into a JSON document.

        One ``<post_id>.json`` file is written to ``self.posts_data_dir`` for
        each regular file found in ``self.posts_html_dir``.
        """
        # Rules where only the first match is kept.
        xpath_rule_0 = {
            "author": "//div[@class='author']//span[@class='name']//text()",  # author name
            "author_tag": "//div[@class='author']//span[@class='tag']//text()",  # author tag
            "postdate": "//div[@class='author']//span[@class='publish-time']//text()",  # publish time
            "word_num": "//div[@class='author']//span[@class='wordage']//text()",  # word count
            "notebook": "//div[@class='show-foot']//a[@class='notebook']/span/text()",  # notebook of the post
            "title": "//div[@class='article']/h1[@class='title']//text()",  # post title
        }
        # Rules where all matches are joined into one string.
        xpath_rule_all_tostr = {
            "content": "//div[@class='show-content']//text()",  # body text
        }
        # Rules where all matches are kept as a list.
        xpath_rule_all = {
            "collection": "//div[@class='include-collection']//a[@class='item']//text()",  # collections
        }

        # Walk over every saved article page.
        for name in os.listdir(self.posts_html_dir):
            html_file = "{}/{}".format(self.posts_html_dir, name)
            if not os.path.isfile(html_file):
                continue
            with open(html_file, encoding="utf-8") as of:
                sel = etree.HTML(of.read())

            post_id = self._post_id_from_path(html_file)
            doc = {'url': 'http://{}/p/{}'.format(self.domain, post_id)}
            for k, rule in xpath_rule_0.items():
                results = sel.xpath(rule)
                doc[k] = results[0] if results else None
            for k, rule in xpath_rule_all_tostr.items():
                results = sel.xpath(rule)
                # Whitespace-only text nodes are dropped before joining.
                doc[k] = "".join(r for r in results if r.strip()) if results else None
            for k, rule in xpath_rule_all.items():
                results = sel.xpath(rule)
                doc[k] = results if results else None

            if doc["word_num"]:
                doc["word_num"] = int(doc["word_num"].strip('字数').strip())
            else:
                doc["word_num"] = 0

            # Save to a file (could go to a database instead).
            with open("{}/{}.json".format(self.posts_data_dir, post_id), "w",
                      encoding="utf-8") as of:
                of.write(json.dumps(doc))

    def statistics(self):
        """Segment every parsed article with jieba and write word statistics.

        Produces, in ``self.result_dir``: one table of counted words per
        article, and for both article bodies and titles a ranking by use count
        and a ranking by the number of articles using the word. Only words of
        two or more characters are counted.
        """
        word_sum = {}  # word totals over all bodies
        title_word_sum = {}  # word totals over all titles
        post_word_cnt_list = []  # per-article counted-word numbers

        list_dir = os.listdir(self.posts_data_dir)
        for name in list_dir:
            data_file = "{}/{}".format(self.posts_data_dir, name)
            if not os.path.isfile(data_file):
                continue
            with open(data_file, encoding="utf-8") as of:
                doc = json.loads(of.read())

            # Accurate mode is jieba's default; cut_all=False is just explicit.
            # page_parsing stores None for a missing field, hence ``or ""``.
            words = jieba.cut(doc["content"] or "", cut_all=False)
            word_cnt = self._accumulate(word_sum, Counter(words).most_common())
            post_word_cnt_list.append(
                (word_cnt, doc["postdate"], doc["title"], doc["url"]))

            words = jieba.cut(doc["title"] or "", cut_all=False)
            self._accumulate(title_word_sum, Counter(words).most_common())

        post_word_cnt_list = sorted(post_word_cnt_list, key=lambda d: d[0], reverse=True)
        self._write_table(
            "{}/content_statis_{}.dat".format(self.result_dir, self.user_id),
            "| 词语 | 发布日期 | 标题 | 链接 |",
            ("| {} | {} | {}| {}|".format(pw[0], pw[1], pw[2], pw[3])
             for pw in post_word_cnt_list),
        )

        # Rankings over all bodies, then over all titles.
        self._write_word_tables("content", word_sum)
        self._write_word_tables("title", title_word_sum)

        print("一共统计文章:{} 篇".format(len(list_dir)))
        print("所有正文-使用了2字及以上词语:{} 个".format(len(word_sum)))
        print("所有标题-使用了2字及以上词语:{} 个".format(len(title_word_sum)))
# Script entry point: crawl one author's article list page.
if __name__ == '__main__':
    sp = Spider(start_url="http://www.jianshu.com/u/65fd4e5d930d")
    print("获取作者文章列表页面...")
    sp.post_list_page()
    # The three later stages are disabled here; only their progress messages
    # are printed. Uncomment a call to actually run that stage.
    print("获取作者所有文章页面...")
    #sp.posts_html()
    print("解析作者所有文章页面...")
    #sp.page_parsing()
    print("简单统计分析文章词汇...")
    #sp.statistics()

登入後複製

程式運行統計的結果見文章: 我統計了彭小六簡書360篇文章中使用的詞語

相信看了這些案例你已經掌握了方法，更多精彩請關注php中文網其它相關文章！