Detailed introduction to the five modules in python-Python Tutorial-php.cn

This Baidu Baike (encyclopedia) crawler example walks through each step of a Python crawler, with detailed comments on every step of the code. Working through this case is a good way to learn the main characteristics of a Python crawler:

1. Crawler scheduling entry point (crawler_main.py)

# coding:utf-8
from com.wenhy.crawler_baidu_baike import url_manager, html_downloader, html_parser, html_outputer

# Module-level banner ("Baidu Baike crawler scheduling entry"); it is printed
# every time this module is executed or imported.
print("爬虫百度百科调度入口")

# 创建爬虫类
class SpiderMain(object):
    """Bundle the four collaborating crawler components into one object.

    Attributes:
        urls: ``url_manager.UrlManager`` instance.
        downloader: ``html_downloader.HtmlDownloader`` instance.
        parser: ``html_parser.HtmlParser`` instance.
        output: ``html_outputer.HtmlOutput`` instance.
    """

    def __init__(self):
        """Instantiate the URL manager, downloader, parser and outputer."""
        # URL bookkeeping
        self.urls = url_manager.UrlManager()
        # Page fetching
        self.downloader = html_downloader.HtmlDownloader()
        # Link and data extraction
        self.parser = html_parser.HtmlParser()
        # Result collection and HTML report
        self.output = html_outputer.HtmlOutput()


def craw(urls, downloader, parser, output, root_url, max_pages=500):
    """Crawl outward from ``root_url`` and then write the collected results.

    Args:
        urls: URL manager providing ``add_new_url``, ``add_new_urls``,
            ``has_new_url`` and ``get_new_url``.
        downloader: object whose ``download(url)`` returns the page content.
        parser: object whose ``parser(url, content)`` returns a
            ``(new_urls, new_data)`` pair, or ``None`` when there is nothing
            to parse.
        output: collector providing ``collect_data(data)`` and
            ``output_html()``.
        root_url: first URL to visit.
        max_pages: maximum number of successfully processed pages
            (default 500). Pages that fail do not count towards the limit.

    Any exception raised while handling a single page is reported on stdout
    and the crawl moves on to the next pending URL. ``output.output_html()``
    is always called once the loop finishes.
    """
    # Sequence number of the page being attempted; advances only on success.
    count = 1
    # Seed the URL manager with the entry page.
    urls.add_new_url(root_url)
    # Keep going while the limit is not reached and URLs are still pending.
    while count <= max_pages and urls.has_new_url():
        try:
            # Take the next pending URL.
            new_url = urls.get_new_url()
            print('crawler %d : %s' % (count, new_url))
            # Download the page content.
            html_cont = downloader.download(new_url)
            # Extract the outgoing links and the page data.
            parsed = parser.parser(new_url, html_cont)
            if parsed is None:
                # The parser had nothing to work on; report it explicitly
                # instead of failing on tuple unpacking.
                print('Crawler Failed  no content for %s' % (new_url,))
                continue
            new_urls, new_data = parsed
            # Queue every link found on the page.
            urls.add_new_urls(new_urls)
            # Keep the extracted data for the final report.
            output.collect_data(new_data)
            count = count + 1

        except Exception as e:
            # Best effort: one broken page must not stop the whole crawl.
            print('Crawler Failed  %s' % (e,))

    output.output_html()


if __name__ == '__main__':
    # Build the spider; it only bundles the four collaborating components.
    obj_spider = SpiderMain()
    # Entry URL: the Baidu Baike page for Python.
    root_url = "http://baike.baidu.com/item/Python"
    # Start the crawl, handing each component to craw() by name.
    craw(
        urls=obj_spider.urls,
        downloader=obj_spider.downloader,
        parser=obj_spider.parser,
        output=obj_spider.output,
        root_url=root_url,
    )

2、封装URL管理器（url_manager.py）

Copy after login

# coding:utf-8

# Module-level banner ("URL manager"); it is printed every time this module
# is executed or imported.
print("URL管理器")


class UrlManager(object):
    """Track which URLs are still pending and which were already handed out.

    Attributes:
        new_urls: set of URLs waiting to be crawled.
        old_urls: set of URLs already returned by ``get_new_url``.
    """

    def __init__(self):
        """Start with empty pending and visited sets."""
        self.new_urls = set()
        self.old_urls = set()

    def add_new_url(self, url):
        """Queue ``url`` unless it is ``None``, already pending, or visited."""
        if url is None:
            return

        already_known = url in self.new_urls or url in self.old_urls
        if not already_known:
            self.new_urls.add(url)

    def add_new_urls(self, urls):
        """Queue every URL in ``urls``; ``None`` or an empty collection is a no-op."""
        if urls is not None and len(urls) != 0:
            for candidate in urls:
                self.add_new_url(candidate)

    def has_new_url(self):
        """Return ``True`` while at least one URL is still pending."""
        return bool(self.new_urls)

    def get_new_url(self):
        """Remove one pending URL, record it as visited, and return it.

        ``set.pop`` removes an arbitrary element, so no crawl order is
        guaranteed; it raises ``KeyError`` when nothing is pending.
        """
        picked = self.new_urls.pop()
        self.old_urls.add(picked)
        return picked

3、HTML下载器（html_downloader.py）

Copy after login

# coding:utf-8

import urllib2
# Module-level banner ("download HTML"); it is printed every time this module
# is executed or imported.
print("下载HTML")


class HtmlDownloader(object):
    """Fetch the raw content of a page with ``urllib2``."""

    def download(self, url):
        """Return the body of ``url``, or ``None`` when it cannot be used.

        Args:
            url: address to fetch; ``None`` is accepted and yields ``None``.

        Returns:
            The response body as returned by ``response.read()``, or ``None``
            when ``url`` is ``None`` or the response code is not 200.

        Errors raised by ``urllib2.urlopen`` propagate to the caller.
        """
        # Nothing to fetch.
        if url is None:
            return None
        # Open the URL.
        response = urllib2.urlopen(url)
        try:
            # Only a 200 response is treated as a usable page.
            if response.getcode() != 200:
                return None
            # Success: hand back the page content.
            return response.read()
        finally:
            # Always release the connection, including on the non-200 path
            # and when read() raises.
            response.close()

4、HTML 解析器(html_parser.py)

Copy after login

# coding:utf-8

from bs4 import BeautifulSoup
import re
import urlparse

# Module-level banner ("HTML parser"); it is printed every time this module
# is executed or imported.
print("HTML 解析器")


class HtmlParser(object):
    """Extract entry links and title/summary data from a downloaded page."""

    def _get_new_urls(self, page_url, soup):
        """Return the set of absolute URLs for every ``/item/`` link in ``soup``.

        Each ``href`` is resolved against ``page_url`` with
        ``urlparse.urljoin``.
        """
        # Anchors whose href contains "/item/" are entry (lemma) links.
        entry_links = soup.find_all('a', href=re.compile(r'/item/'))
        return {urlparse.urljoin(page_url, anchor['href']) for anchor in entry_links}

    def _get_new_data(self, page_url, soup):
        """Return a dict with the page's ``url``, ``title`` and ``summary``.

        NOTE(review): if the title or summary node is absent, ``find``
        presumably returns ``None`` and the attribute access below raises;
        confirm callers are prepared for that.
        """
        # Title markup: <dd class="lemmaWgt-lemmaTitle-title"><h1>...</h1>
        title_holder = soup.find('dd', class_='lemmaWgt-lemmaTitle-title')
        title_text = title_holder.find('h1').get_text()
        # Summary markup: <div class="lemma-summary" label-module="lemmaSummary">
        summary_text = soup.find('div', class_='lemma-summary').get_text()

        return {
            'url': page_url,
            'title': title_text,
            'summary': summary_text,
        }

    def parser(self, page_url, html_content):
        """Parse one page.

        Returns:
            ``(new_urls, new_data)`` where ``new_urls`` is a set of absolute
            URLs and ``new_data`` is the dict built by ``_get_new_data``;
            ``None`` when either argument is ``None``.
        """
        if page_url is not None and html_content is not None:
            # Build the BeautifulSoup tree for the page.
            soup = BeautifulSoup(html_content, 'html.parser', from_encoding='utf-8')
            found_urls = self._get_new_urls(page_url, soup)
            found_data = self._get_new_data(page_url, soup)
            return found_urls, found_data
        return None

5、HTML 输出页面（html_outputer.py）

Copy after login

# coding:utf-8

# Module-level banner ("HTML output page"); it is printed every time this
# module is executed or imported.
print("HTML 输出页面")


class HtmlOutput(object):
    """Collect crawled page records and render them as an HTML table.

    Attributes:
        datas: list of collected records; each is expected to be a mapping
            with ``'url'``, ``'title'`` and ``'summary'`` keys.
    """

    def __init__(self):
        """Start with no collected records."""
        self.datas = []

    def collect_data(self, data):
        """Store one record; ``None`` is ignored."""
        if data is None:
            return
        self.datas.append(data)

    @staticmethod
    def _to_text(value):
        """Return ``value`` as unicode text, decoding UTF-8 byte strings."""
        if isinstance(value, bytes):
            return value.decode('utf-8')
        return u'%s' % (value,)

    def output_html(self):
        """Write every collected record to ``output.html`` as one table row.

        The file is written UTF-8 encoded, relative to the current working
        directory. URL, title and summary are HTML-escaped because they come
        from crawled pages and must not be able to break the report markup.

        Raises:
            KeyError: if a record lacks ``'url'``, ``'title'`` or
                ``'summary'``.
        """
        # Local imports keep this class self-contained.
        import io
        from xml.sax.saxutils import escape

        row_template = u'<tr><td><a href = "%s">%s</a></td><td>%s</td></tr>'
        parts = [
            u'<html>',
            u'<head>',
            u'<meta http-equiv="Content-Type" content="text/html;charset=utf-8">',
            u'</head>',
            u'<body>',
            u'<table border="1">',
        ]
        for data in self.datas:
            # The URL sits inside a double-quoted attribute, so quotes are
            # escaped as well.
            url = escape(self._to_text(data['url']), {u'"': u'&quot;'})
            title = escape(self._to_text(data['title']))
            summary = escape(self._to_text(data['summary']))
            parts.append(row_template % (url, title, summary))
        parts.extend([u'</table>', u'</body>', u'</html>'])

        # Build the document first, then write it in one go; the context
        # manager closes the file even if the write fails.
        with io.open('output.html', 'w', encoding='utf-8') as fout:
            fout.write(u''.join(parts))


总结：python爬虫主要就是五个模块：爬虫启动入口模块，URL管理器存放已经爬虫的URL和待爬虫URL列表，html下载器，html解析器，html输出器
     同时可以掌握到urllib2的使用、bs4（BeautifulSoup）页面解析器、re正则表达式、urlparse、python基础知识回顾（set集合操作）等相关内容。

Copy after login

The above is the detailed content of Detailed introduction to the five modules in python. For more information, please follow other related articles on the PHP Chinese website!