Table of Contents
(.*?)
Home Backend Development Python Tutorial 多线程爬虫批量下载pcgame图片url 保存为xml的实现代码

多线程爬虫批量下载pcgame图片url 保存为xml的实现代码

Jun 16, 2016 am 08:46 AM
Multithreading Batch download reptile

复制代码 代码如下:

#coding=gbk
from xml.dom import minidom,Node
import urllib2,re,os
def readsrc(src):
    try:
        url = urllib2.urlopen(src)
        content = url.read()#.decode('utf-8')
        return content
    except:
        print 'error'
        return None
def pictype(content):
    '''
    通过抓取网站导航栏,获得网站的图片类型
    返回列表,每个列表元素为一个字典,addr代表图片类型对于的链接,name代表图片类型的名称
    错误会返回None
    '''
    p = re.compile(r'
    (.*)
',re.S)
    r=p.search(content)
    if r:
        content=r.group()
    else:
        print None
    p = re.compile(r'
  • \s*.*?)">(?P.*?)\s*\s*
  • ')

        l = [i.groupdict() for i in p.finditer(content)]
        l=l[1:]
        if len(l):return l
        else:return None
    def pageinfo(src):
        '''
        获取一个页面的详细信息
        返回对于的字典列表
        name:图片的名字
        cutaddr:缩小的浏览图
        picaddr:实际图片的地址
        '''
        d=os.path.split(src)[0]
        try:
            url = urllib2.urlopen(src)
            content = url.read()#.decode('utf-8')
        except:
            print 'error'
            return None
        #find all the pictures info in a page
        p = re.compile(r'

    (.*?)',re.S)
        r = p.findall(content)
        if not r: return None
        r = r[1]
        p = re.compile(r'
  • 多线程爬虫批量下载pcgame图片url 保存为xml的实现代码.*?)" * */>.*?
  • ')
        l = [ i.groupdict() for i in p.finditer(r)]
        for i in l:
            i['picaddr']=d+'/'+i['picaddr']
        if len(l): return l
        else: return None

    def nextpageaddr(src):
        '''
        从页面的html源码中获取下一个页面地址的名称,最后一页返回None
        '''
        content=readsrc(src)
        p = re.compile(r'')
        r = p.search(content)
        if r:
            return os.path.dirname(src)+"/"+r.group(1)
        else:
            return None
    def picinfoaddr(src):
        '''
        参数相册图集的html代码
        返回全部图片的相对地址
        '''
        content=readsrc(src)
        p = re.compile(r'

    .*?.*?
    ',re.S)
        r = p.search(content)
        if r:
            return os.path.dirname(src)+"/"+r.group(1)
        else:
            return None
    def parseinfo(content):
        '''
        读取全部图片html代码,获得一个相册的详细信息
        kw:关键字
        title:标题
        type:类型
        pic:各个图片的地址列表,末尾加上_220x165,_medium,_small 可以得到不同大小的图片
        '''
        info={}
        temp=str()

        #title
        temp=''
        r=re.search('

    (.*?)

    ',content)#get the pic title
        if r:
            temp = r.group(1)
        info['title']=temp

        #keyword
        temp=''
        r=re.search('',content)
        if r:
            temp = r.group(1)
        info['kw']=temp

        #type
        r=re.findall('(.*?).*?>',content)
        if r:
            info['type']=':'.join(r)
        else:
            info['type']=''
        r=re.search('

      (.*?)
    ',content,re.S)
        if not r:return None
        content=r.group(1)#filter content
    #    print content
        r=re.findall('',content)

        for index,i in enumerate(r):
            r[index]=i[0:i.rfind('_')]
    #        print r[index]
        info['pic']=r
        return info
    import threading
    class mthread(threading.Thread):
        def __init__(self,tp,addr,lock):
            threading.Thread.__init__(self)
    #        self.doc = minidom.Document()
            self.doc=minidom.Document()
            self.tp=tp
            self.lock=lock
            self.addr=addr
            self.thread_stop=False
            self.picdoc=None
        def run(self):
            self.picdoc = self.doc.createElement('urlclass')
    #        print self.tp
            self.picdoc.setAttribute('type',self.tp)
    #        self.doc.appendChild(self.picdoc)
            m=pageinfo(self.addr)
            while self.addr:
                for i in m:
    #                print i['picaddr']
                    picaddr=picinfoaddr(i['picaddr'])
    #                print picaddr
                    info=parseinfo(readsrc(picaddr))
                    name=info['title']


                    picture=doc.createElement('picture')

                    title = doc.createElement('title')
                    title.appendChild(doc.createTextNode(info['title']))
                    picture.appendChild(title)

                    keyword = doc.createElement('keywords')
                    keyword.appendChild(doc.createTextNode(info['kw']))
                    picture.appendChild(keyword)

                    tp = doc.createElement('pictype')
                    tp.appendChild(doc.createTextNode(info['type']))
                    picture.appendChild(tp)

                    cuturl = doc.createElement('piccut')
                    cuturl.appendChild(doc.createTextNode(i['cutaddr']))
                    picture.appendChild(cuturl)

                    urls = doc.createElement('urls')
                    self.lock.acquire()
                    print 'downloading ',name
                    self.lock.release()
                    for picurl in info['pic']:
                        singleurl=doc.createElement('url')
                        singleurl.appendChild(doc.createTextNode(picurl+'.jpg'))
                        urls.appendChild(singleurl)

                    picture.appendChild(urls)
                    self.picdoc.appendChild(picture)
                m=pageinfo(self.addr)
                self.addr=nextpageaddr(self.addr)
    #        f = open('c:\\'+self.tp+'.xml','w')
    #        f.write(doc.toprettyxml(indent = ''))
    #        f.close()
        def stop(self):
            self.thread_stop=True


    path='C:\\pict\\'#下载的路径
    #import sys
    sys.exit(12)
    content=readsrc('http://photos.pcgames.com.cn/cate/3/1.html')
    r=pictype(content)
    lt=[]
    doc = minidom.Document()
    root=doc.createElement('url_resource')
    root.setAttribute('type','url')
    root.setAttribute('urltype','image')
    root.setAttribute('imgfmt','jpg')
    doc.appendChild(root)
    lock=threading.RLock()
    for iaddr in r:
        print 'downloading type: ',iaddr['name']
        addr=iaddr['addr']
        th=mthread(iaddr['name'],addr,lock)
        lt.append(th)
        th.start()
    for t in lt:
        t.join()
        root.appendChild(t.picdoc)

    print 'write'
    f = open('c:\\'+'urls'+'.xml','w')
    f.write(doc.toprettyxml(indent = ''))
    f.close()
    print doc.toprettyxml()
    print 'end'

    Statement of this Website
    The content of this article is voluntarily contributed by netizens, and the copyright belongs to the original author. This site does not assume corresponding legal responsibility. If you find any content suspected of plagiarism or infringement, please contact admin@php.cn

    Hot AI Tools

    Undresser.AI Undress

    Undresser.AI Undress

    AI-powered app for creating realistic nude photos

    AI Clothes Remover

    AI Clothes Remover

    Online AI tool for removing clothes from photos.

    Undress AI Tool

    Undress AI Tool

    Undress images for free

    Clothoff.io

    Clothoff.io

    AI clothes remover

    AI Hentai Generator

    AI Hentai Generator

    Generate AI Hentai for free.

    Hot Article

    R.E.P.O. Energy Crystals Explained and What They Do (Yellow Crystal)
    2 weeks ago By 尊渡假赌尊渡假赌尊渡假赌
    Repo: How To Revive Teammates
    4 weeks ago By 尊渡假赌尊渡假赌尊渡假赌
    Hello Kitty Island Adventure: How To Get Giant Seeds
    3 weeks ago By 尊渡假赌尊渡假赌尊渡假赌

    Hot Tools

    Notepad++7.3.1

    Notepad++7.3.1

    Easy-to-use and free code editor

    SublimeText3 Chinese version

    SublimeText3 Chinese version

    Chinese version, very easy to use

    Zend Studio 13.0.1

    Zend Studio 13.0.1

    Powerful PHP integrated development environment

    Dreamweaver CS6

    Dreamweaver CS6

    Visual web development tools

    SublimeText3 Mac version

    SublimeText3 Mac version

    God-level code editing software (SublimeText3)

    C++ function exceptions and multithreading: error handling in concurrent environments C++ function exceptions and multithreading: error handling in concurrent environments May 04, 2024 pm 04:42 PM

    Function exception handling in C++ is particularly important for multi-threaded environments to ensure thread safety and data integrity. The try-catch statement allows you to catch and handle specific types of exceptions when they occur to prevent program crashes or data corruption.

    Usage of JUnit unit testing framework in multi-threaded environment Usage of JUnit unit testing framework in multi-threaded environment Apr 18, 2024 pm 03:12 PM

    There are two common approaches when using JUnit in a multi-threaded environment: single-threaded testing and multi-threaded testing. Single-threaded tests run on the main thread to avoid concurrency issues, while multi-threaded tests run on worker threads and require a synchronized testing approach to ensure shared resources are not disturbed. Common use cases include testing multi-thread-safe methods, such as using ConcurrentHashMap to store key-value pairs, and concurrent threads to operate on the key-value pairs and verify their correctness, reflecting the application of JUnit in a multi-threaded environment.

    How can concurrency and multithreading of Java functions improve performance? How can concurrency and multithreading of Java functions improve performance? Apr 26, 2024 pm 04:15 PM

    Concurrency and multithreading techniques using Java functions can improve application performance, including the following steps: Understand concurrency and multithreading concepts. Leverage Java's concurrency and multi-threading libraries such as ExecutorService and Callable. Practice cases such as multi-threaded matrix multiplication to greatly shorten execution time. Enjoy the advantages of increased application response speed and optimized processing efficiency brought by concurrency and multi-threading.

    How do PHP functions behave in a multi-threaded environment? How do PHP functions behave in a multi-threaded environment? Apr 16, 2024 am 10:48 AM

    In a multi-threaded environment, the behavior of PHP functions depends on their type: Normal functions: thread-safe, can be executed concurrently. Functions that modify global variables: unsafe, need to use synchronization mechanism. File operation function: unsafe, need to use synchronization mechanism to coordinate access. Database operation function: Unsafe, database system mechanism needs to be used to prevent conflicts.

    How to implement multi-threading in PHP? How to implement multi-threading in PHP? May 06, 2024 pm 09:54 PM

    PHP multithreading refers to running multiple tasks simultaneously in one process, which is achieved by creating independently running threads. You can use the Pthreads extension in PHP to simulate multi-threading behavior. After installation, you can use the Thread class to create and start threads. For example, when processing a large amount of data, the data can be divided into multiple blocks and a corresponding number of threads can be created for simultaneous processing to improve efficiency.

    How to deal with shared resources in multi-threading in C++? How to deal with shared resources in multi-threading in C++? Jun 03, 2024 am 10:28 AM

    Mutexes are used in C++ to handle multi-threaded shared resources: create mutexes through std::mutex. Use mtx.lock() to obtain a mutex and provide exclusive access to shared resources. Use mtx.unlock() to release the mutex.

    Challenges and countermeasures of C++ memory management in multi-threaded environment? Challenges and countermeasures of C++ memory management in multi-threaded environment? Jun 05, 2024 pm 01:08 PM

    In a multi-threaded environment, C++ memory management faces the following challenges: data races, deadlocks, and memory leaks. Countermeasures include: 1. Use synchronization mechanisms, such as mutexes and atomic variables; 2. Use lock-free data structures; 3. Use smart pointers; 4. (Optional) implement garbage collection.

    Challenges and strategies for testing multi-threaded programs in C++ Challenges and strategies for testing multi-threaded programs in C++ May 31, 2024 pm 06:34 PM

    Multi-threaded program testing faces challenges such as non-repeatability, concurrency errors, deadlocks, and lack of visibility. Strategies include: Unit testing: Write unit tests for each thread to verify thread behavior. Multi-threaded simulation: Use a simulation framework to test your program with control over thread scheduling. Data race detection: Use tools to find potential data races, such as valgrind. Debugging: Use a debugger (such as gdb) to examine the runtime program status and find the source of the data race.

    See all articles