Python: encoding problem when writing a file with multiprocessing, but not with multithreading
我想大声告诉你 2017-06-15 09:21:36

I used multiprocessing to crawl data and write it to a file. The program ran without any errors, but the file turned out garbled when I opened it.

When I rewrote it with multithreading, the problem went away and everything worked fine.
Here is the code that writes the data to the file:

def Get_urls(start_page, end_page):
    print 'run task {} ({})'.format(start_page, os.getpid())

    url_text = codecs.open('url.txt', 'a', 'utf-8')
    for i in range(start_page, end_page + 1):
        pageurl = baseurl1 + str(i) + baseurl2 + searchword
        response = requests.get(pageurl, headers=header)
        soup = BeautifulSoup(response.content, 'html.parser')
        a_list = soup.find_all('a')
        for a in a_list:
            if a.text != '' and 'wssd_content.jsp?bookid' in a['href']:
                text = a.text.strip()
                url = baseurl + str(a['href'])
                url_text.write(text + '\t' + url + '\n')
    url_text.close()

The multiprocessing version uses a process pool:

def Multiple_processes_test():
    t1 = time.time()
    print 'parent process {} '.format(os.getpid())
    page_ranges_list = [(1,3),(4,6),(7,9)]
    pool = multiprocessing.Pool(processes=3)
    for page_range in page_ranges_list:
        pool.apply_async(func=Get_urls,args=(page_range[0],page_range[1]))
    pool.close()
    pool.join()
    t2 = time.time()
    print 'time:', t2 - t1
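For reference, a rough sketch of what the multithreaded variant mentioned above could look like (it reuses the same Get_urls defined earlier; the exact threading layout here is an assumption, not the original code):

import threading
import time

def Multiple_threads_test():
    t1 = time.time()
    page_ranges_list = [(1, 3), (4, 6), (7, 9)]
    threads = []
    for start_page, end_page in page_ranges_list:
        # each thread runs the same Get_urls defined above
        t = threading.Thread(target=Get_urls, args=(start_page, end_page))
        t.start()
        threads.append(t)
    for t in threads:
        t.join()  # wait for every crawler thread to finish
    print 'time:', time.time() - t1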

All replies (3)
巴扎黑

As the screenshot already shows, the file is being loaded with the wrong encoding, which means that when your processes wrote to it, the bytes were not valid UTF-8.
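One way to verify that is to decode the raw bytes of the output file yourself. A small diagnostic sketch (the file name url.txt comes from the question; the check itself is my addition):

# Read the raw bytes of url.txt and check whether they really form valid UTF-8.
with open('url.txt', 'rb') as f:
    raw = f.read()
try:
    raw.decode('utf-8')
    print 'url.txt is valid UTF-8'
except UnicodeDecodeError as e:
    # e.start is the byte offset where decoding first fails
    print 'not valid UTF-8, first bad byte at offset {}: {!r}'.format(e.start, raw[e.start:e.start + 16])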

世界只因有你

Add this as the first line of the file:

#coding: utf-8
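Note that this declaration only tells the interpreter how to decode the source file itself, so that non-ASCII string literals are read correctly; it does not change the encoding of what Get_urls writes to url.txt. A minimal Python 2 header would look like this (the literal below is a hypothetical example):

# coding: utf-8
# The declaration must be on the first or second line of the .py file.
# It controls how string literals in this source file are decoded,
# not the encoding of anything the script writes out.
searchword = u'关键词'  # hypothetical non-ASCII literal, just for illustration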
我想大声告诉你

Having several processes open the same file at the same time is quite dangerous and very likely to go wrong. Multithreading probably doesn't fail because of the GIL; separate processes have no such lock, so it is easy for the writes to get corrupted.

url_text = codecs.open('url.txt','a','utf-8')
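If several processes must keep appending to the same file, the minimal fix would be to serialize the writes with an explicit lock. A sketch using a Manager lock (a plain multiprocessing.Lock cannot be passed as a pool argument); the function name and page ranges mirror the question, the locking itself is my addition:

import codecs
import multiprocessing

def Get_urls_locked(start_page, end_page, lock):
    for i in range(start_page, end_page + 1):
        # ... build `line` exactly as in the original loop ...
        line = u'dummy line {}\n'.format(i)  # placeholder for text + '\t' + url + '\n'
        with lock:  # only one process appends at a time
            url_text = codecs.open('url.txt', 'a', 'utf-8')
            url_text.write(line)
            url_text.close()

def run_with_lock():
    manager = multiprocessing.Manager()
    lock = manager.Lock()  # proxy object, picklable, safe to pass to pool workers
    pool = multiprocessing.Pool(processes=3)
    for start_page, end_page in [(1, 3), (4, 6), (7, 9)]:
        pool.apply_async(Get_urls_locked, args=(start_page, end_page, lock))
    pool.close()
    pool.join()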

I suggest switching to a producer-consumer model instead!

For example:

# -*- coding: utf-8 -*-
import time
import os
import codecs
import multiprocessing
import requests
from bs4 import BeautifulSoup

baseurl = ''
baseurl1 = ''
baseurl2 = ''
pageurl = ''
searchword = ''
header = {}

def fake(url, **kwargs):
    class Response(object):
        pass
    o = Response()
    o.content = '<a href="/{}/wssd_content.jsp?bookid">foo</a>'.format(url)
    return o

requests.get = fake


def Get_urls(start_page, end_page, queue):
    print('run task {} ({})'.format(start_page, os.getpid()))
    try:
        for i in range(start_page, end_page + 1):
            pageurl = baseurl1 + str(i) + baseurl2 + searchword
            response = requests.get(pageurl, headers=header)
            soup = BeautifulSoup(response.content, 'html.parser')
            a_list = soup.find_all('a')
            for a in a_list:
                if a.text != '' and 'wssd_content.jsp?bookid' in a['href']:
                    text = a.text.strip()
                    url = baseurl + str(a['href'])
                    queue.put(text + '\t' + url + '\n')
    except Exception as e:
        import traceback
        traceback.print_exc()


def write_file(queue):
    print("start write file")
    url_text = codecs.open('url.txt', 'a', 'utf-8')
    while True:
        line = queue.get()
        if line is None:
            break
        print("write {}".format(line))
        url_text.write(line)
    url_text.close()


def Multiple_processes_test():
    t1 = time.time()
    manager = multiprocessing.Manager()
    queue = manager.Queue()
    print('parent process {} '.format(os.getpid()))
    page_ranges_list = [(1, 3), (4, 6), (7, 9)]
    consumer = multiprocessing.Process(target=write_file, args=(queue,))
    consumer.start()
    pool = multiprocessing.Pool(processes=3)
    results = []
    for page_range in page_ranges_list:
        result = pool.apply_async(func=Get_urls,
                         args=(page_range[0],
                               page_range[1],
                               queue
                            ))
        results.append(result)
    pool.close()
    pool.join()
    queue.put(None)
    consumer.join()
    t2 = time.time()
    print('time: {}'.format(t2 - t1))


if __name__ == '__main__':
    Multiple_processes_test()

Output:

foo /4/wssd_content.jsp?bookid
foo /5/wssd_content.jsp?bookid
foo /6/wssd_content.jsp?bookid
foo /1/wssd_content.jsp?bookid
foo /2/wssd_content.jsp?bookid
foo /3/wssd_content.jsp?bookid
foo /7/wssd_content.jsp?bookid
foo /8/wssd_content.jsp?bookid
foo /9/wssd_content.jsp?bookid
