Python encounters encoding problems when writing files using multiple processes, but not when using multiple threads
我想大声告诉你 2017-06-15 09:21:36

Use multiple processes to crawl data and write it to a file. No error is reported when running, but the file is garbled when opened.

There is no such problem when rewriting with multi-threading, everything is normal.
The following is the code for writing data to a file:

def Get_urls(start_page, end_page):
    print 'run task {} ({})'.format(start_page, os.getpid())

    url_text = codecs.open('url.txt', 'a', 'utf-8')
    for i in range(start_page, end_page + 1):
        pageurl = baseurl1 + str(i) + baseurl2 + searchword
        response = requests.get(pageurl, headers=header)
        soup = BeautifulSoup(response.content, 'html.parser')
        a_list = soup.find_all('a')
        for a in a_list:
            if a.text != '' and 'wssd_content.jsp?bookid' in a['href']:
                text = a.text.strip()
                url = baseurl + str(a['href'])
                url_text.write(text + '\t' + url + '\n')
    url_text.close()

The process pool used for multiprocessing:

def Multiple_processes_test():
    t1 = time.time()
    print 'parent process {} '.format(os.getpid())
    page_ranges_list = [(1, 3), (4, 6), (7, 9)]
    pool = multiprocessing.Pool(processes=3)
    for page_range in page_ranges_list:
        pool.apply_async(func=Get_urls, args=(page_range[0], page_range[1]))
    pool.close()
    pool.join()
    t2 = time.time()
    print 'time:', t2 - t1

Replies (3)
巴扎黑

As the screenshot suggests, the file is being loaded with the wrong encoding, which means that when you write from multiple processes, the output is not valid UTF-8.
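A hypothetical, minimal illustration of why this happens (the strings and byte layout below are my own, not from the post): when two processes append to the same file without coordination, one writer's bytes can land in the middle of another writer's multi-byte UTF-8 character, and the file no longer decodes as UTF-8:

```python
# -*- coding: utf-8 -*-
# Illustrative only: simulate two writers whose appends interleave so that
# one writer's bytes split the other's multi-byte UTF-8 character.
data_a = u'你好'.encode('utf-8')   # 6 bytes: two 3-byte characters
data_b = u'world'.encode('utf-8')

# Writer B's bytes land inside the first character of writer A's output.
interleaved = data_a[:2] + data_b + data_a[2:]

try:
    interleaved.decode('utf-8')
    print('decoded cleanly')
except UnicodeDecodeError:
    print('decode failed')  # this is what "garbled when opened" looks like
```

The truncated `\xe4\xbd` prefix starts a 3-byte sequence, so the `w` that follows it is an invalid continuation byte; a text editor shows this as mojibake.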

世界只因有你

Add this as the first line of the source file:

#coding: utf-8
我想大声告诉你

Having several processes open and append to the same file concurrently is quite dangerous; the writes can interleave and corrupt the output. If the multi-threaded version does not fail, that is most likely thanks to the GIL, which serializes the threads' writes. Separate processes share no such lock, so errors are easy to trigger here:

url_text = codecs.open('url.txt','a','utf-8')

A cleaner fix is switching to a producer-consumer model, where a single process owns the file!

Like this

# -*- coding: utf-8 -*-
import time
import os
import codecs
import multiprocessing
import requests
from bs4 import BeautifulSoup

baseurl = ''
baseurl1 = ''
baseurl2 = ''
pageurl = ''
searchword = ''
header = {}

def fake(url, **kwargs):
    class Response(object):
        pass
    o = Response()
    o.content = '<a href="/{}/wssd_content.jsp?bookid">foo</a>'.format(url)
    return o

requests.get = fake


def Get_urls(start_page, end_page, queue):
    print('run task {} ({})'.format(start_page, os.getpid()))
    try:
        for i in range(start_page, end_page + 1):
            pageurl = baseurl1 + str(i) + baseurl2 + searchword
            response = requests.get(pageurl, headers=header)
            soup = BeautifulSoup(response.content, 'html.parser')
            a_list = soup.find_all('a')
            for a in a_list:
                if a.text != '' and 'wssd_content.jsp?bookid' in a['href']:
                    text = a.text.strip()
                    url = baseurl + str(a['href'])
                    queue.put(text + '\t' + url + '\n')
    except Exception as e:
        import traceback
        traceback.print_exc()


def write_file(queue):
    print("start write file")
    url_text = codecs.open('url.txt', 'a', 'utf-8')
    while True:
        line = queue.get()
        if line is None:
            break
        print("write {}".format(line))
        url_text.write(line)
    url_text.close()


def Multiple_processes_test():
    t1 = time.time()
    manager = multiprocessing.Manager()
    queue = manager.Queue()
    print 'parent process {} '.format(os.getpid())
    page_ranges_list = [(1, 3), (4, 6), (7, 9)]
    consumer = multiprocessing.Process(target=write_file, args=(queue,))
    consumer.start()
    pool = multiprocessing.Pool(processes=3)
    results = []
    for page_range in page_ranges_list:
        result = pool.apply_async(func=Get_urls,
                         args=(page_range[0],
                               page_range[1],
                               queue
                            ))
        results.append(result)
    pool.close()
    pool.join()
    queue.put(None)
    consumer.join()
    t2 = time.time()
    print 'time:', t2 - t1


if __name__ == '__main__':
    Multiple_processes_test()

Results

foo /4/wssd_content.jsp?bookid
foo /5/wssd_content.jsp?bookid
foo /6/wssd_content.jsp?bookid
foo /1/wssd_content.jsp?bookid
foo /2/wssd_content.jsp?bookid
foo /3/wssd_content.jsp?bookid
foo /7/wssd_content.jsp?bookid
foo /8/wssd_content.jsp?bookid
foo /9/wssd_content.jsp?bookid

php.cn: public-welfare online PHP training, helping PHP learners grow quickly!