Crawler-Bilder – Bitte sagen Sie mir: Python-Crawler-Codierungsproblem, Version 3.6, Win10 64-Bit?
伊谢尔伦
伊谢尔伦 2017-05-18 10:53:14
0
2
971

Das ist die Fehlermeldung:

Traceback (most recent call last):
  File "D:\py\pic_downfrom2255ok.py", line 45, in <module>
    html = getHtml(url_all[i])
  File "D:\py\pic_downfrom2255ok.py", line 32, in getHtml
    html = response.read().decode()
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xb3 in position 184: invalid start byte

Der Hauptgrund dafür ist möglicherweise, dass die Zielwebsite in GB2312 codiert ist.
Mit diesem Programm lassen sich Bilder von anderen Websites normalerweise problemlos herunterladen. Beim Wechsel zur aktuellen Website treten jedoch Probleme auf. Ich bitte um ein paar Ratschläge: Wo liegt das Problem? Ich habe mehrere Methoden ausprobiert, aber nichts hat funktioniert. Der Quellcode lautet wie folgt:

#coding=utf-8
import urllib.request
from urllib.request import urlopen, urlretrieve 
import urllib
import urllib.parse
import re
import os
from bs4 import BeautifulSoup


# Product pages to crawl.  Every URL has the same shape and differs only in
# its numeric page id, so the list is generated from the ids (order kept).
url_all = [
    'http://www.shop2255.com/showpro/%d.html' % page_id
    for page_id in (
        2603, 1558, 1564, 2411, 2409,
        1561, 2414, 2609, 2413, 2604,
        2605, 2606, 2608, 2607, 2610,
    )
]

def getHtml(url, encoding="gbk"):
    """Download *url* and return the response body decoded as text.

    Args:
        url: Address to fetch; anything ``urllib.request.urlopen`` accepts.
        encoding: Codec used to decode the body.  Defaults to ``"gbk"``
            because decoding this site's pages as UTF-8 fails (the site
            appears to be GB2312/GBK encoded).

    Returns:
        The decoded page content as ``str``.

    Raises:
        urllib.error.URLError: If the request fails.
        UnicodeDecodeError: If the body is not valid in *encoding*.
    """
    # The ``with`` block closes the connection even if read() raises.
    with urlopen(url) as response:
        raw = response.read()
    return raw.decode(encoding)


def getImg(html):
    """Return every double-quoted ``src`` attribute value in *html* ending in ``.jpg``.

    Args:
        html: Page source as ``str``.

    Returns:
        A list of the matched ``src`` values, in document order, exactly as
        they appear in the page (they may be relative URLs).
    """
    # ``[^"]+`` keeps a match inside one attribute value; ``.+?`` could run
    # past the closing quote of a non-.jpg src and swallow the following tags.
    # The raw string avoids the invalid ``'\.'`` escape sequence.
    return re.findall(r'src="([^"]+\.jpg)"', html)

# Crawl every page in url_all and try to save each matched .jpg into a
# per-page folder on drive D:.  This is the asker's original, failing code;
# the NOTE(review) comments mark the defects without changing it.
for i in range(len(url_all)):
    html = getHtml(url_all[i])
    # NOTE(review): getHtml() already returns a decoded str, and str has no
    # .decode() method in Python 3, so this line raises AttributeError.
    # It also rebinds the builtin name `list`.
    list=getImg(html.decode())
    x = 0  # running index of the image within the current page
    for imgurl in list:
        print(x)
        # Use the page's file name without its extension ("2603" for
        # ".../showpro/2603.html") as the name of the output folder.
        file_path = url_all[i]
        (filepath,tempfilename) = os.path.split(file_path)
        (filename,extension) = os.path.splitext(tempfilename)
        
        # Create the per-page folder the first time it is needed.
        # NOTE(review): '\%' is not a recognised escape, so the backslash is
        # kept literally, but a raw string would avoid the warning.
        if not os.path.exists('d:\%s' % filename):
            os.mkdir('d:\%s' % filename)
        # os.mkdir('D:\%s' % filename2)
        
        # NOTE(review): `splite` is a typo for `split` and raises
        # AttributeError.  The matched src already ends in ".jpg", so the
        # format string would also produce names like "a.jpg.jpg".
        local=r'D:\%s\%s.jpg' % (filename,imgurl.splite("/")[-1])
        # NOTE(review): imgurl is passed exactly as matched in the page;
        # urlretrieve needs an absolute URL, so a relative src would have to
        # be joined with the site root first.
        urllib.request.urlretrieve(imgurl,local)
        x+=1
print("done")
伊谢尔伦
伊谢尔伦

小伙看你根骨奇佳,潜力无限,来学PHP伐。

Antworte allen(2)
Peter_Zhu
# coding: utf-8

import urllib
import requests
from pyquery import PyQuery as Q
import os

# Python 3 locations of urljoin/urlretrieve (``urllib.urlretrieve`` only
# exists in Python 2, and the question targets Python 3.6).
from urllib.parse import urljoin
from urllib.request import urlretrieve

# Site root; relative image paths are resolved against it.
base_url = 'http://www.shop2255.com/'


url_all = ['http://www.shop2255.com/showpro/2603.html']


# For every page, download all <img> sources into a folder named after the
# page ("2603" for ".../showpro/2603.html"), relative to the working directory.
for url in url_all:
    file_name = url.rsplit('/', 1)[-1]
    dir_name, _ext = os.path.splitext(file_name)

    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs(dir_name, exist_ok=True)

    r = requests.get(url)
    for img in Q(r.text).find('img'):
        src = Q(img).attr('src')
        if not src:
            # <img> without a src attribute: nothing to download.
            continue

        # urljoin leaves absolute URLs untouched and resolves relative or
        # root-relative ones; os.path.join is for file paths and would insert
        # backslashes on Windows or drop base_url for a leading "/".
        image_url = urljoin(base_url, src)
        image_name = image_url.rsplit('/', 1)[-1]
        if not image_name:
            # URL ends in "/": there is no file name to save under.
            continue

        image_path = os.path.join(dir_name, image_name)
        urlretrieve(image_url, image_path)
漂亮男人

首先,在你的这段代码里,local=r'D:\%s\%s.jpg' % (filename,imgurl.splite("/")[-1]) 这一行把 split 写成了 splite。

其次,urllib.request.urlretrieve(imgurl,local) 里的 imgurl 不是一个合法的 URL,只是一个相对 URL;要改成绝对 URL,需要在前面加上 base_url = 'http://www.shop2255.com/'。

还有生成的文件路径好像也有问题.

# -*- coding: utf-8 -*-

import urllib.request
from urllib.request import urlopen, urlretrieve
import urllib
import urllib.parse
import re
import os
from bs4 import BeautifulSoup

# Site root; scraped image paths are appended to it to form absolute URLs.
base_url = 'http://www.shop2255.com/'

# Product pages to crawl, generated from their numeric page ids (order kept).
url_all = [
    '%sshowpro/%d.html' % (base_url, page_id)
    for page_id in (
        2603, 1558, 1564, 2411, 2409,
        1561, 2414, 2609, 2413, 2604,
        2605, 2606, 2608, 2607, 2610,
    )
]

def getHtml(url, encoding="gbk"):
    """Fetch *url* and return its body decoded to ``str``.

    Args:
        url: Address to fetch; anything ``urllib.request.urlopen`` accepts.
        encoding: Codec for decoding the body.  Defaults to ``"gbk"``, the
            codec this script has always used for the crawled site.

    Returns:
        The decoded page content.

    Raises:
        urllib.error.URLError: If the request fails.
        UnicodeDecodeError: If the body is not valid in *encoding*.
    """
    # Context manager guarantees the connection is released, including when
    # read() fails part-way through.
    with urlopen(url) as response:
        body = response.read()
    # The former debug print of the whole page was dropped: it flooded the
    # console between the per-image progress numbers.
    return body.decode(encoding)


def getImg(html):
    """Collect the ``.jpg`` image sources referenced in *html*.

    Args:
        html: Page source as ``str``.

    Returns:
        List of the values of all double-quoted ``src`` attributes that end
        in ``.jpg``, in document order and unmodified (possibly relative).
    """
    # Restricting the value to non-quote characters stops a match from
    # starting at one tag's src (e.g. a .png) and ending at a later .jpg.
    # Raw string: ``'\.'`` in a normal literal is an invalid escape sequence.
    jpg_src = re.compile(r'src="([^"]+\.jpg)"')
    return jpg_src.findall(html)

# Download every .jpg referenced by each product page into D:\<page id>\.
for page_url in url_all:
    # getHtml() already returns str, so its result goes straight to getImg().
    images = getImg(getHtml(page_url))
    if not images:
        # No folder is created for a page without matching images.
        continue

    # ".../showpro/2603.html" -> "2603": one output folder per page.
    page_id = os.path.splitext(page_url.rsplit("/", 1)[-1])[0]
    target_dir = r'D:\%s' % page_id
    os.makedirs(target_dir, exist_ok=True)

    for x, imgurl in enumerate(images):
        print(x)
        # urljoin gives the same result as base_url + imgurl for plain
        # relative paths, and also copes with a leading "/" or an absolute
        # src.  NOTE(review): resolving against the site root rather than the
        # page URL follows the original script; confirm against the site.
        full_url = urllib.parse.urljoin(base_url, imgurl)
        # The matched src already ends in ".jpg"; appending it again
        # produced file names like "a.jpg.jpg".
        local = os.path.join(target_dir, imgurl.split("/")[-1])
        try:
            urllib.request.urlretrieve(full_url, local)
        except OSError as err:
            # URLError/HTTPError and local write errors are OSError subclasses;
            # report and carry on so one bad image does not stop the crawl.
            print("can't retrieve %s: %s" % (full_url, err))

print("done")
Beliebte Tutorials
Mehr>
Neueste Downloads
Mehr>
Web-Effekte
Quellcode der Website
Website-Materialien
Frontend-Vorlage