This is the error message:
Traceback (most recent call last):
File "D:\py\pic_downfrom2255ok.py", line 45, in <module>
html = getHtml(url_all[i])
File "D:\py\pic_downfrom2255ok.py", line 32, in getHtml
html = response.read().decode()
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xb3 in position 184: invalid start byte
Many places have been changed. The main reason may be that the target website is encoded in gb2312.
This program can download pictures normally on other websites, but there will be problems when changing to the current website.
Please give me your advice. where is the problem? I tried several methods but nothing worked.
The source code is as follows:
#coding=utf-8
import urllib.request
from urllib.request import urlopen, urlretrieve
import urllib
import urllib.parse
import re
import os
from bs4 import BeautifulSoup
url_all =[
'http://www.shop2255.com/showpro/2603.html',
'http://www.shop2255.com/showpro/1558.html',
'http://www.shop2255.com/showpro/1564.html',
'http://www.shop2255.com/showpro/2411.html',
'http://www.shop2255.com/showpro/2409.html',
'http://www.shop2255.com/showpro/1561.html',
'http://www.shop2255.com/showpro/2414.html',
'http://www.shop2255.com/showpro/2609.html',
'http://www.shop2255.com/showpro/2413.html',
'http://www.shop2255.com/showpro/2604.html',
'http://www.shop2255.com/showpro/2605.html',
'http://www.shop2255.com/showpro/2606.html',
'http://www.shop2255.com/showpro/2608.html',
'http://www.shop2255.com/showpro/2607.html',
'http://www.shop2255.com/showpro/2610.html']
def getHtml(url):
response = urlopen(url)
html = response.read().decode("gbk")
return html
def getImg(html):
reg = 'src="(.+?\.jpg)"'
imgre = re.compile(reg)
imglist = re.findall(imgre,html)
return imglist
for i in range(len(url_all)):
html = getHtml(url_all[i])
list=getImg(html.decode())
x = 0
for imgurl in list:
print(x)
file_path = url_all[i]
(filepath,tempfilename) = os.path.split(file_path)
(filename,extension) = os.path.splitext(tempfilename)
if not os.path.exists('d:\%s' % filename):
os.mkdir('d:\%s' % filename)
# os.mkdir('D:\%s' % filename2)
local=r'D:\%s\%s.jpg' % (filename,imgurl.splite("/")[-1])
urllib.request.urlretrieve(imgurl,local)
x+=1
print("done")
First in your code
local=r'D:%s%s.jpg' % (filename,imgurl.splite("/")[-1])
中split
写成了splite
.Also
urllib.request.urlretrieve(imgurl,local)
这个imgurl
不是一个合法的url,只是一个相对 url, 要改成绝对 url,需要加上
base_url = 'http://www.shop2255.com/'
There also seems to be a problem with the generated file path.