Python crawler example: downloading comics

This example crawls a comic from manhua.ali213.net: it reads the chapter list from the comic's index page, unpacks the image paths that the site obfuscates with a JavaScript packer, and downloads every page with a small pool of worker threads. The code is as follows:
#!/usr/bin/env python3
# Comic downloader for manhua.ali213.net: parses the chapter list,
# unpacks the obfuscated image paths and downloads the pages in threads.
import os
import re
import socket
import sys
import threading
import time
import urllib.request

weburl = ''            # URL of the comic's index page
folder = ''            # local directory to save the images into
chapterbegin = 0       # chapter index to start from
currentthreadnum = 0   # number of download threads currently running
threadcount = 6        # maximum number of concurrent download threads

if len(sys.argv) >= 3:
    weburl = sys.argv[1]
    folder = sys.argv[2]
else:
    print("usage: downloadmanhua weburl folder [chapterbegin=0] [threadnum=6]")
    sys.exit(0)
if len(sys.argv) >= 4:
    chapterbegin = int(sys.argv[3])
if len(sys.argv) >= 5:
    threadcount = int(sys.argv[4])
def jin(i, jinzhi):
    """Convert the integer i to a string in base `jinzhi` (up to base 36)."""
    finalans = ""
    answer = i % jinzhi
    i = int(i / jinzhi)
    if answer > 9:
        finalans = finalans + chr(ord('a') + (answer - 10))
    else:
        finalans = finalans + str(answer)
    if i != 0:
        finalans = jin(i, jinzhi) + finalans
    return finalans
def urlparse(p, a, c, k):
    """Line-by-line port of the JavaScript eval(function(p,a,c,k,e,d)) unpacker:
    replaces the base-36 tokens in p with the words from the keyword list k."""
    d = {}
    e = lambda c: jin(c, 36)   # leftover from the JavaScript original
    if 1:
        while c:
            c = c - 1
            if not k[c]:
                d[jin(c, 36)] = jin(c, 36)
            else:
                d[jin(c, 36)] = k[c]
        k = [lambda e: d[e]]
        e = lambda c: '\\w+'
        c = 1
    newstr = ""
    while c:
        c = c - 1
        if k[c]:
            for i in range(0, len(p)):
                tempi = p[i]
                tempi = ord(tempi)
                if tempi >= ord('a') and tempi <= ord('z'):
                    newstr += d[chr(tempi)]
                elif tempi >= ord('0') and tempi <= ord('9'):
                    newstr += d[chr(tempi)]
                else:
                    newstr += chr(tempi)
    return newstr
def meispower(s):
    """Extract the arguments of the packed eval(...) call, unpack it and
    return the base path of the chapter's images."""
    p = re.compile(r"(?=\}\().*", re.IGNORECASE)
    s = p.findall(s)
    s = s[0]
    s = s[0:(len(s) - 19)]           # strip the trailing .split('|'),... call
    par = s.split(',')
    par[3] = par[3][1:len(par[3])]   # drop the leading quote of the word list
    answer = par[3].split('|')
    chapterpath = urlparse(par[0], int(par[1]), int(par[2]), answer)
    allurl = re.findall('imgpath=[^;]*', chapterpath)[0]
    allurl = allurl[10:(len(allurl) - 2)]   # strip the imgpath=\" prefix and closing \"
    return allurl
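
For context, urlparse and meispower together undo the common eval(function(p,a,c,k,e,d){...}) JavaScript packer that the site uses to obfuscate its image paths. The packed call they pick apart has roughly this shape (all concrete values elided here):

eval(function(p,a,c,k,e,d){…}('<body with base-36 tokens>',36,<token count>,'<word0|word1|…>'.split('|'),0,{}))

meispower splits everything after }( on commas to recover p, a, c and the |-separated word list, and urlparse substitutes each base-36 token in p with the corresponding word. The unpacked JavaScript contains an imgpath="…" assignment pointing at the chapter's image directory, which is what meispower finally returns.
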
def pictofile(weburl, filename, loop=100):
    """Download one picture to filename, retrying up to `loop` times."""
    if loop <= 0:
        print('can\'t download the picture %s' % weburl)
        return
    loop = loop - 1
    if os.path.exists(filename):
        return
    try:
        url = urllib.request.urlopen(weburl)
        data = url.read()
        # the size threshold was lost from the original post; treat a tiny
        # reply as a failed download and retry
        if len(data) < 1024:
            url.close()
            pictofile(weburl, filename, loop)
        else:
            print('download from %s name is %s\n' % (weburl, filename))
            myfile = open('%s' % filename, 'wb')
            myfile.write(data)
            myfile.close()
            url.close()
    except socket.timeout:
        print('timeout')
        pictofile(weburl, filename, loop)
    except Exception as e:
        print('error', e)
        pictofile(weburl, filename, loop)
def downloadpic(url, loadpicdir, num):
    """Download one picture from url into loadpicdir (runs in its own thread)."""
    global currentthreadnum, mutex, mutex2
    mymode = re.compile(r'[0-9a-z.]*\Z')   # trailing file name of the url
    try:
        mutex2.acquire()
        os.chdir(loadpicdir)
        mutex2.release()
    except:
        print("can't open the folder %s, it will be created" % loadpicdir)
        try:
            if mutex2.locked():
                os.mkdir(loadpicdir)
                os.chdir(loadpicdir)
                mutex2.release()
            print('create folder succeed')
        except:
            print("can't create folder %s" % loadpicdir)
            if mutex.acquire():
                mutex.release()
            quit(0)
    name = mymode.findall(url)
    filename = 'manhua' + name[0]
    pictofile(url, os.path.join(loadpicdir, str(num) + '-' + filename))
    mutex.acquire()
    currentthreadnum = currentthreadnum - 1
    mutex.release()
def downloadchapter(url, loadpicdir, num, begin=0):
    """Fetch one chapter page, unpack its image base URL and spawn download threads."""
    global manhuaweb, threadcount, currentthreadnum, mutex
    print(manhuaweb + url)
    webdata = urllib.request.urlopen(manhuaweb + url).read()
    webdata = webdata.decode('UTF-8')
    # this pattern was truncated in the original post; since exactly 7 characters
    # are stripped right after, it most likely captured the <title> tag
    chaptername = re.findall(r'<title>[^<]*', webdata)[0]
    chaptername = chaptername[7:len(chaptername)]
    # this pattern was also garbled; grab the packed eval(...) script line
    webscrip = re.findall(r'eval.*', webdata)
    chapterurl = meispower(webscrip[0])
    chapterurl = 'http://mhimg.ali213.net' + chapterurl
    i = begin
    while i < num:
        try:
            while currentthreadnum >= threadcount:   # wait for a free thread slot
                time.sleep(0.5)
            mutex.acquire()
            currentthreadnum = currentthreadnum + 1
            mutex.release()
            threading.Thread(target=downloadpic,
                             args=(r'%s%d.jpg' % (chapterurl, i),
                                   loadpicdir + chaptername, num)).start()
            i = i + 1
        except socket.error:
            mutex.acquire()
            currentthreadnum = currentthreadnum - 1   # retry the same picture
            mutex.release()
        except Exception as error:
            print(error, 'break')
            print('downloading picture %d of this chapter failed' % i)
            break
if __name__ == '__main__':
    manhuaweb = r'http://manhua.ali213.net'
    socket.setdefaulttimeout(60.0)
    mutex = threading.Lock()
    mutex2 = threading.Lock()
    webfile = urllib.request.urlopen(weburl)
    webdata = webfile.read()
    webdata = webdata.decode('UTF-8')
    # the chapter-list pattern was lost from the original post; this is a
    # reconstructed placeholder for the block holding the chapter links
    meshmode = re.compile(r'<div class="detail_body[\s\S]*?</div>')
    meshdata = meshmode.findall(webdata)[0]
    indexmode = re.compile(r'\([0-9]*页\)')           # page counts such as "(12页)"
    indexdata = indexmode.findall(meshdata)
    picurlmode = re.compile(r'/comic/[0-9/]*\.html')  # per-chapter links
    picurldata = picurlmode.findall(meshdata)
    chapterlength = len(picurldata)
    nummode = re.compile(r'[\d]+')
    i = chapterbegin
    while i < chapterlength:
        # chapters are listed newest first, so walk the lists backwards
        manhuachapter = picurldata[chapterlength - i - 1]
        downloadchapter(manhuachapter, folder,
                        int(nummode.findall(indexdata[chapterlength - i - 1])[0]))
        i = i + 1
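
To run it, pass the comic's index page and a target directory, plus optionally the starting chapter and the thread count. A sample invocation (the URL here is a made-up placeholder, not a real comic):

python3 downloadmanhua.py http://manhua.ali213.net/comic/1234/ ./manhua 0 6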
