python多线程抓取*子内容示例
使用re, urllib, threading 多线程抓取天涯帖子内容,设置url为需抓取的天涯帖子的第一页,设置file_name为下载后的文件名
代码如下:
#coding:utf-8
import urllib
import re
import threading
import os, time
class Down_Tianya(threading.Thread):
"""多线程下载"""
def __init__(self, url, num, dt):
threading.Thread.__init__(self)
self.url = url
self.num = num
self.txt_dict = dt
def run(self):
print 'downling from %s' % self.url
self.down_text()
def down_text(self):
"""根据传入的url抓出各页内容,按页数做键存入字典"""
html_content =urllib.urlopen(self.url).read()
text_pattern = re.compile('时间:(.*?).*?.*?
page_pattern = re.compile(r'(\d*)\s*下页')
page_result = page_pattern.search(html_page)
if page_result:
page_num = int(page_result.group(1))
return page_num
def write_text(dict, fn):
"""把字典内容按键(页数)写入文本,每个键值为每页内容的list列表"""
tx_file = open(fn, 'w+')
pn = len(dict)
for i in range(1, pn+1):
tx_list = dict[i]
for tx in tx_list:
tx = tx.replace('
', '\r\n').replace('
', '\r\n').replace(' ', '')
tx_file.write(tx.strip()+'\r\n'*4)
tx_file.close()
def main():
url = 'http://bbs.tianya.cn/post-16-996521-1.shtml'
file_name ='abc.txt'
my_page = page(url)
my_dict = {}
print 'page num is : %s' % my_page
threads = []
"""根据页数构造urls进行多线程下载"""
for num in range(1, my_page+1):
myurl = '%s%s.shtml' % (url[:-7], num)
downlist = Down_Tianya(myurl, num, my_dict)
downlist.start()
threads.append(downlist)
"""检查下载完成后再进行写入"""
for t in threads:
t.join()
write_text(my_dict, file_name)
print 'All download finished. Save file at directory: %s' % os.getcwd()
if __name__ == '__main__':
main()
down_tianya.py
代码如下:
#coding:utf-8
import urllib
import re
import threading
import os
class Down_Tianya(threading.Thread):
"""多线程下载"""
def __init__(self, url, num, dt):
threading.Thread.__init__(self)
self.url = url
self.num = num
self.txt_dict = dt
def run(self):
print 'downling from %s' % self.url
self.down_text()
def down_text(self):
"""根据传入的url抓出各页内容,按页数做键存入字典"""
html_content =urllib.urlopen(self.url).read()
text_pattern = re.compile('
page_pattern = re.compile(r'(\d*)\s*下页')
page_result = page_pattern.search(html_page)
if page_result:
page_num = int(page_result.group(1))
return page_num
def write_text(dict, fn):
"""把字典内容按键(页数)写入文本,每个键值为每页内容的list列表"""
tx_file = open(fn, 'w+')
pn = len(dict)
for i in range(1, pn+1):
tx_list = dict[i]
for tx in tx_list:
tx = tx.replace('
', '\r\n').replace('
', '\r\n').replace(' ', '')
tx_file.write(tx.strip()+'\r\n'*4)
tx_file.close()
def main():
url = 'http://bbs.tianya.cn/post-16-996521-1.shtml'
file_name ='abc.txt'
my_page = page(url)
my_dict = {}
print 'page num is : %s' % my_page
threads = []
"""根据页数构造urls进行多线程下载"""
for num in range(1, my_page+1):
myurl = '%s%s.shtml' % (url[:-7], num)
downlist = Down_Tianya(myurl, num, my_dict)
downlist.start()
threads.append(downlist)
"""检查下载完成后再进行写入"""
for t in threads:
t.join()
write_text(my_dict, file_name)
print 'All download finished. Save file at directory: %s' % os.getcwd()
if __name__ == '__main__':
main()

Hot AI Tools

Undresser.AI Undress
AI-powered app for creating realistic nude photos

AI Clothes Remover
Online AI tool for removing clothes from photos.

Undress AI Tool
Undress images for free

Clothoff.io
AI clothes remover

Video Face Swap
Swap faces in any video effortlessly with our completely free AI face swap tool!

Hot Article

Hot Tools

Notepad++7.3.1
Easy-to-use and free code editor

SublimeText3 Chinese version
Chinese version, very easy to use

Zend Studio 13.0.1
Powerful PHP integrated development environment

Dreamweaver CS6
Visual web development tools

SublimeText3 Mac version
God-level code editing software (SublimeText3)

Hot Topics



Solution to permission issues when viewing Python version in Linux terminal When you try to view Python version in Linux terminal, enter python...

How to teach computer novice programming basics within 10 hours? If you only have 10 hours to teach computer novice some programming knowledge, what would you choose to teach...

When using Python's pandas library, how to copy whole columns between two DataFrames with different structures is a common problem. Suppose we have two Dats...

How to avoid being detected when using FiddlerEverywhere for man-in-the-middle readings When you use FiddlerEverywhere...

How does Uvicorn continuously listen for HTTP requests? Uvicorn is a lightweight web server based on ASGI. One of its core functions is to listen for HTTP requests and proceed...

In Python, how to dynamically create an object through a string and call its methods? This is a common programming requirement, especially if it needs to be configured or run...

Using python in Linux terminal...

Understanding the anti-crawling strategy of Investing.com Many people often try to crawl news data from Investing.com (https://cn.investing.com/news/latest-news)...
