Crawling data with urllib2 and BeautifulSoup in Python and saving it in MongoDB
Beautiful Soup is a Python library for parsing HTML and XML. It builds a parse tree from a document, handles irregular markup gracefully, and provides simple, consistent operations for navigating, searching, and modifying that tree.
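For readers new to the library, here is a minimal sketch of how Beautiful Soup is typically used; the HTML fragment and tag names below are invented purely for illustration:

# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup

# A made-up HTML fragment, just to show navigation and search
html_doc = '<div class="item"><h2><a href="/news/123.html">Title</a></h2></div>'
soup = BeautifulSoup(html_doc)

item = soup.find("div", class_="item")   # search the parse tree
print item.h2.a['href']                  # navigate down tags: /news/123.html
print item.h2.get_text()                 # extract text: Title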
As shown in the figure, the urllib2 and bs4 modules are used to crawl the HTML page data: the title, content, stock names, stock IDs, release time, and number of onlookers.
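Concretely, each crawled item ends up as one MongoDB document roughly shaped like the following sketch (every field value here is invented for illustration):

{
    '_id': 'a20140816',                     # html page name
    'title': u'Some article title',
    'content': u'The article description text',
    'stock_name': [u'stock A', u'stock B'], # arrays: Mongo supports array insertion
    'stock_id': ['600000', '600001'],
    'update_time': datetime.datetime(2014, 8, 16, 2, 30),  # stored as UTC
    'onlooker': 42
}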
Example:
The code is as follows:
# -*- coding: utf-8 -*-
import time
from bs4 import BeautifulSoup
import urllib2
import pymongo
import re
import datetime

def update():
    datas = {}
    connection = pymongo.Connection('192.168.1.2', 27017)
    # Connect to MongoDB
    db = connection.test_hq
    # Create or connect to the test_hq database
    for i in soup.find_all("div", class_="item"):
        datas['_id'] = str(i.h2.a['href']).split('/')[-1].split('.')[0]
        # Use the html page name as the id number
        datas['title'] = i.h2.get_text()
        # Get the title
        url2 = i.h2.a['href']
        # Get the url of the article body
        html2 = urllib2.urlopen(url2)
        html_doc2 = html2.read()
        soup2 = BeautifulSoup(html_doc2)
        datas['content'] = soup2.find(attrs={"name": "description"})['content']
        # Get the article content from the page's meta description tag
        stock_name = []
        stock_id = []
        for name in re.findall(u"[\u4e00-\u9fa5]+", i.find(class_="stocks").get_text()):
            stock_name.append(name)
        # Get the names of the affected stocks; the matching stock ID numbers are
        # kept in a parallel array, since Mongo supports inserting arrays
        datas['stock_name'] = stock_name
        for id in re.findall(r"\d+", i.find(class_="stocks").get_text()):
            stock_id.append(id)
        # Get the ids of the affected stocks
        datas['stock_id'] = stock_id
        datas['update_time'] = datetime.datetime.strptime(
            re.search(r"\w+.*\w+", i.find(class_="fl date").span.get_text()).group(),
            '%Y-%m-%d %H:%M') - datetime.timedelta(hours=8)
        # Parse the release time and shift from local time (UTC+8) to UTC,
        # which is how Mongo stores dates
        datas['onlooker'] = int(re.search(r"\d+", i.find(class_="icons ic-wg").get_text()).group())
        # Get the number of onlookers
        db.test.save(datas)
        # Insert into the database

def get_data():
    title = str(soup.h2.a['href']).split('/')[-1].split('.')[0]
    # Get the html page name, used to decide whether there is an update
    with open('update.txt', 'r') as f:
        last_title = f.readline()
    if title == last_title:
        print 'currently no update', title
    else:
        with open('update.txt', 'w') as f:
            f.write(title)
        update()

if __name__ == '__main__':
    while True:
        url = 'http://www.ipython.me/qingbao/'
        html = urllib2.urlopen(url)
        html_doc = html.read()
        soup = BeautifulSoup(html_doc)
        get_data()
        time.sleep(30)
        # Refresh every 30 seconds
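Once the script has run for a while, the saved documents can be checked back from pymongo. Here is a quick sketch; the connection parameters match the script above, and pymongo.Connection is the old-style (pre-3.0) API that this code targets, replaced by MongoClient in newer versions:

import pymongo

connection = pymongo.Connection('192.168.1.2', 27017)
db = connection.test_hq

# Count the crawled articles and print the newest few titles
print db.test.count()
for doc in db.test.find().sort('update_time', pymongo.DESCENDING).limit(5):
    print doc['_id'], doc['title']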