When Node.js crawls web pages that are not UTF-8 encoded (common for Chinese sites), the text comes out garbled. For example, NetEase's homepage is encoded in gb2312, so crawling it with the following code prints garbled characters:
// Fetch the NetEase homepage without passing any options and print the body
// as-is. The page is gb2312-encoded, so the printed text is garbled; this
// snippet only demonstrates the problem.
var request = require('request')
var url = 'http://www.163.com'
request(url, function (err, res, body) {
  // On a failed request there is no page to print, so report the error instead
  if (err) {
    console.error(err)
    return
  }
  console.log(body)
})
You can use iconv-lite to solve this problem.
Installation
npm install iconv-lite
At the same time, let's set a browser-like User-Agent so that the website does not block our requests:
var originRequest = require('request')
var iconv = require('iconv-lite')
// Browser-like User-Agent sent with every request so that the website is
// less likely to reject the crawler
var headers = {
  'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.65 Safari/537.36'
}
/**
 * Thin wrapper around originRequest that applies the shared headers and
 * asks for the undecoded response body.
 *
 * @param {string} url - address of the page to fetch
 * @param {Function} callback - passed straight through to originRequest
 */
function request (url, callback) {
  originRequest({
    url: url,
    // Per request's documentation, a null encoding returns the body as a
    // Buffer, which lets the caller decode it with iconv-lite
    encoding: null,
    headers: headers
  }, callback)
}
// Page to crawl; its markup is served as gb2312
var url = 'http://www.163.com'
request(url, function (err, res, body) {
  // Without a successful response there is nothing to decode
  if (err) {
    console.error(err)
    return
  }
  // The wrapper requests the body undecoded, so convert it from gb2312 here
  var html = iconv.decode(body, 'gb2312')
  console.log(html)
})
With that, the garbled-text problem is solved.
Use cheerio to parse HTML
cheerio can be simply and crudely understood as a server-side jQuery selector. Extracting content with it is much more intuitive than using regular expressions
Installation
npm install cheerio
var cheerio = require('cheerio')
request(url, function (err, res, body) {
  // Without a successful response there is nothing to decode or parse
  if (err) {
    console.error(err)
    return
  }
  var html = iconv.decode(body, 'gb2312')
  // Load the decoded markup so it can be queried with jQuery-style selectors
  var $ = cheerio.load(html)
  console.log($('h1').text())
  console.log($('h1').html())
})
The output is as follows
网易
&#x7F51;&#x6613;
Then here comes the problem. The string returned by $('h1').html() is encoded as HTML numeric character references (Unicode code points): the heading text 网易 (NetEase) has become &#x7F51;&#x6613;, which brings some trouble to our character processing
Solve the "garbled" problem of cheerio .html()
According to the cheerio documentation, the conversion of characters to entities can be turned off
// Before: cheerio is loaded without any options
var $ = cheerio.load(html)
Change to
// After: decodeEntities: false turns off the entity conversion, so .html()
// returns the original characters
var $ = cheerio.load(html, {decodeEntities: false})
That’s it. The complete code is as follows:
var originRequest = require('request')
var cheerio = require('cheerio')
var iconv = require('iconv-lite')
// Headers sent with every request; the browser-like User-Agent makes the
// crawler less likely to be rejected by the website
var headers = {}
headers['User-Agent'] = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.65 Safari/537.36'
/**
 * Fetches a page through originRequest using the shared headers and an
 * undecoded response body.
 *
 * @param {string} url - address of the page to fetch
 * @param {Function} callback - handed unchanged to originRequest
 */
function request (url, callback) {
  originRequest({
    url: url,
    // Per request's documentation, a null encoding yields the body as a
    // Buffer, so the gb2312 bytes can be decoded later with iconv-lite
    encoding: null,
    headers: headers
  }, callback)
}
// Page to crawl; its markup is served as gb2312
var url = 'http://www.163.com'
request(url, function (err, res, body) {
  // Without a successful response there is nothing to decode or parse
  if (err) {
    console.error(err)
    return
  }
  // The wrapper requests the body undecoded, so convert it from gb2312 here
  var html = iconv.decode(body, 'gb2312')
  // decodeEntities: false turns off the entity conversion, so .html()
  // returns the original characters
  var $ = cheerio.load(html, {decodeEntities: false})
  console.log($('h1').text())
  console.log($('h1').html())
})