The current idea is as follows: 1. Get the URLs of all tag pages under each category; 2. Loop over the tag pages, fetching each one to extract its JSON API address; 3. Fetch the product list of the current tag; 4. Keep fetching the products that are loaded page by page on the current tag's page.
But the problem now is that, once step 2 starts, the loop does not wait for steps 2-4 to complete for one tag before moving on to the next one. We tried to use async/await, but could not get the flow control to work. Asking for advice here.
var http = require('http');
var fs = require("fs");
var superagent = require('superagent');
// Filled by getGoodsUrl: one row per category group, each row a list of
// [title, link] pairs that loopUrls later walks.
var urls = [];
// Page number that getApiUrl passes to getGoodsJson as the first page to request.
var pageIndex = 1;
// Accumulates one text line per product (name, price, stock) across all tags;
// getGoodsJson appends to it and hands it to saveTxt.
var xlsxData = '';
// Kick off the crawl by requesting the category navigation list.
getGoodsUrl(urls);
/**
 * Requests the category navigation list and collects every tag link.
 *
 * For each entry of the response's `data` array one row is appended to
 * `urls`; the row holds a `[title + '--' + text, link]` pair for every item
 * of that entry's `content.category`. Once all rows are collected the list
 * is handed to loopUrls.
 *
 * A failed request, an unparsable body, or a missing/empty `data` array is
 * logged and ends the crawl instead of throwing inside the callback.
 *
 * @param {Array} urls - Output list that receives one row per category group.
 */
function getGoodsUrl(urls) {
  superagent
    .post('http://bravetime.davdian.com/index.php?c=Index&a=getCatNavList')
    .type('text/html; charset=utf-8')
    .set('Accept', 'application/json, text/javascript, */*; q=0.01')
    .end((err, res) => {
      if (err) {
        console.log('分类数据请求失败');
        return;
      }
      console.log('分类数据请求成功');

      let groups;
      try {
        // The property access stays inside the try so a body that parses to
        // null is handled like any other malformed response.
        groups = JSON.parse(res.text).data;
      } catch (parseErr) {
        console.log('分类数据解析失败', parseErr.message);
        return;
      }

      if (!Array.isArray(groups) || groups.length === 0) {
        console.log('分类数据为空');
        return;
      }

      groups.forEach((group) => {
        const categories = group && group.content ? group.content.category : undefined;
        const row = [];
        // A group without a category list still contributes an (empty) row.
        if (Array.isArray(categories)) {
          categories.forEach((tag) => {
            row.push([group.title + '--' + tag.text, tag.link]);
          });
        }
        urls.push(row);
      });
      loopUrls(urls);
    });
}
/**
 * Walks every row of the collected tag list and starts a crawl per tag.
 *
 * Each entry is a two-element list: index 0 is the tag title and index 1 is
 * the tag page link. getApiUrl is invoked for every entry in order; the calls
 * are fired back to back without waiting for one another.
 *
 * @param {Array} urls - List of rows, each row a list of [title, link] pairs.
 */
function loopUrls(urls) {
  for (const row of urls) {
    for (const entry of row) {
      getApiUrl(entry[0], entry[1]);
    }
  }
}
/**
 * Downloads a tag page and starts crawling the goods API it references.
 *
 * The page HTML is searched for an assignment of the form `goodsUrl = "..."`;
 * the quoted value is passed to getGoodsJson together with the module-level
 * pageIndex. If the page contains no such assignment the tag is logged and
 * skipped instead of throwing inside the response handler.
 *
 * @param {string} title - Tag title, used only in log output.
 * @param {string} url - Tag page path, appended to the site origin.
 */
function getApiUrl(title, url) {
  const realUrl = 'http://bravetime.davdian.com' + url;
  http.get(realUrl, (res) => {
    let html = '';
    // Decode as UTF-8 on the stream so a multi-byte character split across
    // two chunks is not corrupted by per-chunk Buffer-to-string conversion.
    res.setEncoding('utf8');
    res.on('data', (chunk) => {
      html += chunk;
    });
    res.on('end', () => {
      console.log('正在获取' + title + '页面数据');
      // [^"]+ stops at the first closing quote; a greedy .+ would swallow
      // everything up to the last quote on the same line.
      const match = /goodsUrl = "([^"]+)"/.exec(html);
      if (!match) {
        console.log('未找到' + title + '页面的goodsUrl');
        return;
      }
      getGoodsJson(match[1], pageIndex);
    });
  }).on('error', (err) => {
    console.log('获取html出错!!', err && err.message);
  });
}
/**
 * Fetches one page of a tag's goods list and keeps paging until it is empty.
 *
 * Every product of a non-empty page is appended to the module-level xlsxData
 * as a "name price stock" line, and the next page is requested 200 ms later.
 * When a page comes back with no products (or without a `data` array at all)
 * the tag is considered finished and everything collected so far is handed
 * to saveTxt.
 *
 * A failed request or an unparsable body is logged and stops paging for this
 * tag instead of throwing inside the callback.
 *
 * @param {string} url - API path (including its query prefix) taken from the tag page.
 * @param {number|string} pageIndex - Page number to request.
 */
function getGoodsJson(url, pageIndex) {
  superagent
    .post('http://bravetime.davdian.com/' + url + 'page_size=10&rp=catergory_search&rl=list')
    .send({ page: pageIndex })
    .type('application/x-www-form-urlencoded; charset=UTF-8')
    .set('Accept', 'application/json, text/javascript, */*; q=0.01')
    .end((err, res) => {
      if (err) {
        console.log('第' + pageIndex + '页请求失败');
        return;
      }
      console.log('第' + pageIndex + '页请求成功');

      let goods;
      try {
        // The property access stays inside the try so a body that parses to
        // null is handled like any other malformed response.
        goods = JSON.parse(res.text).data;
      } catch (parseErr) {
        console.log('第' + pageIndex + '页数据解析失败', parseErr.message);
        return;
      }

      if (!Array.isArray(goods) || goods.length === 0) {
        console.log('数据已加载完毕');
        saveTxt(xlsxData);
        return;
      }

      goods.forEach((item) => {
        xlsxData = xlsxData + item.goods_name + ' ' + item.shop_price + ' ' + item.goods_number + '\r\n';
      });

      // Explicit radix; the parameter itself is left untouched.
      const nextPage = parseInt(pageIndex, 10) + 1;
      setTimeout(() => {
        getGoodsJson(url, nextPage);
      }, 200);
    });
}
/**
 * Writes the given text to create.txt, replacing any previous content.
 *
 * The write is asynchronous: on success a confirmation is logged, on failure
 * the error is rethrown from the write callback.
 *
 * @param {string} data - Text to store in the file.
 */
function saveTxt(data) {
  const onWritten = (err) => {
    if (err) {
      throw err;
    }
    console.log("File Saved !"); // the file has been saved
  };
  fs.writeFile("create.txt", data, onWritten);
}
/**
 * Builds a single-sheet workbook named "mySheetName" from the given rows and
 * writes it synchronously to myFile.xlsx, then logs a confirmation.
 *
 * NOTE(review): `xlsx` is not among the modules required at the top of the
 * visible file, so calling this as-is would fail unless it is defined
 * elsewhere — confirm before re-enabling the commented-out call sites.
 *
 * @param {*} data - Sheet content passed through unchanged to xlsx.build.
 */
function saveInfo(data) {
  const sheets = [{ name: "mySheetName", data: data }];
  const workbook = xlsx.build(sheets);
  fs.writeFileSync("myFile.xlsx", workbook, 'binary');
  console.log('excel保存成功');
}
The following is the result diagram and code execution sequence:
generator
async
promise
Your entire process is asynchronous, and nothing in it enforces any ordering between the steps. I think you may not understand what asynchronous means.
Async/await is based on Promise, and Superagent itself supports Promise. You can use async/await directly.
http://visionmedia.github.io/...
http://www.ruanyifeng.com/blo...
Then all you need to do is replace
http.get()
with superagent.get()
. Generally, people don’t have the patience to read other people’s business logic.
As mentioned above, async/await is based on Promise. If the API of the third-party library you call does not return a Promise object and you still want to use async/await, you have to wrap each step in a new Promise yourself, which is quite tedious to write. Of course, it is very convenient when the library already returns a Promise object.
The following is written using node core module event without promises, for your reference:
You can use Node 8’s
util.promisify
, or Bluebird, etc., to change a Node callback-style function into a Promise-style function, and then you can use async/await
to write the code. The code itself still makes asynchronous calls; it only looks synchronous in the way it is written. Therefore, you still need to pay attention to the flow structure when writing it, especially when writing loops. The code is too long, so I wrote a small example to illustrate.