node.js - Use node as a crawler to simulate the interface to obtain data in a loop. How can async/await achieve synchronous operations and control the process?
给我你的怀抱
给我你的怀抱 2017-07-06 10:36:31
0
4
1676

The current approach is as follows: 1. Get the URLs of all tag pages under each category. 2. Loop over those URLs and fetch each tag page to extract its JSON API address. 3. Fetch the product list for the current tag. 4. Keep fetching the further pages of products that the current tag's page loads on demand.

The problem is that, from step 2 onward, the loop does not wait for steps 2-4 to finish for one tag before moving on to the next one. I tried to use async/await, but could not get the flow control to work. Any advice would be appreciated.

var http = require('http');
var fs = require("fs");
var superagent = require('superagent');
// Category rows collected by getGoodsUrl(); each row is a list of [title, link] pairs.
var urls = [];
// Page number handed to getGoodsJson() for the first request of every tag page.
var pageIndex = 1;
// Text accumulated from every goods item fetched so far; this is what saveTxt() receives.
var xlsxData = '';

// Start the crawl: fetch the category list, which in turn calls loopUrls().
getGoodsUrl(urls);

/**
 * Fetch the category navigation list and collect every tag page link.
 *
 * One row is appended to `urls` per category in the response. A row is an
 * array of `[label, link]` pairs, where label is "<category title>--<tag text>".
 * When at least one category was returned, `loopUrls(urls)` is called.
 * Request failures and unparsable responses are logged and end the crawl
 * instead of throwing from inside the callback.
 *
 * @param {Array} urls - Array that receives the collected rows (mutated in place).
 */
function getGoodsUrl(urls){
    superagent
        .post('http://bravetime.davdian.com/index.php?c=Index&a=getCatNavList')
        .type('text/html; charset=utf-8')
        .set('Accept','application/json, text/javascript, */*; q=0.01')
        .end(function(err, res) {
            if (err) {
                console.log('分类数据请求失败');
                // Keep the underlying error visible; the message alone hides the cause.
                console.error(err);
                return;
            }
            console.log('分类数据请求成功');

            let resData;
            try {
                resData = JSON.parse(res.text);
            } catch (parseErr) {
                // The body was not JSON (e.g. an HTML error page); an exception thrown
                // here could not be caught by the caller, so report it and stop.
                console.log('分类数据解析失败');
                console.error(parseErr);
                return;
            }

            const categories = resData && Array.isArray(resData.data) ? resData.data : [];
            if (categories.length === 0) {
                console.log('分类数据为空');
                return;
            }

            categories.forEach(function(item){
                // A category without a tag list still contributes an (empty) row.
                const tags = item.content && Array.isArray(item.content.category)
                    ? item.content.category
                    : [];
                const row = tags.map(function(tag){
                    return [item.title + '--' + tag.text, tag.link];
                });
                urls.push(row);
            });
            loopUrls(urls);
        })
}

/**
 * Walk every collected category row and request the page behind each tag.
 *
 * getApiUrl() is invoked for every pair back to back; nothing here waits for
 * one call to finish before the next one is issued.
 *
 * @param {Array} urls - Rows of [title, link] pairs.
 */
function loopUrls(urls){
    for (const row of urls) {
        for (const entry of row) {
            // entry[0] is the tag title, entry[1] the tag page link.
            getApiUrl(entry[0], entry[1]);
        }
    }
}

/**
 * Download a tag page, extract the goods API address embedded in it and start
 * fetching that tag's goods list via getGoodsJson().
 *
 * The page is expected to contain a fragment of the form `goodsUrl = "<address>"`.
 * If it does not, the problem is logged and nothing further is requested.
 *
 * @param {string} title - Label of the tag, used only for log output.
 * @param {string} url - Path of the tag page, appended to the site origin.
 */
function getApiUrl(title,url){
    const realUrl = 'http://bravetime.davdian.com' + url;
    http.get(realUrl,function(res){
        let html = '';
        // Decode as UTF-8 up front so a multi-byte character split across two
        // chunks is not corrupted by per-chunk string conversion.
        res.setEncoding('utf8');
        res.on('data',function(chunk){
            html += chunk;
        });
        res.on('end',function(){
            console.log('正在获取' + title + '页面数据');
            // Stop at the first closing quote so later quoted text on the same
            // line is not swallowed into the address.
            const match = /goodsUrl = "([^"]+)"/.exec(html);
            if (!match) {
                // exec() yields null when the fragment is absent; indexing it would throw.
                console.log(title + '页面未找到goodsUrl');
                return;
            }
            getGoodsJson(match[1],pageIndex);
        });
    }).on('error',function(err){
        console.log('获取html出错!!');
        console.error(err);
    });
}

/**
 * Fetch one page of a tag's goods list, append each item to the shared
 * `xlsxData` text and schedule the next page 200 ms later.
 *
 * Paging stops at the first page that carries no items; at that point the
 * accumulated text is handed to saveTxt(). Request failures and unparsable
 * responses are logged and end the paging for this tag.
 *
 * @param {string} url - API address taken from the tag page. The query string
 *     is appended directly, so this presumably already ends with '?' or '&' —
 *     TODO confirm against the real pages.
 * @param {number|string} pageIndex - Page number to request.
 */
function getGoodsJson(url,pageIndex){
    superagent
        .post('http://bravetime.davdian.com/' + url + 'page_size=10&rp=catergory_search&rl=list')
        .send({page:pageIndex})
        .type('application/x-www-form-urlencoded; charset=UTF-8')
        .set('Accept','application/json, text/javascript, */*; q=0.01')
        .end(function(err, res) {
            if (err) {
                console.log('第' + pageIndex + '页请求失败');
                console.error(err);
                return;
            }
            console.log('第' + pageIndex + '页请求成功');

            let goods;
            try {
                goods = JSON.parse(res.text).data;
            } catch (parseErr) {
                // An exception thrown inside this callback cannot be caught by the
                // caller, so report the bad payload and stop paging.
                console.log('第' + pageIndex + '页数据解析失败');
                console.error(parseErr);
                return;
            }

            if (!Array.isArray(goods) || goods.length === 0) {
                console.log('数据已加载完毕');
                saveTxt(xlsxData);
                return;
            }

            goods.forEach(function(item){
                xlsxData += item.goods_name + '  ' + item.shop_price + '  ' + item.goods_number + '\r\n';
            });

            // Explicit radix: pageIndex may arrive as a string.
            const nextPage = parseInt(pageIndex, 10) + 1;
            setTimeout(function(){
                getGoodsJson(url,nextPage);
            },200);
        })

}

/**
 * Write the given text to "create.txt", replacing any previous content.
 *
 * @param {string} data - Text to store.
 */
function saveTxt(data){
    // Completion handler: a write error is rethrown, success is logged.
    const onWritten = (err) => {
        if (err) {
            throw err;
        }
        console.log("File Saved !");
    };

    fs.writeFile("create.txt", data, onWritten);
}
/**
 * Build a single-sheet workbook from `data` and write it to "myFile.xlsx".
 *
 * NOTE(review): `xlsx` is not declared anywhere in the visible file, so this
 * would fail with a ReferenceError if it were called — confirm which package
 * was meant to be required.
 *
 * @param {*} data - Sheet content, passed through unchanged to xlsx.build().
 */
function saveInfo(data){
    const sheets = [{name: "mySheetName", data: data}];
    const workbook = xlsx.build(sheets);
    fs.writeFileSync("myFile.xlsx", workbook, 'binary');
    console.log('excel保存成功');
}

The following is the result diagram and code execution sequence:

给我你的怀抱
给我你的怀抱

reply all(4)
Peter_Zhu

generator
async
promise

仅有的幸福

Your entire process is asynchronous, and you don’t see any meaning of synchronization. I think you may not understand what asynchronous is.

Async/await is based on Promise, and Superagent itself supports Promise. You can use async/await directly.

/**
 * Request `url` with superagent and print the response headers and body.
 *
 * The function needs a name: a bare `async function() {}` statement is a
 * syntax error. Failures are caught and logged, so the returned promise
 * always fulfils.
 *
 * @param {string} url - Address to request.
 * @returns {Promise<void>}
 */
async function fetchAndLog(url) {
  try {
    const result = await superagent.get(url);
    console.log(result.headers);
    console.log(result.body);
  } catch (error) {
    console.error(error);
  }
}

http://visionmedia.github.io/...

http://www.ruanyifeng.com/blo...

Then all you need is to replace http.get() with superagent.get().

習慣沉默

Generally, people don’t have the patience to read other people’s business logic.

As mentioned above, async/await is built on Promises. If the third-party library you call does not return a Promise, then to use async/await you have to wrap every step in a new Promise yourself, which is tedious to write. If the library already returns Promises, it is very convenient.

The following is written with Node's core `events` module and no Promises, for your reference:

// Skeleton of an event-driven pipeline: each step is a listener that emits the
// event of the next step when it is done. result1 and result2 are placeholders
// for whatever each step computes; they are not defined in this snippet.
const EventEmitter = require('events');
class MyEmitter extends EventEmitter {}
const myEmitter = new MyEmitter();

myEmitter.on('step1', (m) => {
    //Step 1
    //Run the business logic to obtain result1
        
    //Decide whether to trigger the next step; pass this step's result along if needed
    myEmitter.emit('step2', result1);
   
});
myEmitter.on('step2', (result1) => {
    //Step 2
    //Run the business logic to obtain result2
        
    //Decide whether to trigger the next step; pass this step's result along if needed
    myEmitter.emit('step3', result2);
});
myEmitter.on('step3', (result2) => {
    //and so on
});
// Start the chain by emitting the first event with the initial input.
myEmitter.emit('step1', urls);
刘奇

You can use Node 8's util.promisify, or a library such as Bluebird, to turn Node callback-style functions into Promise-returning functions; then you can write the code with async/await.

The code itself is still an asynchronous call, but the writing method looks synchronous. Therefore, you still need to pay attention to the process structure when writing, especially when writing loops. The code is too long, so I wrote a small example to illustrate

/**
 * Stand-in for any asynchronous operation; being declared `async`, it
 * returns a promise.
 */
async function remoteCall() {
    // do something
}

// Declared with const: a bare `list = []` creates an implicit global and is a
// ReferenceError in strict mode and in ES modules.
const list = [];  // assume this holds many items


/**
 * Sequential variant: one remoteCall() per element of `list`, each awaited
 * before the next one starts; doAfter() runs once all of them have finished.
 */
async function process() {
    let index = 0;
    while (index < list.length) {
        // The next iteration cannot begin until this call has settled.
        await remoteCall();
        index += 1;
    }

    doAfter();
}

async function process2() {
    // With this form there is nothing to await: the promise returned by each remoteCall() is dropped
    list.forEach(function(t) {
        remoteCall();
    });
}

async function process3() {
    // With this form doAfter runs right away: forEach does not wait for the async callbacks
    list.forEach(async function(t) {
        await remoteCall();
    });

    // It may run before the remoteCall() calls have completed
    doAfter();
}

/**
 * Parallel variant: every remoteCall() is started up front and doAfter() runs
 * only after all of them have fulfilled. If any returned promise rejects, the
 * await throws and doAfter() is not reached.
 */
async function process4() {
    await Promise.all(list.map(() => remoteCall()));
    doAfter();
}
Latest Downloads
More>
Web Effects
Website Source Code
Website Materials
Front End Template