1. Foreword
This is billed as a first look at web crawlers, but it does not actually use any crawler-specific third-party library. It relies mainly on the Node.js built-in http module and the HTML parsing tool cheerio: http fetches the page resource at the given URL directly, and cheerio then parses it. I typed out the main example I studied myself in order to deepen my understanding. While coding, my first attempt was to traverse the object returned by the jQuery-style selector with forEach, which immediately threw an error. The reason is that this object has no forEach method; forEach can only be called on JavaScript arrays, so the selection has to be traversed with each instead.
2. Knowledge points
①: superagent is a library for fetching web pages over HTTP. I haven't used it yet.
②: cheerio is an HTML parsing tool. You can think of it as jQuery for the server side, because the syntax is the same.
Rendering
1. Capture the entire webpage
2. The parsed data. The examples shown here are the output of the example implementation.
Source code of the introductory crawler
var http = require('http');
var cheerio = require('cheerio');

// Course page whose chapter/video outline is scraped.
var url = 'http://www.imooc.com/learn/348';

/**
 * Print the course outline to stdout: each chapter title, followed by one
 * line per video in the form "【id】title".
 *
 * Expected shape of courseData:
 *   [{ chapterTitle: '', videos: [{ title: '', id: '' }] }]
 *
 * @param {Array} courseData - Chapters as produced by filterChapter.
 */
function printCourseInfo(courseData) {
  courseData.forEach(function (item) {
    console.log(item.chapterTitle + '\n');
    item.videos.forEach(function (video) {
      console.log(' 【' + video.id + '】' + video.title + '\n');
    });
  });
}

/**
 * Extract the chapter/video structure from the fetched page markup.
 *
 * Every `.chapter` element becomes one entry; its title is the text of the
 * nested `strong` element(s), and its videos are the `.studyvideo` links found
 * in the `li` children of `.video`.
 *
 * @param {string} html - Raw HTML of the course page.
 * @returns {Array} Array of { chapterTitle, videos: [{ title, id }] }.
 */
function filterChapter(html) {
  var courseData = [];
  var $ = cheerio.load(html);

  // A cheerio selection is not a JS array, so it is walked with .each
  // (which binds `this` to the current element) rather than forEach.
  $('.chapter').each(function () {
    var chapter = $(this);
    var chapterData = {
      chapterTitle: chapter.find('strong').text(), // chapter title
      videos: []
    };

    chapter.find('.video').children('li').each(function () {
      var video = $(this).find('.studyvideo');
      // attr() yields undefined when the <li> has no .studyvideo link;
      // fall back to '' so a single odd entry cannot crash the whole run.
      var href = video.attr('href') || '';
      // The id is whatever follows '/video' in the link, kept verbatim.
      var id = href.split('/video')[1] || '';

      chapterData.videos.push({
        title: video.text(),
        id: id
      });
    });

    courseData.push(chapterData);
  });

  return courseData;
}

// Fetch the page, accumulate the body, then parse and print the outline.
http.get(url, function (res) {
  if (res.statusCode !== 200) {
    // Redirects and error pages do not contain the course markup.
    console.log('获取课程数据出错: HTTP ' + res.statusCode);
    res.resume(); // drain the unread body
    return;
  }

  // Decode as UTF-8 at the stream level so a multi-byte character split
  // across two chunks is not corrupted by per-chunk string conversion.
  res.setEncoding('utf8');

  var html = '';
  res.on('data', function (chunk) {
    html += chunk;
  });
  res.on('end', function () {
    printCourseInfo(filterChapter(html));
  });
}).on('error', function (err) {
  console.log('获取课程数据出错: ' + err.message);
});
Reference:
https://github.com/alsotang/node-lessons/tree/master/lesson3