Home > Backend Development > PHP Tutorial > PHP uses curl and regular expressions to crawl web page data example_PHP tutorial

PHP uses curl and regular expressions to crawl web page data example_PHP tutorial

WBOY
Release: 2016-07-13 10:32:49
Original
873 people have browsed it

This example uses curl and regular expressions to build a novel grabber for the non-VIP chapters of the Motie Chinese website. Enter a novel ID and it downloads that novel.
Dependencies: curl
It is worth a brief look: it uses curl, regular expressions, ajax and other techniques, and is suitable for novices. When testing locally, make sure you are connected to the Internet and that PHP's curl extension is enabled.

SpiderTools.class.php

Copy code The code is as follows:

<?php
// Wrapped in a class so that articles can be crawled automatically.
// Optional auto-refresh of the page, left disabled:
#header("Refresh:30;http://www.test.com:8080");

// getinfo.php calls session_start() before including this file, so only
// start a session here when none is active (avoids an "already started" notice).
if (session_id() === '') {
    session_start();
}

/**
 * Downloads non-VIP chapters of a novel from www.motie.com using curl and
 * regular expressions, and writes each chapter to a text file.
 */
class SpiderTools
{
    /**
     * Fetch a URL with curl and return the response body.
     *
     * The request is executed exactly once; the result of that single
     * curl_exec() call is used both as the body and for error detection.
     *
     * @param string $url   Address to fetch.
     * @param bool   $fatal When true, a transfer error terminates the script
     *                      with the curl error text; when false the error is
     *                      echoed and an empty string is returned.
     * @return string Response body, or '' on a non-fatal error.
     */
    private static function fetchPage($url, $fatal = true)
    {
        $ch = curl_init();
        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1); // return the body instead of printing it
        curl_setopt($ch, CURLOPT_HEADER, 0);         // do not include response headers
        curl_setopt($ch, CURLOPT_CONNECTTIMEOUT_MS, 0); // 0 = no explicit connect timeout

        $output = curl_exec($ch);
        if ($output === false) {
            $error = curl_error($ch);
            curl_close($ch);
            if ($fatal) {
                die($error);
            }
            echo 'Curl error: ' . $error;
            return '';
        }

        curl_close($ch);
        return $output;
    }

    /**
     * Resolve the title for a book or chapter page.
     *
     * @param string $aid Numeric book ID, or a "<bookId>_<chapterId>" chapter ID.
     * @return string First captured title, or '' when the page does not match.
     */
    public function getBookNameById($aid)
    {
        $url = 'http://www.motie.com/book/' . $aid;

        if (is_numeric($aid)) {
            // Book page: the name is the link text inside the <h1> heading.
            // NOTE(review): the published listing lost the opening tags of this
            // pattern; it is kept as published - confirm against the live page.
            $ru = "/\s*(.*)\s*<\/a>\s*<\/h1>/";
        } else {
            // Chapter page: the <title> has the form
            // "<book name>_<chapter title> ..._<site name>".
            // The opening <title> tag was missing from the published pattern,
            // which made the capture include any markup preceding the title.
            $ru = "/<title>(.*)<\/title>/";
        }

        $output = self::fetchPage($url, true);

        $arr = array();
        preg_match_all($ru, $output, $arr);

        // Guard against pages that do not match instead of reading an undefined offset.
        return isset($arr[1][0]) ? $arr[1][0] : '';
    }

    /**
     * Extract the text of one chapter.
     *
     * @param string $aid Chapter ID in the form "<bookId>_<chapterId>".
     * @return string Trimmed chapter text, or '' when the ID is malformed or
     *                the page does not match the expected markup.
     */
    public function getBookContextById($aid)
    {
        $ids = explode("_", $aid);
        if (!isset($ids[1])) {
            // Not a "<bookId>_<chapterId>" ID; there is no chapter to extract.
            return '';
        }
        $titleId  = trim($ids[0]);
        $aticleId = trim($ids[1]);

        $ru = "/<div class=\"page-content\">[\s\S]*<pre ondragstart=\"return false\" oncopy=\"return false;\" oncut=\"return false;\" oncontextmenu=\"return false\" class=\"note\" id=\"html_content_\d*\">[\s\S]*(.*)<img src=\"\/ajax\/chapter\/$titleId\/$aticleId\" class=\"hidden\" \/><\/pre>/ui";
        $url = 'http://www.motie.com/book/' . $aid;

        $output = self::fetchPage($url, true);

        $arr = array();
        preg_match_all($ru, $output, $arr);
        if (!isset($arr[0][0])) {
            return '';
        }

        // Work on the whole match and skip its first 180 bytes - presumably the
        // opening <div>/<pre> markup. NOTE(review): confirm this offset against
        // the live page.
        $s = substr($arr[0][0], 180);

        // Everything from the trailing hidden <img> tag onwards is not chapter text.
        $arr2 = explode("<img", $s);
        return trim($arr2[0]);
    }

    /**
     * Write a novel to disk. Can be called statically.
     *
     * With a chapter ID ("<bookId>_<chapterId>") a single chapter is written to
     * "./<book name>/<book name>_<chapter>.txt". With a numeric book ID every
     * chapter returned by getBookIdsById() is written in turn.
     *
     * @param string $id Numeric book ID or "<bookId>_<chapterId>" chapter ID.
     * @return void
     */
    public static function createBookById($id)
    {
        if (!is_numeric($id)) {
            echo "<br/>INIT BEGIN START WRITE!";
            $st = new self();
            $cons = trim($st->getBookContextById($id));
            $title = $st->getBookNameById($id);

            // The first space-separated token of the title is "<book>_<chapter>".
            $t = explode(" ", $title);
            $dir = explode("_", $t[0]);
            $wzdir = $dir[0];                              // book name, used as directory name
            $wzchapter = isset($dir[1]) ? $dir[1] : '';    // chapter name

            // Keep $wzdir in UTF-8: it is reused below to build the file name,
            // and converting it twice would corrupt it.
            $wzdir2 = iconv("UTF-8", "GBK", $wzdir);
            if (!file_exists($wzdir2)) {
                mkdir($wzdir2);
            }

            // Build the file name and convert it to GBK so that it is not garbled on disk.
            $wztitle = "./" . $wzdir . "/" . "$t[0]" . ".txt";
            $wztitle = iconv("UTF-8", "GBK", $wztitle);

            $f = fopen($wztitle, "w+");
            if ($f === false) {
                // fopen() has already emitted a warning; do not report success.
                return;
            }
            fwrite($f, $cons);
            echo "<font color='green'>$wzdir </font>" . $wzchapter . "<font color='red'>Write successfully</font>";
            fclose($f);
        } else {
            $ids = self::getBookIdsById($id);

            // The server may be interrupted mid-run, so the index of the next
            // chapter is kept in the session and the loop resumes from it.
            $key = "$id" . "_fid";
            if (!isset($_SESSION[$key])) {
                $_SESSION[$key] = 0;
            }

            // Strict "<" bound: $ids[count($ids)] does not exist.
            for ($i = $_SESSION[$key]; $i < count($ids); $_SESSION[$key]++, $i++) {
                self::createBookById($id . "_" . $ids[$i]); // build the chapter ID
            }
        }
    }

    /**
     * Get the IDs of all chapters of a novel.
     *
     * @param string $aid Numeric book ID.
     * @return array List of chapter IDs (empty when nothing matched or the
     *               request failed).
     */
    public static function getBookIdsById($aid)
    {
        $url = 'http://www.motie.com/book/' . $aid . "/chapter";

        // The lazy quantifiers (*?) keep each match as short as possible so that
        // every chapter link is captured separately.
        // NOTE(review): the published listing lost its backslashes and contained
        // a stray " ;" after </a>; the escapes are restored here and the stray
        // literal is dropped - confirm against the live chapter list markup.
        $ru = '/[\s\S]*?<li class="" createdate="\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}">[\s\S]*?<a href="\/book\/' . preg_quote($aid, '/') . '_(\d*?)"\s{1}>.*?<\/a>.*?/u';

        // A transfer error is reported but not fatal here.
        $output = self::fetchPage($url, false);

        $arr = array();
        preg_match_all($ru, $output, $arr, PREG_PATTERN_ORDER);
        return $arr[1];
    }
}
?>

getinfo.php

Copy code The code is as follows:

<?php
session_start();
require_once("SpiderTools.class.php");
// Only act when a book ID was actually submitted.
if (!empty($_REQUEST["bid"])) {
    if (is_numeric($_REQUEST["bid"])) {
        SpiderTools::createBookById(trim($_REQUEST["bid"]));
    } else {
        echo "<br/>Please enter the correct article ID<br/>";
    }
}
?>

index.html

Copy code The code is as follows:

<html>
<head><meta charset="utf-8"/></head>
<title>Download the novel

Enter the ID number of the novel you want to see on the Motie Chinese website to download the novel















http://www.bkjia.com/PHPjc/754038.html

www.bkjia.com

http://www.bkjia.com/PHPjc/754038.html
TechArticle: Using curl and regular expressions to make a novel grabber for non-VIP chapters of the Motie Chinese website; it supports entering the novel ID to download the novel. Dependencies: curl. You can take a brief look at it, it uses...
Related labels:
source:php.cn
Statement of this Website
The content of this article is voluntarily contributed by netizens, and the copyright belongs to the original author. This site does not assume corresponding legal responsibility. If you find any content suspected of plagiarism or infringement, please contact admin@php.cn
Popular Tutorials
More>
Latest Downloads
More>
Web Effects
Website Source Code
Website Materials
Front End Template