This article uses curl and regular expressions to build a grabber for the non-VIP chapters of novels on the Motie Chinese website. Enter a novel ID to download that novel.
Dependencies: curl
Take a brief look at the code: it uses curl, regular expressions, ajax and other techniques, and is suitable for beginners. When testing locally, make sure you are connected to the Internet and that PHP's curl extension is enabled.

SpiderTools.class.php

Copy code The code is as follows:

session_start();
// Wrapped in a class so that these articles can be grabbed automatically.
#header("Refresh:30;http://www.test.com:8080");
class SpiderTools{
    /**
     * Download a URL with curl, performing exactly one request.
     *
     * @param string $url Absolute URL to fetch.
     * @return array Two elements: the response body (string, or false on
     *               failure) and the curl error message ('' on success).
     */
    private static function fetchPage($url)
    {
        $ch = curl_init();
        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1); // return the body instead of printing it
        curl_setopt($ch, CURLOPT_HEADER, 0);         // do not include response headers in the body
        curl_setopt($ch, CURLOPT_CONNECTTIMEOUT_MS, 0);

        $body = curl_exec($ch);
        $error = ($body === false) ? curl_error($ch) : '';

        // Release the curl handle on every path.
        curl_close($ch);

        return array($body, $error);
    }

    /**
     * Replace path separators and NUL bytes in a name taken from a remote
     * page so that it cannot point outside the intended directory.
     *
     * @param string $name Raw name component.
     * @return string The name with '/', '\' and NUL replaced by '_'.
     */
    private static function safePathPart($name)
    {
        return str_replace(array('/', '\\', "\0"), '_', (string) $name);
    }

    /**
     * Fetch http://www.motie.com/book/<id> and extract the title from it.
     *
     * A numeric ID selects the pattern that captures the text before
     * "</a></h1>"; any other ID (e.g. "<book>_<chapter>") selects the pattern
     * that captures the text before "</title>".
     * Example of a chapter title:
     * 丧尸爆发之全家求生路_第一章丧尸爆发　为吾友爱乐儿更新~_磨铁
     *
     * NOTE(review): both patterns look as if an opening HTML tag was stripped
     * from them when this code was published; they are kept unchanged here.
     * Verify against the live markup.
     *
     * The script is terminated with the curl error message if the download
     * fails.
     *
     * @param string|int $aid Book ID or "<book>_<chapter>" ID.
     * @return string The first captured title, or '' when nothing matched.
     */
    public function getBookNameById($aid){
        $url = 'http://www.motie.com/book/' . $aid;

        if (is_numeric($aid)) {
            $pattern = "/\s*(.*)\s*<\/a>\s*<\/h1>/";
        } else {
            $pattern = "/(.*)<\/title>/";
        }

        list($html, $error) = self::fetchPage($url);
        if ($html === false) {
            die($error);
        }

        $matches = array();
        preg_match_all($pattern, $html, $matches);

        return isset($matches[1][0]) ? $matches[1][0] : '';
    }

    /**
     * Fetch a chapter page and extract the chapter text.
     *
     * The ID is split on "_" into the book ID and the chapter ID, which are
     * used to recognise the "/ajax/chapter/<book>/<chapter>" image that
     * terminates the text inside the page's <pre class="note"> element.
     *
     * The script is terminated with the curl error message if the download
     * fails.
     *
     * @param string $aid Chapter ID in the form "<book>_<chapter>".
     * @return string The trimmed chapter text, or '' when the ID is malformed
     *                or the page does not match.
     */
    public function getBookContextById($aid){
        $parts = explode("_", $aid);
        if (count($parts) < 2) {
            return '';
        }
        // Quote the IDs so that they are matched literally inside the pattern.
        $titleId = preg_quote(trim($parts[0]), '/');
        $articleId = preg_quote(trim($parts[1]), '/');

        $pattern = "/<div class=\"page-content\">[\s\S]*<pre ondragstart=\"return false\" oncopy=\"return false;\" oncut=\"return false;\" oncontextmenu=\"return false\" class=\"note\" id=\"html_content_\d*\">[\s\S]*(.*)<img src=\"\/ajax\/chapter\/"
            . $titleId . "\/" . $articleId . "\" class=\"hidden\" \/><\/pre>/ui";
        $url = 'http://www.motie.com/book/' . $aid;

        list($html, $error) = self::fetchPage($url);
        if ($html === false) {
            die($error);
        }

        $matches = array();
        preg_match_all($pattern, $html, $matches);
        if (!isset($matches[0][0])) {
            return '';
        }

        // NOTE(review): 180 is a fixed byte offset that presumably skips the
        // <div>/<pre> opening markup at the start of the match; confirm
        // against the live page before changing it.
        $text = substr($matches[0][0], 180);

        // Everything from the first "<img" onwards is trailing markup.
        $pieces = explode("<img", $text);

        return trim($pieces[0]);
    }

    /**
     * Static entry point: write a novel, or one chapter of it, to disk.
     *
     * With a numeric ID, every chapter ID returned by getBookIdsById() is
     * downloaded in order. The index of the next chapter is kept in
     * $_SESSION["<id>_fid"], so a later request in the same session continues
     * where the previous one stopped instead of starting over.
     *
     * With a non-numeric ID ("<book>_<chapter>"), the chapter is written to
     * "./<book name>/<book name>_<chapter name>.txt". Directory and file
     * names are converted from UTF-8 to GBK.
     *
     * @param string|int $id Book ID or "<book>_<chapter>" ID.
     * @return void
     */
    public static function createBookById($id){
        if (is_numeric($id)) {
            $chapterIds = self::getBookIdsById($id);
            $sessionKey = $id . "_fid";
            $start = isset($_SESSION[$sessionKey]) ? max(0, (int) $_SESSION[$sessionKey]) : 0;
            $total = count($chapterIds);

            for ($i = $start; $i < $total; $i++) {
                self::createBookById($id . "_" . $chapterIds[$i]);
                // Record progress only after the chapter has been processed.
                $_SESSION[$sessionKey] = $i + 1;
            }
            return;
        }

        echo " INIT BEGIN START WRITE!";

        $spider = new self();
        $cons = trim($spider->getBookContextById($id));
        $title = $spider->getBookNameById($id);

        // The first space-separated token of the title is "<book>_<chapter>".
        $t = explode(" ", $title);
        $dir = explode("_", $t[0]);
        $wzdir = self::safePathPart($dir[0]);               // book name, used as directory name
        $wzchapter = isset($dir[1]) ? $dir[1] : '';         // chapter name, only echoed
        $fileBase = self::safePathPart($t[0]);

        if ($wzdir === '' || $wzdir === '.' || $wzdir === '..') {
            echo "Write failed: could not determine the book name";
            return;
        }

        // The GBK copy is used only for the filesystem calls; $wzdir stays
        // UTF-8 so that the file name below is not converted twice.
        $wzdir2 = iconv("UTF-8", "GBK", $wzdir);
        $wztitle = iconv("UTF-8", "GBK", "./" . $wzdir . "/" . $fileBase . ".txt");
        if ($wzdir2 === false || $wztitle === false) {
            echo "Write failed: the title cannot be converted to GBK";
            return;
        }

        if (!file_exists($wzdir2) && !mkdir($wzdir2)) {
            echo "Write failed: could not create the directory";
            return;
        }

        $f = fopen($wztitle, "w+");
        if ($f === false) {
            echo "Write failed: could not open the file";
            return;
        }
        fwrite($f, $cons);
        fclose($f);

        echo "$wzdir ".$wzchapter."Write successfully";
    }

    /**
     * Get all chapter IDs of a novel from its chapter list page.
     *
     * NOTE(review): the published pattern had lost its backslashes and used
     * unescaped "/" inside "/" delimiters, so it could not compile. It has
     * been reconstructed here (\d, \s, [\s\S], "~" delimiters); the fragment
     * " ;.*?" that followed "</a>" was dropped as unrecoverable. Verify
     * against the live markup.
     *
     * @param string|int $aid Book ID.
     * @return array List of chapter ID strings; empty when the download fails
     *               or nothing matches.
     */
    public static function getBookIdsById($aid){
        $url = 'http://www.motie.com/book/' . $aid . "/chapter";

        // Lazy quantifiers (?) keep each match as short as possible.
        $pattern = '~<li class="" createdate="\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}">[\s\S]*?<a href="/book/'
            . preg_quote($aid, '~')
            . '_(\d*?)"\s{1}>.*?</a>~u';

        list($html, $error) = self::fetchPage($url);
        if ($html === false) {
            echo 'Curl error: ' . $error;
            return array();
        }

        $matches = array();
        preg_match_all($pattern, $html, $matches, PREG_PATTERN_ORDER);

        return isset($matches[1]) ? $matches[1] : array();
    }
}
?> </div> getinfo.php <div class="codetitle"> Copy code The code is as follows:</div> <div class="code" id="code20928"> <?php session_start(); require_once("SpiderTools.class.php"); if($_REQUEST["bid"]){ if(is_numeric($_REQUEST["bid"])){ SpiderTools:: createBookById(trim($_REQUEST["bid"])); } else{ echo " Please enter the correct article ID "; } } ?> </div> index.html <div class="codetitle"> Copy code The code is as follows:</div> <div class="code" id="code98901"> <html> <head><meta charset="utf-8"/></head> <title>Download the novel