This article builds a downloader ("grabber") for the non-VIP chapters of novels on the Motie Chinese-language website, using cURL and regular expressions. You enter a novel's ID and the script downloads that novel. Dependency: the PHP cURL extension. The code uses cURL, regular expressions, Ajax and related techniques, and is short enough for beginners to read through. When testing locally, make sure the machine is connected to the Internet and that the cURL extension is enabled in PHP.
<?php
// The listing closes a PHP block at its end but had lost the opening tag,
// so none of the code below would have been executed.

// createBookById() keeps its download position in $_SESSION, so a session has
// to be active before the class is used. The guard avoids starting it a second
// time when the including script has already called session_start().
if (session_status() === PHP_SESSION_NONE) {
    session_start();
}

// The crawler is wrapped in a class (below) so the download can be started
// with a single static call.
// Optional auto-refresh of the page, left disabled:
#header("Refresh:30;http://www.test.com:8080");
- class SpiderTools{
- ///////////////////////////////////////////////// ////////////////////////////////////////////////////// //////////
- /*Input the article ID and parse out the article title*/
- ///////////////////////////// ////////////////////////////////////////////////////// //////////////////////////////
/**
 * Fetch a Motie book or chapter page and return the text of its <title> element.
 *
 * @param mixed $aid Numeric book ID, or a non-numeric ID such as the
 *                   "bookId_chapterId" strings built by createBookById().
 * @return string The captured title text, or '' when the page contains no
 *                matching title. Terminates the script with the cURL error
 *                message if the request itself fails.
 */
public function getBookNameById($aid){
    $url = 'http://www.motie.com/book/' . $aid;

    // NOTE(review): both patterns are reconstructions. The published listing had
    // lost its HTML tags and backslashes ("/ s*(.*)s*s*/" and "/(.*) /"); the
    // <title> element is inferred from the method's purpose and from the sample
    // "book_chapter_site" title quoted next to the second pattern. Confirm
    // against the original source before relying on them.
    if (is_numeric($aid)) {
        // Book page: tolerate whitespace around the title text.
        $pattern = '/<title>\s*(.*)\s*<\/title>\s*/';
    } else {
        // Chapter page: title of the form "book_chapter_site".
        $pattern = '/<title>(.*)<\/title>/';
    }

    $ch = curl_init();
    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1); // return the body instead of printing it
    curl_setopt($ch, CURLOPT_HEADER, 0);         // do not include response headers
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT_MS, 0);

    // Execute the request exactly once and test that same result. The previous
    // code called curl_exec() a second time just to check for failure, which
    // downloaded every page twice and checked a different response than the
    // one it parsed.
    $output = curl_exec($ch);
    if ($output === false) {
        $error = curl_error($ch);
        curl_close($ch);
        die($error);
    }
    curl_close($ch);

    // Only the first match was ever used, so preg_match() is sufficient; guard
    // against a page without a title instead of reading an undefined offset.
    $matches = array();
    if (!preg_match($pattern, $output, $matches)) {
        return '';
    }
    return $matches[1];
}
- //////////////////////////////////////////////// ////////////////////////////////////////////////////// //////////
- /*Input the article ID to parse the article content*/
- ///////////////////////////// ////////////////////////////////////////////////////// ///////////////////////////////
- public function getBookContextById($aid){ // $aid is split on "_" below, so a "bookId_chapterId" string is expected
- // Downloads http://www.motie.com/book/<$aid>, matches $ru against the response and works on the first full match with its first 180 bytes removed.
- $ids=array( );
- $ids=explode("_",$aid);
- $titleId=trim($ids[0]); // not read again in the code visible here
- $aticleId=trim($ids[1]); // not read again in the code visible here
- $ch= curl_init();
- $ru="/
[sS]* [sS]*(.*) pre>/ui";
- $url='http://www.motie.com/book/'.$aid;
- // NOTE(review): $ru above looks damaged: "[sS]" where "[\s\S]" would be expected, a stray line break, and a dangling " pre>". Presumably HTML tags and backslashes were lost when the listing was published - restore the pattern from the original source before use.
-
- // Configure the request: target URL, return the body as a string, leave response headers out.
- curl_setopt($ch, CURLOPT_URL, $url);
- curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);// return the response instead of printing it
- curl_setopt($ch, CURLOPT_HEADER, 0);// do not include header information in the response
- curl_setopt($ch, CURLOPT_CONNECTTIMEOUT_MS, 0 );
- // First request: its response is what gets parsed below.
- $output = curl_exec($ch);
- // NOTE(review): the check below calls curl_exec() again, i.e. it performs a second request and tests that result rather than $output.
- if(curl_exec($ch) === false){
- die(curl_error($ch));
- }
- / / Check if an error occurred
- if(curl_errno($ch)){
- echo 'Curl error: ' . curl_error($ch);
- }
- $arr=array();
- $arr2=array();
- preg_match_all ($ru,$output,$arr);
- curl_close($ch);
- #var_dump($arr);
- $s=$arr[0][0]; // first full-pattern match; no check that a match exists
- $s=substr($s,180) ; // drop the first 180 bytes of the match. NOTE(review): the next line is truncated (the explode() delimiter and the end of the call are missing) - restore it from the original source.
- $arr2=explode(" return trim($arr2[0]);
- }
- ////////////////// ////////////////////////////////////////////////////// /////////////////////////////////////////
- /*Static method@generated novel file can be called directly */
- ////////////////////////////////////////////////// ////////////////////////////////////////////////////// /////////
/**
 * Download a whole book, or a single chapter of it, and save chapters as text files.
 *
 * A numeric $id is treated as a book ID: the chapter IDs are looked up with
 * getBookIdsById() and this method is called again for every
 * "bookId_chapterId" combination. Any other $id is treated as one chapter:
 * its text and page title are fetched and the text is written to
 * "./<book>/<title>.txt", with the path converted from UTF-8 to GBK.
 *
 * Progress through a book is kept in $_SESSION["<id>_fid"] (the index of the
 * next chapter to fetch), so an interrupted run resumes where it stopped.
 *
 * @param mixed $id Numeric book ID or "bookId_chapterId" string.
 * @return void
 */
public static function createBookById($id){
    if (is_numeric($id)) {
        $chapterIds = self::getBookIdsById($id);
        $total = count($chapterIds);
        $progressKey = $id . "_fid";

        // The counter does not exist on the first run; without this the loop
        // started from null and never fetched index 0.
        if (!isset($_SESSION[$progressKey])) {
            $_SESSION[$progressKey] = 0;
        }

        // "<" rather than "<=": the last valid index is $total - 1.
        for ($i = (int) $_SESSION[$progressKey]; $i < $total; $i++) {
            self::createBookById($id . "_" . $chapterIds[$i]);
            // Record the next chapter only after this one has been handled.
            $_SESSION[$progressKey] = $i + 1;
        }
        return;
    }

    echo "\nINIT BEGIN START WRITE!";
    $spider = new self();
    $content = trim((string) $spider->getBookContextById($id));
    $title = (string) $spider->getBookNameById($id);

    // Only the part of the title before the first space is used; it is
    // expected to look like "book_chapter...".
    $titleParts = explode(" ", $title);
    $nameParts = explode("_", $titleParts[0]);
    $bookName = $nameParts[0];                                   // used as directory name
    $chapterName = isset($nameParts[1]) ? $nameParts[1] : '';    // chapter label for the status line

    // Without a title there is no sensible directory or file name; previously
    // this ended up writing a stray "./.txt"-style file.
    if ($bookName === '') {
        echo "$bookName " . $chapterName . "Write failed";
        return;
    }

    // Convert the directory name and the full path separately from the UTF-8
    // originals so that nothing is converted twice.
    $dirPath = iconv("UTF-8", "GBK", $bookName);
    $filePath = iconv("UTF-8", "GBK", "./" . $bookName . "/" . $titleParts[0] . ".txt");
    if ($dirPath === false || $filePath === false) {
        echo "$bookName " . $chapterName . "Write failed";
        return;
    }

    if (!is_dir($dirPath) && !mkdir($dirPath)) {
        echo "$bookName " . $chapterName . "Write failed";
        return;
    }

    // Creates or truncates the file, like the previous fopen(..., "w+").
    if (file_put_contents($filePath, $content) === false) {
        echo "$bookName " . $chapterName . "Write failed";
        return;
    }

    echo "$bookName " . $chapterName . "Write successfully";
}
- /*
- Get all the IDs of the novel
- @param $id article ID
- @return array;
- */
- public static function getBookIdsById($aid){ // downloads the book's "/chapter" page and returns the first-capture-group matches of $ru as an array
- $ch= curl_init();
- $url='http://www.motie.com/book/'.$aid."/chapter";
- // The "?" after each quantifier makes it lazy, so the shortest possible match is taken.
- $ru='/[sS]*?
- [sS]*?.*?. *?/u';// NOTE(review): this pattern looks damaged - "[sS]" where "[\s\S]" would be expected, it spans two lines, and it has no capture group although $arr[1] is returned below. Presumably HTML tags and backslashes were lost; restore it from the original source.
- // Configure the request: target URL, return the body as a string, leave response headers out.
- curl_setopt($ch, CURLOPT_URL, $url);
- curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);// return the response instead of printing it
- curl_setopt($ch, CURLOPT_HEADER, 0); // do not include header information in the response
- curl_setopt($ch, CURLOPT_CONNECTTIMEOUT_MS, 0);
- // Perform the request.
- $output = curl_exec($ch);
- // A cURL error is only printed; execution continues and $output is still passed to preg_match_all().
- if(curl_errno($ch)){
- echo 'Curl error: ' . curl_error($ch);
- }
- // Release the cURL handle.
- curl_close($ch);
- $arr=array();
- preg_match_all ($ru,$output,$arr,PREG_PATTERN_ORDER);
- return $arr[1]; // with PREG_PATTERN_ORDER, index 1 holds every match of the first capture group
- }
- }
-
-
-
-
-
-
-
-
- ?>
Copy code
-
<?php
// Entry script: downloads the book whose ID is passed as the "bid" request
// parameter by handing it to SpiderTools::createBookById().

// Start the session only if none is active yet, so a second session_start()
// elsewhere does not raise a notice.
if (session_status() === PHP_SESSION_NONE) {
    session_start();
}
require_once("SpiderTools.class.php");

// Read the parameter without triggering an "undefined index" notice when it
// is absent. As before, an empty value (or "0") does nothing at all.
$rawBid = isset($_REQUEST["bid"]) ? $_REQUEST["bid"] : null;
if ($rawBid) {
    // Trim before validating, and accept digits only: the value is placed in
    // a URL path, and is_numeric() would also let "1e3", "-5" or "1.5" through.
    $bid = is_string($rawBid) ? trim($rawBid) : "";
    if (ctype_digit($bid)) {
        SpiderTools::createBookById($bid);
    } else {
        echo "\nPlease enter the correct article ID ";
    }
}
?>
-
Copy code
|