Example: PHP code that crawls a website and downloads the images it finds.
<?php
/**
 * Crawls web pages and saves the images they reference into a local directory.
 *
 * Starting from one page, the crawler saves every <img> it finds and then
 * follows the page's <a href> links recursively. Page URLs that were already
 * crawled are remembered in a static list so no page is processed twice.
 *
 * Relative URLs are resolved against the site root (scheme://host/), not
 * against the directory of the page they appear on.
 *
 * NOTE(review): the crawl has no depth limit and is not restricted to the
 * starting host; it only ends when no unseen link is left or the script is
 * stopped (e.g. by the execution time limit).
 *
 * @package default
 * @author WuJunwei
 */
class download_image
{
    /**
     * Location where images are saved. The image file name is appended to
     * this string as-is, so it must end with a directory separator.
     *
     * @var string
     */
    public $save_path;

    /**
     * Size threshold in bytes: only images strictly larger than this are saved.
     *
     * @var int
     */
    public $img_size = 0;

    /**
     * Page URLs that have already been crawled. Static, so the list is shared
     * by every instance for the lifetime of the script.
     *
     * @var array
     */
    public static $a_url_arr = array();

    /**
     * @param string $save_path Location where images are saved (with trailing separator)
     * @param int    $img_size  Size threshold in bytes; smaller or equal images are skipped
     */
    public function __construct($save_path, $img_size)
    {
        $this->save_path = $save_path;
        $this->img_size = $img_size;
    }

    /**
     * Saves the images of a page, then does the same for every page it links to.
     *
     * A URL that is already in the crawled list is ignored, which is what
     * stops the recursion on pages that link to each other.
     *
     * @param string $capture_url URL of the page to crawl
     *
     * @return void
     */
    public function recursive_download_images($capture_url)
    {
        if (in_array($capture_url, self::$a_url_arr, true)) {
            return;
        }
        self::$a_url_arr[] = $capture_url;

        // "@" is deliberate: an unreadable page is treated as an empty page
        // instead of aborting the whole crawl.
        $content = @file_get_contents($capture_url);
        if ($content === false) {
            $content = '';
        }

        // Hand the fetched markup over so the page is not requested twice.
        $this->download_current_page_images($capture_url, $content);

        // Captures the href value of <a> tags up to (not including) any "?".
        $a_pattern = "|<a[^>]+href=['\" ]?([^ '\"?]+)['\" >?]|Ui";
        if (preg_match_all($a_pattern, $content, $a_out, PREG_SET_ORDER) === false) {
            $a_out = array();
        }

        // Collect the links of this page first; only then recurse, so the
        // de-duplication below sees the complete list.
        $pending = array();
        foreach ($a_out as $match) {
            $href = $match[1];

            // '#' and '/' point back at the current page or the home page.
            if ($href === '#' || $href === '/') {
                continue;
            }

            $a_url = $this->resolve_url($capture_url, $href);
            if ($a_url === null) {
                continue;
            }

            // Compare the resolved URL: the crawled list holds absolute URLs,
            // and the same link often appears several times on one page.
            if (in_array($a_url, self::$a_url_arr, true) || in_array($a_url, $pending, true)) {
                continue;
            }
            $pending[] = $a_url;
        }

        foreach ($pending as $a_url) {
            $this->recursive_download_images($a_url);
        }
    }

    /**
     * Saves every image referenced by <img> tags of a single page.
     *
     * Prints the number of images found, then one status line per image.
     *
     * @param string      $capture_url URL of the page to scan
     * @param string|null $content     Markup of the page if the caller already
     *                                 fetched it; when null the page is fetched here
     *
     * @return array The src values (without query string) of all matched <img> tags
     */
    public function download_current_page_images($capture_url, $content = null)
    {
        if ($content === null) {
            // "@" is deliberate: an unreadable page simply yields no images.
            $content = @file_get_contents($capture_url);
        }
        if ($content === false) {
            $content = '';
        }

        // Captures the src value of <img> tags up to (not including) any "?".
        $img_pattern = "|<img[^>]+src=['\" ]?([^ '\"?]+)['\" >?]|Ui";
        if (preg_match_all($img_pattern, $content, $img_out, PREG_SET_ORDER) === false) {
            $img_out = array();
        }

        $photo_num = count($img_out);
        echo $capture_url . "Total found" . $photo_num . " pictures" . "\n";

        $img_urls = array();
        foreach ($img_out as $match) {
            $img_urls[] = $match[1];
            $this->save_one_img($capture_url, $match[1]);
        }

        return $img_urls;
    }

    /**
     * Downloads one image and stores it under its own file name in $save_path.
     *
     * An image whose file name already exists in $save_path is considered
     * done and is not downloaded again. One status line is printed per call.
     *
     * @param string $capture_url URL of the page the image was found on
     * @param string $img_url     Absolute or relative URL of the image
     *
     * @return void
     */
    public function save_one_img($capture_url, $img_url)
    {
        $resolved = $this->resolve_url($capture_url, $img_url);
        if ($resolved === null) {
            echo $img_url . 'Image reading failed!' . "\n";
            return;
        }
        $img_url = $resolved;

        // Take the file name from the URL path only, so neither the host
        // name nor a query string can end up in it.
        $pic_name = basename((string) parse_url($img_url, PHP_URL_PATH));
        if ($pic_name === '' || $pic_name === '.' || $pic_name === '..') {
            echo $img_url . 'Image reading failed!' . "\n";
            return;
        }

        $target = $this->save_path . $pic_name;
        if (file_exists($target)) {
            echo $img_url . 'The image has been captured !' . "\n";
            return;
        }

        // "@" is deliberate: a broken image link must not stop the crawl.
        $img_data = @file_get_contents($img_url);

        // Unreadable images and images not larger than the size threshold
        // are both reported with the same message.
        if ($img_data === false || strlen($img_data) <= $this->img_size) {
            echo $img_url . 'Image reading failed!' . "\n";
            return;
        }

        if (file_put_contents($target, $img_data) !== false) {
            echo $img_url . 'Image saved successfully!' . "\n";
        } else {
            echo $img_url . 'Failed to save image!' . "\n";
        }
    }

    /**
     * Turns a URL found in a page into an absolute http(s) URL.
     *
     * @param string $capture_url URL of the page the reference was found on
     * @param string $url         The reference as written in the page
     *
     * @return string|null Absolute URL, or null when the reference is empty,
     *                     uses a scheme other than http(s), or the page URL
     *                     has no scheme/host to resolve against
     */
    private function resolve_url($capture_url, $url)
    {
        // A fragment only addresses a position inside a document; drop it.
        $hash_pos = strpos($url, '#');
        if ($hash_pos !== false) {
            $url = (string) substr($url, 0, $hash_pos);
        }
        if ($url === '') {
            return null;
        }

        // Already absolute. Anchored, so "http://" inside a path does not count.
        if (preg_match('#^https?://#i', $url)) {
            return $url;
        }

        $parts = parse_url($capture_url);
        if ($parts === false || empty($parts['scheme']) || empty($parts['host'])) {
            return null;
        }

        // Scheme-relative reference ("//host/path"): reuse the page's scheme.
        if (strncmp($url, '//', 2) === 0) {
            return $parts['scheme'] . ':' . $url;
        }

        // mailto:, javascript:, data: and the like cannot be fetched.
        if (preg_match('#^[a-z][a-z0-9+.\-]*:#i', $url)) {
            return null;
        }

        $origin = $parts['scheme'] . '://' . $parts['host'];
        if (isset($parts['port'])) {
            $origin .= ':' . $parts['port'];
        }

        // Exactly one slash between origin and path, whether or not the
        // reference started with one.
        return $origin . '/' . ltrim($url, '/');
    }
} // END
// Cap the script's run time at 120 seconds; adjust to the size of the site.
set_time_limit(120);

// Where images are stored, the size threshold in bytes, and the start page.
$save_dir = 'E:/images/';
$min_bytes = 0;
$start_url = 'http://bbs.it-home.org/';

// Crawl the start page and every page reachable from it.
$download_img = new download_image($save_dir, $min_bytes);
$download_img->recursive_download_images($start_url);

// Alternative: grab the images of a single page only.
// $download_img->download_current_page_images($_POST['capture_url']);
?>
Copy code
|