PHP code for remotely grabbing website images and saving them-PHP Tutorial-php.cn

PHP code for remotely grabbing website images and saving them

WBOY

Released: 2016-07-25 09:12:59

Original

1093 people have browsed it

Example: a PHP class that crawls a website and saves the images it finds.

/**
 * Crawler that downloads the images referenced by a web page and, when used
 * recursively, by every page reachable from it through <a href> links.
 *
 * Progress is reported by echoing one message per page and per image.
 *
 * @package default
 * @author WuJunwei
 */
class download_image
{
    /**
     * Prefix of the path images are written to. It is concatenated directly
     * with the file name, so a directory must end with a slash.
     *
     * @var string
     */
    public $save_path;

    /**
     * Size limit in bytes: only images strictly larger than this are saved.
     *
     * @var int
     */
    public $img_size = 0;

    /**
     * Page URLs that have already been crawled. Static, so the list is shared
     * by all instances and a page is never crawled twice in one run.
     *
     * @var array
     */
    public static $a_url_arr = array();

    /**
     * @param string $save_path Path prefix (normally a directory ending in "/") for saved images
     * @param int    $img_size  Minimum size in bytes an image must exceed to be saved
     */
    public function __construct($save_path, $img_size)
    {
        $this->save_path = $save_path;
        $this->img_size = $img_size;
    }

    /**
     * Download the images of a page, then do the same for every page it links to.
     *
     * Links are followed wherever they point (including other hosts) and to any
     * depth; the only stop condition is that a URL was already visited.
     *
     * @param string $capture_url URL of the page to crawl
     * @return void
     */
    public function recursive_download_images($capture_url)
    {
        // Already crawled: stop here, otherwise pages linking to each other loop forever.
        if (in_array($capture_url, self::$a_url_arr, true)) {
            return;
        }
        self::$a_url_arr[] = $capture_url;

        // Fetch the page once and reuse the markup for both images and links.
        $content = $this->fetch_page($capture_url);
        $this->save_images_in_content($capture_url, $content);

        // Capture the part of each href that precedes any "?" query string.
        $a_pattern = "|<a[^>]+href=['\" ]?([^ '\"?]+)['\" >?]|Ui";
        if (!preg_match_all($a_pattern, $content, $a_out, PREG_SET_ORDER)) {
            return;
        }

        // Resolve every link first, dropping unusable ones, pages already
        // crawled and links that occur several times on this page.
        $next_urls = array();
        foreach ($a_out as $match) {
            $a_url = $this->resolve_url($capture_url, $match[1]);
            if ($a_url === null
                || in_array($a_url, self::$a_url_arr, true)
                || in_array($a_url, $next_urls, true)
            ) {
                continue;
            }
            $next_urls[] = $a_url;
        }

        foreach ($next_urls as $a_url) {
            $this->recursive_download_images($a_url);
        }
    }

    /**
     * Download all images referenced by <img> tags of a single page.
     *
     * @param string $capture_url URL of the page whose images are downloaded
     * @return array The src values of the <img> tags found, as written in the page
     */
    public function download_current_page_images($capture_url)
    {
        return $this->save_images_in_content($capture_url, $this->fetch_page($capture_url));
    }

    /**
     * Save a single image unless a file with the same name already exists.
     *
     * @param string $capture_url URL of the page the image was found on (base for relative URLs)
     * @param string $img_url     URL of the image, absolute or relative to $capture_url
     * @return void
     */
    public function save_one_img($capture_url, $img_url)
    {
        $resolved = $this->resolve_url($capture_url, $img_url);
        if ($resolved === null) {
            // Not an http(s) resource (e.g. a data: URI) or the base URL is unusable.
            echo $img_url . 'Image reading failed!' . "\n";
            return;
        }
        $img_url = $resolved;

        // The file is stored under the last path component of its URL.
        $pic_name = pathinfo($img_url, PATHINFO_BASENAME);
        $target = $this->save_path . $pic_name;

        if (file_exists($target)) {
            echo $img_url . 'The image has been captured !' . "\n";
            return;
        }

        // @ keeps an unreachable image from emitting a warning; failure is reported below.
        $img_data = @file_get_contents($img_url);
        if ($img_data === false || strlen($img_data) <= $this->img_size) {
            echo $img_url . 'Image reading failed!' . "\n";
            return;
        }

        if (file_put_contents($target, $img_data) !== false) {
            echo $img_url . 'Image saved successfully!' . "\n";
        } else {
            echo $img_url . 'Failed to save image!' . "\n";
        }
    }

    /**
     * Read a page, returning an empty string when it cannot be fetched.
     *
     * @param string $url
     * @return string
     */
    private function fetch_page($url)
    {
        // @ suppresses the warning for unreachable pages; crawling is best-effort.
        $content = @file_get_contents($url);
        return $content === false ? '' : $content;
    }

    /**
     * Find the <img> tags in a page's markup and save each referenced image.
     *
     * @param string $capture_url URL the markup was loaded from
     * @param string $content     Markup of the page
     * @return array The src values found
     */
    private function save_images_in_content($capture_url, $content)
    {
        // Capture the part of each src that precedes any "?" query string.
        $img_pattern = "|<img[^>]+src=['\" ]?([^ '\"?]+)['\" >?]|Ui";
        if (!preg_match_all($img_pattern, $content, $img_out, PREG_SET_ORDER)) {
            $img_out = array();
        }

        $photo_num = count($img_out);
        echo $capture_url . "Total found" . $photo_num . " pictures";

        $img_urls = array();
        foreach ($img_out as $match) {
            $img_urls[] = $match[1];
            $this->save_one_img($capture_url, $match[1]);
        }
        return $img_urls;
    }

    /**
     * Turn a link found on a page into an absolute http(s) URL.
     *
     * @param string $base_url URL of the page the link was found on
     * @param string $link     The link as written in the page
     * @return string|null Absolute URL, or null when the link is empty, only a
     *                     fragment, uses another scheme (mailto:, javascript:,
     *                     data:, ...) or the base URL has no scheme and host
     */
    private function resolve_url($base_url, $link)
    {
        $link = trim($link);

        // A fragment only addresses a position inside a page, never another resource.
        $hash_pos = strpos($link, '#');
        if ($hash_pos !== false) {
            $link = substr($link, 0, $hash_pos);
        }
        if ($link === '') {
            return null;
        }

        // Already absolute; the scheme must be at the very start of the link.
        if (preg_match('#^https?://#i', $link)) {
            return $link;
        }

        $base = parse_url($base_url);
        if ($base === false || !isset($base['scheme'], $base['host'])) {
            return null;
        }

        // Protocol-relative link: inherits the scheme of the page.
        if (strpos($link, '//') === 0) {
            return $base['scheme'] . ':' . $link;
        }

        // Any other scheme cannot be fetched as a page or an image.
        if (preg_match('#^[a-z][a-z0-9+.\-]*:#i', $link)) {
            return null;
        }

        $origin = $base['scheme'] . '://' . $base['host']
            . (isset($base['port']) ? ':' . $base['port'] : '');

        // Root-relative link.
        if ($link[0] === '/') {
            return $origin . $link;
        }

        // Relative to the directory of the current page.
        $base_path = isset($base['path']) ? $base['path'] : '/';
        $directory = substr($base_path, 0, strrpos($base_path, '/') + 1);

        return $origin . $this->normalize_path($directory . $link);
    }

    /**
     * Remove "." and ".." segments from an absolute URL path.
     *
     * @param string $path Path starting with "/"
     * @return string
     */
    private function normalize_path($path)
    {
        $segments = explode('/', $path);
        $last = end($segments);

        $kept = array();
        foreach ($segments as $segment) {
            if ($segment === '.') {
                continue;
            }
            if ($segment === '..') {
                // Never pop the leading empty segment that stands for the root.
                if (count($kept) > 1) {
                    array_pop($kept);
                }
                continue;
            }
            $kept[] = $segment;
        }

        // A path ending in a dot segment refers to a directory: keep the trailing slash.
        if ($last === '.' || $last === '..') {
            $kept[] = '';
        }

        return implode('/', $kept);
    }
} // END
// Cap the script's execution time at 120 seconds; adjust to the size of the crawl.
set_time_limit(120);

// Where images are stored and the size limit (in bytes) passed to the downloader.
$save_dir = 'E:/images/';
$min_bytes = 0;

// Page the recursive crawl starts from.
$start_url = 'http://bbs.it-home.org/';

// Create the downloader and crawl the start page together with the pages it links to.
$download_img = new download_image($save_dir, $min_bytes);
$download_img->recursive_download_images($start_url);

// Alternative: only grab the pictures of one page.
//$download_img->download_current_page_images($_POST['capture_url']);
?>

Copy code