程式碼來源:jUnion
適用平台:Windows, Linux(Ubuntu),php-5.2.5 ,Apache
功能:抓取整個網站的圖片。目前尚未使用 PHP 的 curl 擴充功能開發,後續會再完善
設定:config目錄下 domain_name:網域名稱(預設:bizhibar.com) request_site:網站網址(預設:http://www.bizhibar.com/) request_url:從網站的哪個頁面開始(預設:http://www.bizhibar.com/) accept_type:圖片類型(預設:gif, bmp, png, ico, jpg, jpeg) save_path:圖片儲存路徑(預設:savefiles/) partition_name:圖片保存目錄名稱前綴(預設:img_) dir_file_limit:每個目錄容許多少個檔案(預設:100) serialize_img_size:當讀取了多少個圖片位址才會快取到cache目錄下的accompImg檔案當中,下次繼續抓取的時候會忽略這些位址。(預設:30) serialize_url_size:與serialize_img_size類似,當讀取了多少個連結位址才會快取到cache目錄下的overURL檔案當中,下次繼續抓取的時候會忽略這些位址。(預設:10)
說明:歡迎諸君批評指教,有任何新問題或需要改進的地方,請您回饋給我
-
<?php
/*
 * Entry script: builds the path configuration and starts the crawl.
 *
 * __Os__ is the directory separator and __Home__ is this script's directory
 * with a trailing separator. Both constants are also read by
 * Capture::import() when it resolves class files under include/.
 */
if( !defined( '__Os__' ) ) {
    define( '__Os__', DIRECTORY_SEPARATOR );
}
if( !defined( '__Home__' ) ) {
    define( '__Home__', dirname( __FILE__ ).__Os__ );
}

// The crawl walks every reachable page of the site, so lift the execution time limit.
set_time_limit( 0 );

require __Home__.'include'.__Os__.'Capture.class.php';

// Paths consumed by Capture::__construct():
//   site / preg         - PHP files whose return value is the settings array
//   accompImg / overURL - cache files holding serialized progress data
$_cfg = array(
    'site' => __Home__.'config'.__Os__.'capture.site.php',
    'preg' => __Home__.'config'.__Os__.'capture.preg.php',
    'accompImg' => __Home__.'cache'.__Os__.'accompImg',
    'overURL' => __Home__.'cache'.__Os__.'overURL'
);

$_parse = new Capture( $_cfg );
$_parse->parseQuestUrl();
?>
-
-
複製程式碼
-
<?php
/**
 * Main class: drives the crawl.
 *
 * Starting from the configured request_url, it fetches a page through
 * Http::get(), hands the result to a Pivotal instance, and recursively follows
 * every returned URL that contains the configured domain_name and has not been
 * seen before. Seen URLs are periodically serialized to the overURL cache file
 * so a later run can skip them.
 *
 * Relies on the constants __Home__ and __Os__ being defined by the caller.
 *
 * @author pankai
 * @date 2013-08-10
 */
class Capture {
    /** Path configuration passed to the constructor (keys: site, preg, accompImg, overURL). */
    private static $_Config = array();

    /** Value returned by the 'site' configuration file. */
    private static $_CapSite = NULL;
    /** Value returned by the 'preg' configuration file, with '_request_site' placeholders expanded. */
    private static $_CapPreg = NULL;

    /** URLs that have already been accepted for crawling. */
    private static $_overURL = array();

    /** TRUE once the latest getCapInstance() call has run to completion. */
    private $_mark = FALSE;

    /** Multiplier applied to preform_page_time when sleeping between page fetches. */
    private static $_markTime = 1;

    /**
     * Initialise the main class: load both configuration files and restore
     * cached progress from a previous run.
     *
     * @param $_cfg array paths keyed by 'site', 'preg', 'accompImg' and 'overURL'
     */
    public function __construct( &$_cfg ) {
        self::$_Config = &$_cfg;

        self::$_CapSite = require $_cfg['site'];
        self::$_CapPreg = require $_cfg['preg'];

        // Expand the '_request_site' placeholder in every configured pattern.
        foreach( self::$_CapPreg as $_key => $_value ) {
            self::$_CapPreg[$_key] = str_replace( '_request_site', self::$_CapSite['request_site'], $_value );
        }

        self::import( 'file.OperateFile' );
        $_cached = self::readCache( $_cfg['overURL'] );
        // Only accept an array: a corrupt cache must not replace the visited list.
        if( is_array( $_cached ) ) {
            self::$_overURL = $_cached;
        }

        self::import( 'pivotal.Pivotal' );
        $_cached = self::readCache( $_cfg['accompImg'] );
        if( $_cached !== FALSE ) {
            Pivotal::$_accompImg = $_cached;
        }
    }

    /**
     * Read and unserialize a cache file.
     *
     * @param $_file string path of the cache file
     * @return mixed the unserialized value, or FALSE when the file is missing,
     *               empty or cannot be unserialized
     */
    private static function readCache( $_file ) {
        if( !file_exists( $_file ) || filesize( $_file ) <= 0 ) {
            return FALSE;
        }
        $_contents = OperateFile::readText( $_file, filesize( $_file ) );
        return unserialize( $_contents );
    }

    /**
     * Load a class file using a Java-style dotted package name, e.g.
     * 'file.OperateFile' resolves to include/file/OperateFile.class.php.
     *
     * @param $_class string dotted class path relative to the include directory
     */
    public static function import( $_class ) {
        require_once __Home__.'include'.__Os__.str_replace( '.', __Os__, $_class ).'.class.php';
    }

    /**
     * Create a Pivotal instance for one fetched page and return what its
     * parseUrl() method yields. $this->_mark is FALSE while the page is being
     * processed and TRUE once processing has finished.
     *
     * @param $_source mixed the value returned by Http::get() for the page
     * @return mixed the result of Pivotal::parseUrl(); roundTagA() treats it as
     *               a possibly nested array of URL strings
     */
    private function getCapInstance( &$_source ) {
        $this->_mark = FALSE;

        // NOTE(review): the original statement is truncated in the source; only the
        // trailing arguments "self::$_Config, $_source" survive. The two leading
        // arguments are a reconstruction - confirm against Pivotal::__construct().
        $_Captal = new Pivotal( self::$_CapSite, self::$_CapPreg, self::$_Config, $_source );
        $_tagA = $_Captal->parseUrl();

        $this->_mark = TRUE;

        return $_tagA;
    }

    /**
     * Recursively walk a (possibly nested) array of URLs, crawling every URL
     * that contains the configured domain name and has not been seen yet.
     *
     * @param $_tagArr array URLs, or nested arrays of URLs, to follow
     */
    private function roundTagA( &$_tagArr ) {
        // Loose comparison on purpose: NULL and an empty array both end the recursion.
        if( $_tagArr == NULL ) {
            return;
        }
        $_tagArrLength = count( $_tagArr );
        for( $i = 0; $i < $_tagArrLength; $i++ ) {
            if( is_array( $_tagArr[ $i ] ) ) {
                $this->roundTagA( $_tagArr[ $i ] );
            }
            else {
                // Skip links that do not mention the configured domain.
                if( stripos( $_tagArr[$i], self::$_CapSite['domain_name'] ) === FALSE ) {
                    continue;
                }
                // Skip links that were already accepted, in this run or a cached one.
                if( in_array( $_tagArr[$i], self::$_overURL, TRUE ) ) {
                    continue;
                }
                self::$_overURL[] = $_tagArr[$i];
                // Persist the visited list every serialize_url_size entries.
                if( count( self::$_overURL ) % self::$_CapSite['serialize_url_size'] == 0 ) {
                    OperateFile::setText( self::$_Config['overURL'], serialize( self::$_overURL ) );
                }
                do {
                    // getCapInstance() takes its argument by reference, so it needs a variable.
                    $_source = Http::get( $_tagArr[$i] );
                    $_tagA = $this->getCapInstance( $_source );
                    sleep( self::$_CapSite['preform_page_time'] * self::$_markTime );
                    if( $this->_mark === TRUE ) {
                        // NOTE(review): the multiplier starts at 1 but is reset to
                        // preform_page_time here; kept as in the original - confirm intent.
                        self::$_markTime = self::$_CapSite['preform_page_time'];
                        break;
                    }
                    // Page was not fully processed: double the delay and retry.
                    self::$_markTime *= 2;
                } while( TRUE );
                // Follow the links found on the page that was just processed.
                $this->roundTagA( $_tagA );
            }
        }
    }

    /**
     * Entry point: fetch the configured start page and crawl outwards from it.
     */
    public function parseQuestUrl() {
        self::import( 'http.Http' );
        // getCapInstance() takes its argument by reference, so it needs a variable.
        $_source = Http::get( self::$_CapSite['request_url'] );
        $_round_Arr = $this->getCapInstance( $_source );
        $this->roundTagA( $_round_Arr );
    }
}
?>
-
-
- 複製程式碼
-
|
本網站聲明
本文內容由網友自願投稿,版權歸原作者所有。本站不承擔相應的法律責任。如發現涉嫌抄襲或侵權的內容,請聯絡admin@php.cn
作者最新文章
-
2024-10-22 09:46:29
-
2024-10-13 13:53:41
-
2024-10-12 12:15:51
-
2024-10-11 22:47:31
-
2024-10-11 19:36:51
-
2024-10-11 15:50:41
-
2024-10-11 15:07:41
-
2024-10-11 14:21:21
-
2024-10-11 12:59:11
-
2024-10-11 12:17:31