Heim > php教程 > PHP源码 > 爬取一个themeForest的模版

爬取一个themeForest的模版

PHP中文网
Freigeben: 2016-05-26 08:19:10
Original
1628 Leute haben es durchsucht

@version 1.0
这个脚本只能爬取静态的网站模版，代码逻辑还需要优化。
正确的逻辑应该是从首页开始爬，
获取 HTML、JS、IMAGE、CSS，然后从 CSS 中分析额外加载的 CSS，最后分析所有 CSS 中包含的图片引用。

1. [代码][PHP]代码

<?php
/**
 * Mirrors a static theme demo site (e.g. a demo linked from
 * http://themeforest.net) into ./theme/<name>/ under the current
 * working directory.
 *
 * The crawl starts at index.html, follows the *.html links found on that
 * page (one level deep), and for every page downloads the scripts, images
 * and <link> targets it references. Progress is reported with echo.
 *
 * Known limitation: the patterns below only match tags whose href/src is
 * the FIRST attribute (e.g. <link href="..." rel="stylesheet">).
 */
class NetworkReptiles
{
    // Regexes that extract resource URLs (pages, JS, images, CSS) from HTML.
    protected $href_patten = "/<a href=[\'\"]?([^\'\" ]+).*?>/";
    protected $script_pattern = "/<script src=[\'\"]?([^\'\" ]+).*?>/";
    protected $image_pattern = "/<img src=[\'\"]?([^\'\" ]+).*?>/";

    protected $link_pattern = "/<link href=[\'\"]?([^\'\" ]+).*?>/";

    // Theme name; used as the local directory name.
    private $theme_name = null;
    // Remote base URL of the theme.
    private $theme_base_url = null;

    // Local directory the theme is stored in (always ends with "/").
    private $themeDir = null;
    // HTML of the page currently being parsed.
    private $current_data = null;
    // Relative paths of the HTML pages discovered on the index page.
    private $_html_resource = array();

    /**
     * @param string|false $name Theme name (local directory name).
     * @param string|false $url  Remote base URL of the theme.
     */
    public function __construct($name=false, $url=false){
        $this->theme_name = ($name) ? $name : false;
        $this->theme_base_url = ($url) ? $url : false;
    }

    /**
     * Sets the theme to crawl.
     *
     * @param array $opt Array with the keys 'name' and 'url'.
     */
    public function setTheme($opt)
    {
        $this->theme_name = isset($opt['name']) ? $opt['name'] : false;
        $this->theme_base_url = isset($opt['url']) ? $opt['url'] : false;
    }

    /**
     * Runs the crawl: index page first, then every linked HTML page.
     *
     * Images referenced from CSS are not fetched here; getStyleImages()
     * exists for that but is not part of the default crawl.
     */
    public function getTheme()
    {
        set_time_limit(0);

        if (!$this->theme_name || !$this->theme_base_url) {
            echo "theme name and url are required\n";
            return;
        }

        // Relative paths are appended directly, so the base must end in "/".
        $this->theme_base_url = rtrim($this->theme_base_url, '/').'/';
        $this->themeDir = getcwd().'/theme/'.$this->theme_name."/";

        // Recursive: the parent "theme/" directory may not exist yet.
        if (!is_dir($this->themeDir)
            && !mkdir($this->themeDir, 0755, true)
            && !is_dir($this->themeDir)
        ) {
            echo "fail create directory ".$this->themeDir."\n";
            return;
        }

        // Start from a clean list so repeated calls do not accumulate pages.
        $this->_html_resource = array();

        // Fetch the index page and collect/download the pages it links to.
        $this->current_data = $this->getHtmlData("index.html");
        if ($this->current_data === false) {
            return;
        }
        $this->getHtml();

        // Download the resources of the index page.
        $this->showMsg("index.html");
        $this->downloadResource();

        foreach ($this->_html_resource as $html) {
            if ($html === 'index.html') {
                // Already processed above.
                continue;
            }
            $this->showMsg($html);
            $this->current_data = $this->getHtmlData($html);
            if ($this->current_data === false) {
                continue;
            }
            // Resources are relative to the directory of the page itself.
            $this->downloadResource(dirname($html));
        }
    }

    /**
     * Prints a progress header for the page being processed.
     *
     * @param string $html Relative path of the page.
     */
    private function showMsg($html)
    {
        echo "download resource $html\n";
        echo str_repeat("-", 30)."\n";
    }

    /**
     * Returns the content of a theme file, downloading it first when it is
     * not in the local mirror yet.
     *
     * @param string $file_name Path relative to the theme root.
     * @return string|false File content, or false when it cannot be obtained.
     */
    private function getHtmlData($file_name)
    {
        $file_path = $this->themeDir.$file_name;
        // A failed download must not leave an empty file behind, otherwise
        // later runs would treat it as cached; downloadFile() guarantees that.
        if (!file_exists($file_path) && !$this->downloadFile($file_name)) {
            return false;
        }
        return file_get_contents($file_path);
    }

    /**
     * Collects the local *.html links of the current page into
     * $_html_resource (without duplicates) and downloads missing pages.
     */
    private function getHtml()
    {
        preg_match_all($this->href_patten, (string) $this->current_data, $href_match);
        foreach ($href_match[1] as $value) {
            $page = $this->normalizePath($value);
            if ($page === false || !preg_match('/\.html$/', $page)) {
                continue;
            }
            if (in_array($page, $this->_html_resource, true)) {
                continue;
            }
            $this->_html_resource[] = $page;
            if (! file_exists($this->themeDir.$page)) {
                $this->downloadFile($page);
            }
        }
    }

    /**
     * Downloads the scripts, images and <link> targets referenced by the
     * current page.
     *
     * @param string $base_dir Directory of the current page relative to the
     *                         theme root ('' or '.' for the root itself).
     */
    private function downloadResource($base_dir = '')
    {
        // Same order as before: JS, then images, then CSS.
        $patterns = array(
            $this->script_pattern,
            $this->image_pattern,
            $this->link_pattern,
        );

        foreach ($patterns as $pattern) {
            preg_match_all($pattern, (string) $this->current_data, $matches);
            foreach ($matches[1] as $value) {
                $path = $this->normalizePath($value, $base_dir);
                if ($path !== false) {
                    $this->downloadFile($path);
                }
            }
        }
    }

    /**
     * Downloads one file from the theme base URL into the local mirror.
     *
     * @param string $filename Path relative to the theme root.
     * @return bool True when the file exists locally afterwards.
     */
    private function downloadFile($filename)
    {
        $file_location = $this->themeDir.$filename;
        if (file_exists($file_location))
        {
            echo "file already download $file_location\n";
            return true;
        }

        if (!$this->createDirectory($filename)) {
            echo "fail download file ".$file_location."\n";
            return false;
        }

        $curl = curl_init($this->theme_base_url.$filename);
        if ($curl === false) {
            echo "fail download file ".$file_location."\n";
            return false;
        }
        curl_setopt($curl, CURLOPT_RETURNTRANSFER, 1);
        // Treat HTTP errors (>= 400) as failures instead of saving the
        // server's error page as if it were the resource.
        curl_setopt($curl, CURLOPT_FAILONERROR, true);
        $resourceData = curl_exec($curl);
        curl_close($curl);

        if ($resourceData === false
            || file_put_contents($file_location, $resourceData) === false
        ) {
            echo "fail download file ".$file_location."\n";
            return false;
        }

        echo "download file ".$file_location."\n";
        return true;
    }

    /**
     * Ensures the local directory that will contain $str exists.
     *
     * @param string $str File path relative to the theme root.
     * @return bool True when the directory exists afterwards.
     */
    private function createDirectory($str)
    {
        $dir = dirname($this->themeDir.$str);
        if (is_dir($dir)) {
            return true;
        }
        // The second is_dir() covers a directory created concurrently.
        return mkdir($dir, 0755, true) || is_dir($dir);
    }

    /**
     * Turns a URL found in remote HTML/CSS into a safe path relative to the
     * theme root.
     *
     * The input is untrusted: absolute URLs, data: URIs, site-absolute paths
     * and anything that would climb above the theme root are rejected so a
     * hostile page cannot make the crawler write outside the mirror.
     *
     * @param string $value    Raw URL as found in the document.
     * @param string $base_dir Directory (relative to the theme root) of the
     *                         document the URL was found in.
     * @return string|false Normalized relative path, or false when rejected.
     */
    private function normalizePath($value, $base_dir = '')
    {
        // Query string and fragment do not belong to the file name.
        $value = preg_replace('/[?#].*$/', '', trim((string) $value));
        if ($value === null || $value === '') {
            return false;
        }
        // Scheme ("http:", "data:", ...), leading "/" (also "//host") or
        // backslashes: not a plain relative path.
        if (preg_match('/^([a-z][a-z0-9+.\-]*:|\/)/i', $value)
            || strpos($value, '\\') !== false
        ) {
            return false;
        }

        $segments = array();
        if ($base_dir !== '' && $base_dir !== '.') {
            $segments = explode('/', trim($base_dir, '/'));
        }
        foreach (explode('/', $value) as $segment) {
            if ($segment === '' || $segment === '.') {
                continue;
            }
            if ($segment === '..') {
                if (empty($segments)) {
                    // Would escape the theme root.
                    return false;
                }
                array_pop($segments);
                continue;
            }
            $segments[] = $segment;
        }

        return empty($segments) ? false : implode('/', $segments);
    }

    /**
     * Downloads the images referenced via url(...) in the given stylesheets.
     *
     * @todo Also follow stylesheets that are loaded from within CSS.
     *
     * @param array|null $style_path Stylesheet paths relative to the theme
     *                               root; null uses the built-in list.
     */
    private function getStyleImages($style_path = null)
    {
        if ($style_path === null) {
            $style_path = array(
                "css/style.default.css",
                "css/prettyPhoto.css",
                'css/bootstrap.min.css',
                'css/bootstrap-override.css',
                'css/weather-icons.min.css',
                'css/jquery-ui-1.10.3.css',
                'css/font-awesome.min.css',
                'css/animate.min.css',
                'css/animate.delay.css',
                'css/toggles.css',
                'css/select2.css',
                'css/lato.css',
                'css/roboto.css'
            );
        }

        foreach ($style_path as $value) {
            $data = $this->getHtmlData($value);
            if ($data === false) {
                continue;
            }

            // Non-greedy and bounded by ")" so several url(...) on one line
            // (minified CSS) are matched individually; quotes are optional.
            preg_match_all('/url\(\s*[\'"]?([^\'")]+?)[\'"]?\s*\)/i', $data, $match);
            foreach ($match[1] as $image) {
                // url() is relative to the stylesheet, not to the theme root.
                $realImagePath = $this->normalizePath($image, dirname($value));
                if ($realImagePath === false) {
                    continue;
                }

                if (preg_match('/\.(png|jpe?g|gif)$/i', $realImagePath))
                {
                    $this->downloadFile($realImagePath);
                }
            }
        }
    }

}
 
//"bracket","http://themepixels.com/demo/webpage/bracket/"
 
// Mirror the "bracket" demo theme into ./theme/bracket/.
$themeName = "bracket";
$themeUrl = "http://themepixels.com/demo/webpage/bracket/";

$nr = new NetworkReptiles($themeName, $themeUrl);
$nr->getTheme();
Nach dem Login kopieren

以上就是爬取一个themeForest的模版的内容,更多相关内容请关注PHP中文网(www.php.cn)!


Verwandte Etiketten:
Quelle:php.cn
Erklärung dieser Website
Der Inhalt dieses Artikels wird freiwillig von Internetnutzern beigesteuert und das Urheberrecht liegt beim ursprünglichen Autor. Diese Website übernimmt keine entsprechende rechtliche Verantwortung. Wenn Sie Inhalte finden, bei denen der Verdacht eines Plagiats oder einer Rechtsverletzung besteht, wenden Sie sich bitte an admin@php.cn
Beliebte Empfehlungen
Beliebte Tutorials
Mehr>
Neueste Downloads
Mehr>
Web-Effekte
Quellcode der Website
Website-Materialien
Frontend-Vorlage