Home > php教程 > PHP源码 > body text

爬取一个 ThemeForest 的模板

PHP中文网
Release: 2016-05-26 08:19:10
Original
1596 people have browsed it

@version 1.0
这个脚本只能爬取静态的网站模板，代码逻辑还需要优化。
正确的逻辑应该是从首页开始爬，
依次获取 HTML、JS、IMAGE、CSS，然后从 CSS 中分析额外加载的 CSS，最后分析所有 CSS 中包含的图片引用。

1. [代码][PHP]代码

<?php
/**
 * Mirrors a static theme demo (such as the previews linked from
 * http://themeforest.net) into ./theme/<name>/ below the current directory.
 *
 * Starting from index.html the crawler saves every *.html page that
 * index.html links to, then the scripts, images and stylesheets referenced
 * by each of those pages. Markup is matched with simple regular expressions
 * rather than an HTML parser, so only plain static pages are handled.
 *
 * The base URL is concatenated with relative paths as-is, so it must end
 * with a "/".
 */
class NetworkReptiles
{
    // Patterns capturing the target of <a href>, <script src>, <img src> and
    // <link href>. Each one only matches when that attribute directly follows
    // the tag name.
    protected $href_patten = "/<a href=[\'\"]?([^\'\" ]+).*?>/";
    protected $script_pattern = "/<script src=[\'\"]?([^\'\" ]+).*?>/";
    protected $image_pattern = "/<img src=[\'\"]?([^\'\" ]+).*?>/";

    protected $link_pattern = "/<link href=[\'\"]?([^\'\" ]+).*?>/";

    // Theme name; also the name of the local sub-directory under ./theme/.
    private $theme_name = null;
    // Remote base URL of the theme.
    private $theme_base_url = null;

    // Local directory the theme is stored in (set by getTheme()).
    private $themeDir = null;
    // HTML of the page currently being analysed.
    private $current_data = null;
    // Relative paths of the HTML pages discovered from index.html.
    private $_html_resource = array();

    /**
     * @param string|false $name theme name (local directory name)
     * @param string|false $url  remote base URL, ending with "/"
     */
    public function __construct($name=false, $url=false){
        $this->theme_name = ($name) ? $name : false;
        $this->theme_base_url = ($url) ? $url : false;
    }

    /**
     * Selects the theme to download.
     *
     * @param array $opt array with the keys 'name' and 'url'
     */
    public function setTheme($opt)
    {
        $this->theme_name = $opt['name'];
        $this->theme_base_url = $opt['url'];
    }

    /**
     * Downloads the configured theme: index.html, the pages it links to and
     * the resources of every one of those pages. Progress is echoed.
     */
    public function getTheme()
    {
        set_time_limit(0);

        if (!$this->theme_name || !$this->theme_base_url) {
            echo "theme name and url must be set before calling getTheme()\n";
            return;
        }

        // Crawling starts from the home page.
        $this->themeDir = getcwd().'/theme/'.$this->theme_name."/";
        // Create the target directory; recursive because ./theme itself may
        // not exist yet. The second is_dir() covers a concurrent creation.
        if (!is_dir($this->themeDir)
            && !@mkdir($this->themeDir, 0755, true)
            && !is_dir($this->themeDir)
        ) {
            echo "fail create directory ".$this->themeDir."\n";
            return;
        }

        // Start from an empty page list so the instance can be reused.
        $this->_html_resource = array();

        // getStyleImages() is left disabled, as before: it works on a
        // hard-coded list of stylesheets.
        //$this->getStyleImages();

        // Fetch the home page.
        $this->current_data = $this->getHtmlData("index.html");
        // Download every HTML page it links to.
        $this->getHtml();

        // Download the resources of the home page.
        $this->showMsg("index.html");
        $this->downloadResource();

        foreach ($this->_html_resource as $html) {
            $this->showMsg($html);
            // Download the resources of each linked page.
            $this->current_data = $this->getHtmlData($html);
            $this->downloadResource();
        }
    }

    /**
     * Prints a header naming the page whose resources are about to be fetched.
     */
    private function showMsg($html)
    {
        echo "download resource $html\n";
        echo str_repeat("-", 30)."\n";
    }

    /**
     * Returns the contents of a theme file, reading the local copy when one
     * exists and otherwise fetching it from the base URL and caching it.
     *
     * @param string $file_name path relative to the theme root
     * @return string file contents, or '' when the file could not be read
     */
    private function getHtmlData($file_name)
    {
        $file_path = $this->themeDir.$file_name;
        if (file_exists($file_path)) {
            $data = file_get_contents($file_path);
            return ($data === false) ? '' : $data;
        }

        $data = file_get_contents($this->theme_base_url.$file_name);
        if ($data === false) {
            // Do not cache a failed fetch: an empty file on disk would be
            // mistaken for a valid local copy on the next run.
            echo "fail download file ".$file_path."\n";
            return '';
        }

        $this->createDirectory($file_name);
        file_put_contents($file_path, $data);
        return $data;
    }

    /**
     * Collects the *.html links of the current page into the page list and
     * downloads the ones that are not stored locally yet.
     */
    private function getHtml()
    {
        preg_match_all($this->href_patten, $this->current_data, $href_match);
        foreach ($href_match[1] as $value) {
            if (!preg_match("/^(.*)?\.(html)$/", $value) || !$this->isLocalPath($value)) {
                continue;
            }
            // A page is usually linked many times (menus, footers).
            if (in_array($value, $this->_html_resource, true)) {
                continue;
            }
            $this->_html_resource[] = $value;

            if (!file_exists($this->themeDir.$value)) {
                $this->createDirectory($value);
                $this->downloadFile($value);
            }
        }
    }

    /**
     * Downloads the scripts, images and stylesheets referenced by the current
     * page, skipping references that do not point inside the theme.
     */
    private function downloadResource()
    {
        $patterns = array(
            $this->script_pattern, // JS scripts
            $this->image_pattern,  // images
            $this->link_pattern,   // CSS stylesheets
        );

        foreach ($patterns as $pattern) {
            preg_match_all($pattern, $this->current_data, $match);

            foreach ($match[1] as $value) {
                if (!$this->isLocalPath($value)) {
                    continue;
                }
                $this->createDirectory($value);
                $this->downloadFile($value);
            }
        }
    }

    /**
     * Downloads one file from the base URL into the theme directory unless a
     * local copy already exists.
     *
     * @param string $filename path relative to the theme root
     * @return bool true when the file exists locally afterwards
     */
    private function downloadFile($filename)
    {
        $file_location = $this->themeDir.$filename;
        if (file_exists($file_location))
        {
            echo "file already download $file_location\n";
            return true;
        }

        $curl = curl_init($this->theme_base_url.$filename);
        if ($curl === false) {
            echo "fail download file ".$file_location."\n";
            return false;
        }
        curl_setopt($curl, CURLOPT_RETURNTRANSFER, 1);
        // Make HTTP errors (status >= 400) fail the transfer so an error page
        // is never stored as if it were the requested asset.
        curl_setopt($curl, CURLOPT_FAILONERROR, 1);
        $resourceData = curl_exec($curl);
        curl_close($curl);

        if ($resourceData === false
            || file_put_contents($file_location, $resourceData) === false
        ) {
            echo "fail download file ".$file_location."\n";
            return false;
        }

        echo "download file ".$file_location."\n";
        return true;
    }

    /**
     * Creates, inside the theme directory, the directory part of a relative
     * file path (all missing levels at once).
     *
     * @param string $str file path relative to the theme root
     */
    private function createDirectory($str)
    {
        $relative_dir = trim(dirname($str), "/");
        // dirname() yields "." for a bare file name: nothing to create.
        if ($relative_dir === '' || $relative_dir === '.') {
            return;
        }

        $target = rtrim($this->themeDir, "/")."/".$relative_dir;
        if (!is_dir($target) && !@mkdir($target, 0755, true) && !is_dir($target)) {
            echo "fail create directory ".$target."\n";
        }
    }

    /**
     * Tells whether a reference carries a URL scheme (http:, data:, ...) or
     * is protocol-relative (//host/...), i.e. cannot be a path inside the theme.
     */
    private function isExternalUrl($path)
    {
        return preg_match('#^(?:[a-z][a-z0-9+.\-]*:|//)#i', $path) === 1;
    }

    /**
     * Tells whether a reference found in remote markup may be fetched relative
     * to the base URL and stored under the theme directory. References with
     * ".." segments are refused because they would be written outside of it.
     */
    private function isLocalPath($path)
    {
        if ($path === '' || $this->isExternalUrl($path)) {
            return false;
        }
        return !in_array('..', explode('/', $path), true);
    }

    /**
     * Resolves $relative against $base_dir, collapsing "." and ".." segments.
     * ".." never climbs above the theme root.
     *
     * @return string normalised path relative to the theme root
     */
    private function resolvePath($base_dir, $relative)
    {
        $segments = array();
        // A reference starting with "/" is taken from the theme root.
        if ($base_dir !== '' && $base_dir !== '.' && substr($relative, 0, 1) !== '/') {
            $segments = explode('/', trim($base_dir, '/'));
        }

        foreach (explode('/', $relative) as $segment) {
            if ($segment === '' || $segment === '.') {
                continue;
            }
            if ($segment === '..') {
                array_pop($segments);
                continue;
            }
            $segments[] = $segment;
        }
        return implode('/', $segments);
    }

    /**
     * Downloads the png/jpg images referenced through url(...) in a fixed
     * list of stylesheets.
     *
     * @todo also download the extra CSS files that the stylesheets load
     */
    private function getStyleImages()
    {
        $style_path = array(
            "css/style.default.css",
            "css/prettyPhoto.css",
            'css/bootstrap.min.css',
            'css/bootstrap-override.css',
            'css/weather-icons.min.css',
            'css/jquery-ui-1.10.3.css',
            'css/font-awesome.min.css',
            'css/animate.min.css',
            'css/animate.delay.css',
            'css/toggles.css',
            'css/select2.css',
            'css/lato.css',
            'css/roboto.css'
            );

        foreach ($style_path as $value) {
            $data = $this->getHtmlData($value);
            // url() references are relative to the stylesheet's directory.
            $css_dir = dirname($value);

            // One match per url(...), with optional quotes stripped; a greedy
            // match would swallow everything up to the last ")" on the line.
            preg_match_all("/url\(\s*[\'\"]?([^\'\")]+?)[\'\"]?\s*\)/", $data, $match);
            foreach ($match[1] as $image) {
                $image = trim($image);
                if (!preg_match("/\.(png|jpg)$/i", $image) || $this->isExternalUrl($image)) {
                    continue;
                }

                $realImagePath = $this->resolvePath($css_dir, $image);
                if ($realImagePath === '') {
                    continue;
                }
                $this->createDirectory($realImagePath);
                $this->downloadFile($realImagePath);
            }
        }
    }

}
 
//"bracket","http://themepixels.com/demo/webpage/bracket/"
 
// Example run: download the "bracket" demo theme from themepixels.com.
$themeName = "bracket";
$themeUrl = "http://themepixels.com/demo/webpage/bracket/";

$nr = new NetworkReptiles($themeName, $themeUrl);
$nr->getTheme();
Copy after login

以上就是爬取一个themeForest的模版的内容,更多相关内容请关注PHP中文网(www.php.cn)!


Related labels:
source:php.cn
Statement of this Website
The content of this article is voluntarily contributed by netizens, and the copyright belongs to the original author. This site does not assume corresponding legal responsibility. If you find any content suspected of plagiarism or infringement, please contact admin@php.cn
Popular Recommendations
Popular Tutorials
More>
Latest Downloads
More>
Web Effects
Website Source Code
Website Materials
Front End Template