基于Snoopy的PHP近似完美获取网站编码的代码

基于Snoopy的PHP近似完美获取网站编码的代码_php实例

WBOYWBOYWBOYWBOYWBOYWBOYWBOYWBOYWBOYWBOYWBOYWBOYWB

Freigeben： 2016-05-17 09:15:02

Original

1158 Leute haben es durchsucht

首先需要从网上下载 Snoopy.class.php，并与下文的 WebCrawl.class.php 一起放入 lib 目录。
调用方法如下：

<?php
// Load Snoopy first: WebCrawl instantiates it when it fetches the page.
require 'lib/Snoopy.class.php';
// This file contains the WebCrawl class listed further down.
require 'lib/WebCrawl.class.php';

// Build a crawler for the target site and print the charset it detects.
$crawler = new WebCrawl('http://www.baidu.com');
$detectedCharset = $crawler->getCharset();
echo $detectedCharset;
?>

WebCrawl.class.php 的完整代码如下：

<?php
/**
 * Fetches a web page through Snoopy, works out which character set the page
 * is encoded in, and extracts its title, keywords, description and IP.
 *
 * The page is fetched lazily, at most once per instance. The charset is
 * detected on the raw response bytes and cached; the stored response body is
 * then transcoded to UTF-8 (when possible) so the extraction helpers can
 * work on UTF-8 text.
 *
 * Requires the Snoopy class to be loaded before the first fetch.
 */
class WebCrawl
{
    /** URL handed to the constructor; fetched on first use. */
    private $url;

    /** Snoopy instance holding the response, or null before the first fetch. */
    private $request;

    /** Lower-case charset detected on the raw response, or false if unknown. */
    private $charset = false;

    /**
     * Charset names (lower-case) accepted from a <meta ... charset=...>
     * declaration. Declarations that are not listed here are ignored and
     * detection falls through to the HTTP headers.
     */
    public $charset_arr = array(
        'gb2312',
        'utf-8',
        'big5',
        'gbk',
        'ascii',
        'cp936',
        'ibm037',
        'ibm437',
        'ibm500',
        'asmo-708',
        'dos-720',
        'ibm737',
        'ibm775',
        'ibm850',
        'ibm852',
        'ibm855',
        'ibm857',
        'ibm00858',
        'ibm861',
        'ibm860',
        'dos-862',
        'ibm863',
        'ibm864',
        'ibm865',
        'cp866',
        'ibm869',
        'ibm870',
        'windows-874',
        'cp875',
        'shift_jis',
        'ks_c_5601-1987',
        'ibm1026',
        'ibm01047',
        // The ibm0104x names below come from the original list and are kept
        // for backward compatibility.
        'ibm01040',
        'ibm01041',
        'ibm01042',
        'ibm01043',
        'ibm01044',
        'ibm01045',
        'ibm01046',
        'ibm01048',
        'ibm01049',
        // Registered names of the euro-enabled EBCDIC code pages 1140-1149.
        'ibm01140',
        'ibm01141',
        'ibm01142',
        'ibm01143',
        'ibm01144',
        'ibm01145',
        'ibm01146',
        'ibm01147',
        'ibm01148',
        'ibm01149',
        'utf-16',
        'unicodefffe',
        'windows-1250',
        'windows-1251',
        'windows-1252',
        'windows-1253',
        'windows-1254',
        'windows-1255',
        'windows-1256',
        'windows-1257',
        'windows-1258',
        'johab',
        'macintosh',
        'x-mac-japanese',
        'x-mac-chinesetrad',
        'x-mac-korean',
        'x-mac-arabic',
        'x-mac-hebrew',
        'x-mac-greek',
        'x-mac-cyrillic',
        'x-mac-chinesesimp',
        'x-mac-romanian',
        'x-mac-ukrainian',
        'x-mac-thai',
        'x-mac-ce',
        'x-mac-icelandic',
        'x-mac-turkish',
        'x-mac-croatian',
        'x-chinese-cns',
        'x-cp20001',
        'x-chinese-eten',
        'x-cp20003',
        'x-cp20004',
        'x-cp20005',
        'x-ia5',
        'x-ia5-german',
        'x-ia5-swedish',
        'x-ia5-norwegian',
        'us-ascii',
        'x-cp20261',
        'x-cp20269',
        'ibm273',
        'ibm277',
        'ibm278',
        'ibm280',
        'ibm284',
        'ibm285',
        'ibm290',
        'ibm420',
        'ibm423',
        'ibm424',
        'x-ebcdic-koreanextended',
        'ibm-thai',
        'koi8-r',
        'ibm871',
        'ibm880',
        'ibm905',
        'ibm00924',
        'x-cp20936',
        'x-cp20949',
        'cp1025',
        'koi8-u',
        'iso-8859-1',
        'iso-8859-2',
        'iso-8859-3',
        'iso-8859-4',
        'iso-8859-5',
        'iso-8859-6',
        'iso-8859-7',
        'iso-8859-8',
        'iso-8859-9',
        'iso-8859-13',
        'iso-8859-15',
        'x-europa',
        'iso-8859-8-i',
        'iso-2022-jp',
        'csiso2022jp',
        'iso-2022-kr',
        'x-cp50227',
        'euc-jp',
        'euc-cn',
        'euc-kr',
        'hz-gb-2312',
        'gb18030',
        'x-iscii-de',
        'x-iscii-be',
        'x-iscii-ta',
        'x-iscii-te',
        'x-iscii-as',
        'x-iscii-or',
        'x-iscii-ka',
        'x-iscii-ma',
        'x-iscii-gu',
        'x-iscii-pa',
        'utf-7',
        'utf-32',
        'utf-32be',
    );

    /**
     * @param string $url Address of the page to inspect. Nothing is fetched
     *                    until one of the public getters is called.
     */
    public function __construct($url)
    {
        $this->url = $url;
    }

    /**
     * Fetches the page once and prepares the response for the getters.
     *
     * On the first call the page is downloaded, its charset is detected on
     * the untouched bytes and cached, and the stored body is transcoded to
     * UTF-8. Later calls only report whether that first fetch succeeded.
     *
     * The body is deliberately NOT lower-cased: doing so corrupts the trail
     * bytes of multi-byte encodings such as GBK/Big5 and destroys the case
     * of the title and keywords. All matching below is case-insensitive.
     *
     * @param string $url Address to fetch.
     * @return bool True when the response status equals 200.
     */
    private function open($url)
    {
        if ($this->request !== null) {
            return $this->request->status == 200;
        }

        $this->request = new Snoopy();
        $this->request->fetch($url);
        if ($this->request->status != 200) {
            return false;
        }

        $raw = is_string($this->request->results) ? $this->request->results : '';
        $headers = isset($this->request->headers) ? $this->request->headers : array();

        // Detect before converting: once the body is UTF-8 the byte-level
        // checks would no longer describe the original encoding.
        $this->charset = $this->detectCharset($raw, $headers);
        $this->request->results = $this->toUtf8($raw, $this->charset);

        return true;
    }

    /**
     * Returns the page's title, keywords, description and IPv4 address.
     *
     * @return array Keys 'title', 'keywords', 'desc' and 'ip'; every value
     *               is a string and stays '' when it cannot be determined
     *               (including when the page could not be fetched).
     */
    public function getWebinfo()
    {
        $info = array(
            'title' => '',
            'keywords' => '',
            'desc' => '',
            'ip' => '',
        );
        if (!$this->open($this->url)) {
            return $info;
        }

        $html = (string) $this->request->results;

        // Allow attributes on <title> and markup inside it; strip_tags()
        // removes the latter.
        if (preg_match('/<title[^>]*>(.*?)<\/title>/si', $html, $titleMatch)) {
            $info['title'] = strip_tags($titleMatch[1]);
        }

        $metaTags = $this->parseMetaTags($html);
        if (isset($metaTags['keywords'])) {
            $info['keywords'] = $metaTags['keywords'];
        }
        if (isset($metaTags['description'])) {
            $info['desc'] = $metaTags['description'];
        }

        $info['ip'] = $this->resolveIp((string) $this->url);

        return $info;
    }

    /**
     * Guesses whether a byte string is UTF-8 or GB2312.
     *
     * Used to double-check pages that declare gb2312: a string containing
     * non-ASCII bytes is reported as 'utf-8' when the whole string is valid
     * UTF-8 and as 'gb2312' otherwise.
     *
     * @param string $string Raw bytes to inspect.
     * @param mixed  $o      Unused; kept so existing callers keep working.
     * @return string 'utf-8', 'gb2312', or '' when the string is pure ASCII
     *                and therefore gives no evidence either way.
     */
    public function t($string, $o)
    {
        $string = (string) $string;

        if (preg_match('/[\x80-\xff]/', $string) !== 1) {
            return '';
        }

        // The empty pattern with /u only succeeds on well-formed UTF-8.
        return preg_match('//u', $string) === 1 ? 'utf-8' : 'gb2312';
    }

    /**
     * Replaces five-digit decimal character references (e.g. &#20013;) with
     * the characters they stand for.
     *
     * For backward compatibility the result is additionally run through
     * JSON string unescaping when, and only when, it forms a valid JSON
     * string body (no raw quotes or control characters). Real HTML never
     * does, so whole pages pass through with just the references decoded
     * instead of being turned into null.
     *
     * @param string $str  Text containing the references.
     * @param string $code Target charset; anything other than 'utf-8' is
     *                     converted with iconv().
     * @return string|false Decoded text; false when iconv() cannot convert
     *                      to $code.
     */
    public function uni_decode($str, $code = 'utf-8')
    {
        $str = (string) $str;

        $decoded = preg_replace_callback(
            '/&#(\d{5});/',
            static function ($match) {
                // Handles code points above U+FFFF and leaves invalid
                // references untouched.
                return html_entity_decode($match[0], ENT_QUOTES, 'UTF-8');
            },
            $str
        );
        if ($decoded === null) {
            $decoded = $str;
        }

        $unescaped = json_decode('"' . $decoded . '"');
        if (is_string($unescaped)) {
            $decoded = $unescaped;
        }

        if ($code != 'utf-8') {
            $decoded = iconv('utf-8', $code, $decoded);
        }

        return $decoded;
    }

    /**
     * Returns the charset of the page as it was served.
     *
     * @return string|false Lower-case charset name, or false when the page
     *                      could not be fetched or no charset was detected.
     */
    public function getCharset()
    {
        if (!$this->open($this->url)) {
            return false;
        }

        return $this->charset;
    }

    /**
     * Detects the charset of a raw response.
     *
     * Order of trust: the <meta> declaration (only names listed in
     * $charset_arr; gb2312 must also pass the byte check in t()), then the
     * charset parameter of the HTTP headers, then mb_detect_encoding().
     *
     * @param string $html    Raw response body.
     * @param array  $headers Raw response header lines.
     * @return string|false Lower-case charset name, or false.
     */
    private function detectCharset($html, $headers)
    {
        $declared = '';
        if (preg_match('/<meta[^>]+?charset=[^\w]?([-\w]+)/i', $html, $metaMatch)) {
            $declared = strtolower($metaMatch[1]);
        }

        if ($declared !== '' && in_array($declared, $this->charset_arr, true)) {
            if ($declared !== 'gb2312') {
                return $declared;
            }
            // Many UTF-8 pages still declare gb2312; trust it only when the
            // bytes agree.
            if ($this->t($html, $declared) === $declared) {
                return $declared;
            }
        }

        if (!empty($headers)) {
            $headerText = strtolower(implode('|||', (array) $headers));
            if (preg_match("/charset=[^\w]?([-\w]+)/is", $headerText, $headerMatch)
                && $headerMatch[1] !== ''
            ) {
                return $headerMatch[1];
            }
        }

        if (function_exists('mb_detect_encoding')) {
            $wanted = array(
                "UTF-8", "GB2312", "GBK", "BIG5", "ASCII", "EUC-JP", "Shift_JIS",
                "CP936", "ISO-8859-1", "JIS", "eucjp-win", "sjis-win",
            );
            // Drop names this mbstring build does not know; PHP 8 throws on
            // unknown encodings.
            $candidates = array();
            foreach ($wanted as $name) {
                if ($this->mbEncodingName($name) !== null) {
                    $candidates[] = $name;
                }
            }
            if (!empty($candidates)) {
                $guess = mb_detect_encoding($html, $candidates, true);
                if (is_string($guess) && $guess !== '') {
                    return strtolower($guess);
                }
            }
        }

        return false;
    }

    /**
     * Converts a raw body to UTF-8 according to the detected charset.
     *
     * @param string       $text    Raw body.
     * @param string|false $charset Detected charset (lower-case) or false.
     * @return string UTF-8 text, or $text unchanged when no conversion is
     *                needed or possible.
     */
    private function toUtf8($text, $charset)
    {
        if (!is_string($charset) || $charset === '' || $charset === 'utf-8') {
            return $text;
        }

        if ($charset === 'windows-1252') {
            // Pages labelled windows-1252 get their numeric character
            // references decoded. Bytes are transcoded first, unless the
            // body is already valid UTF-8 (a mislabelled page).
            if (preg_match('//u', $text) !== 1) {
                $text = $this->transcode($text, $charset);
            }
            $decoded = $this->uni_decode($text);

            return is_string($decoded) ? $decoded : $text;
        }

        return $this->transcode($text, $charset);
    }

    /**
     * Transcodes $text from $charset to UTF-8 with mbstring, falling back
     * to iconv; returns $text unchanged when neither knows the charset.
     */
    private function transcode($text, $charset)
    {
        $mbName = $this->mbEncodingName($charset);
        if ($mbName !== null) {
            $converted = mb_convert_encoding($text, 'UTF-8', $mbName);
            if (is_string($converted)) {
                return $converted;
            }
        }

        if (function_exists('iconv')) {
            // iconv() raises a notice for unknown charsets or bad input;
            // swallow it locally and judge by the return value instead.
            set_error_handler(static function () {
                return true;
            });
            $converted = iconv($charset, 'UTF-8//IGNORE', $text);
            restore_error_handler();
            if (is_string($converted)) {
                return $converted;
            }
        }

        return $text;
    }

    /**
     * Maps a charset name or alias to the name mbstring uses for it.
     *
     * @param string $charset Name to look up (case-insensitive).
     * @return string|null mbstring's encoding name, or null when mbstring
     *                     is missing or does not support the charset.
     */
    private function mbEncodingName($charset)
    {
        static $known = null;

        if ($known === null) {
            $known = array();
            if (function_exists('mb_list_encodings')) {
                foreach (mb_list_encodings() as $name) {
                    $known[strtolower($name)] = $name;
                    $aliases = mb_encoding_aliases($name);
                    if (!is_array($aliases)) {
                        continue;
                    }
                    foreach ($aliases as $alias) {
                        $key = strtolower($alias);
                        if (!isset($known[$key])) {
                            $known[$key] = $name;
                        }
                    }
                }
            }
        }

        $key = strtolower((string) $charset);

        return isset($known[$key]) ? $known[$key] : null;
    }

    /**
     * Collects <meta name="..." content="..."> pairs from a document.
     *
     * Attributes may appear in any order, use single, double or no quotes,
     * and be mixed with other attributes. When a name occurs more than
     * once, the last occurrence wins.
     *
     * @param string $html Document text.
     * @return array Map of lower-cased meta name to its content value.
     */
    private function parseMetaTags($html)
    {
        $tags = array();
        if (!preg_match_all('/<\s*meta\b([^>]*)>/i', $html, $metaMatches)) {
            return $tags;
        }

        $attrPattern = '/([a-z][-\w:.]*)\s*=\s*(?|"([^"]*)"|\'([^\']*)\'|([^\s"\'>]+))/i';
        foreach ($metaMatches[1] as $attrText) {
            if (!preg_match_all($attrPattern, $attrText, $attrMatches, PREG_SET_ORDER)) {
                continue;
            }
            $attrs = array();
            foreach ($attrMatches as $attr) {
                $attrs[strtolower($attr[1])] = $attr[2];
            }
            if (isset($attrs['name'], $attrs['content'])) {
                $key = strtolower(trim($attrs['name']));
                if ($key !== '') {
                    $tags[$key] = $attrs['content'];
                }
            }
        }

        return $tags;
    }

    /**
     * Resolves the host of a URL to an IPv4 address.
     *
     * @param string $url URL with or without a scheme.
     * @return string Dotted IPv4 address, or '' when the host is missing or
     *                does not resolve.
     */
    private function resolveIp($url)
    {
        $host = parse_url($url, PHP_URL_HOST);
        if (!is_string($host) || $host === '') {
            // No scheme ("www.example.com/path"): parse_url() then treats
            // everything as a path, so retry with one prepended.
            $host = parse_url('http://' . ltrim($url, '/'), PHP_URL_HOST);
        }
        if (!is_string($host) || $host === '' || strlen($host) > 255) {
            return '';
        }

        // gethostbyname() returns its argument unchanged on failure, so the
        // result must be validated as an address.
        $ip = gethostbyname($host);
        if (filter_var($ip, FILTER_VALIDATE_IP, FILTER_FLAG_IPV4) === false) {
            return '';
        }

        return $ip;
    }
}