充分利用正则的强大字串处理能力,使用简单,功能也比较简单,能满足一般应用,功能也在不断完善中,使用过程:设置一个初始url,添加导航规则,添加采集字段和规则,保存输出即可
<?php set_time_limit(0); header("Content-type: text/html; charset=utf-8"); /** * 采集程序类 * @author shooting * @version 1.0.0 */ class spider { /** * 采集的终端页地址 * * @var array */ var $pages = array(); /** * 采集结果 * * @var array */ var $result = array(); /** * 第一层链接页面 * * @var array */ var $startUrls = array(); /** * 超时时间 * * @var integer */ var $timeout; /** * 正在处理的文件内容 * * @var string */ var $httpContent; /** * 正在处理的文件头 * * @var array */ var $httpHead=array(); /** * 自定义的head数组 * * @var array */ var $putHead = array(); /** * 采集字段与规则数组 * * @var array */ var $field_arr = array(); /** * 采集层次数 * * @var interger */ var $deep; /** * 采集层次结构 * * @var array */ var $layout_arr = array(); /** * 采集限制条数 * * @var integer */ var $limit = 0; /** * 程序运行时间 * * @var float */ var $runtime = 0; /** * 被采集页面编码 * * @var string */ var $charset = 'UTF-8'; /** * 页面引用地址 * * @var string */ var $httpreferer; var $pagelimit = 0; var $filepath = './'; function spider() { $this->timeout = 30; } /** * 运行采集 * * @return array */ function run() { $begintime = $this->microtime_float(); $cnt = 1; foreach ($this->startUrls as $starturl){ /** * 解析出起始地址中的页码区间 */ if(preg_match("~\{(\d+),(\d+)\}~",$starturl,$pagenum)){ $pagebegin = intval($pagenum[1]); $pageend = intval($pagenum[2]); for(;$pagebegin<=$pageend;$pagebegin++){ $starturl = str_replace($pagenum[0],$pagebegin,$starturl); $urllists = $this->getLists($this->layout_arr[0]['pattern'],$this->getContent($starturl)); foreach ($urllists as $url){ if(($this->limit > 0 && $cnt <= $this->limit)||$this->limit == 0) { $this->filterContent($this->getContent($url,$starturl)); $cnt++; } } } }else{ $urllists = $this->getLists($this->layout_arr[0]['pattern'],$this->getContent($starturl)); foreach ($urllists as $url){ if(($this->limit > 0 && $cnt <= $this->limit)||$this->limit == 0) { $this->filterContent($this->getContent($url,$starturl)); $cnt++; } } } } $this->runtime = $this->microtime_float()-$begintime; return $this->result; } /** * 从文字段中根据规则提取出url列表 * * @param string $pattern * @param string $content * @return Array */ function getLists($pattern='',$content='') { if(strpos($pattern,'{*}') === false)return array($pattern); $pattern = preg_quote($pattern); $pattern = str_replace('\{\*\}','([^\'\">]*)',$pattern); $pattern = "~".$pattern."~is"; preg_match_all($pattern,$content,$preg_rs); return array_unique($preg_rs[0]); } /** * 获取指定url的html内容包括头 * * @param string $url * @return string */ function getContent($url,$referer = '') { $url = $this->urlRtoA($url,$referer); preg_match("/(http:\/\/)([^:\/]*):?(\d*)(\/?.*)/i",$url,$preg_rs); $host = $preg_rs[2]; $port = empty($preg_rs[3])?80:$preg_rs[3]; $innerUrl = $preg_rs[4]; $fsp = fsockopen($host,$port,$errno,$errstr,$this->timeout); if(!$fsp)$this->log($errstr.'('.$errno.')'); $output = "GET $url HTTP/1.0\r\nHost: $host\r\n"; if(!isset($this->putHead['Accept']))$this->putHead['Accept']= "*/*"; if(!isset($this->putHead['User-Agent']))$this->putHead['User-Agent']='Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2)'; if(!isset($this->putHead['Refer'])){ $this->putHead['Refer'] = ($referer == '')?'http://'.$host:$referer; } foreach ($this->putHead as $headname => $headvalue){ $output .= trim($headname).': '.trim($headvalue)."\r\n"; } $output .= "Connection: close\r\n\r\n"; fwrite($fsp,$output); $content = ''; while (!feof($fsp)) { $content .= fgets($fsp,256); } fclose($fsp); $this->getHead($content); $this->httpContent = $content; if(strtoupper($this->charset) != 'UTF-8'){ $content = iconv($this->charset,'utf-8',$content); }else if(!empty($this->httpHead['charset']) && $this->httpHead['charset']!='UTF-8') { $content = iconv($this->httpHead['charset'],'utf-8',$content); } $this->httpreferer = $referer; return $content; } /** * 按照规则从内容提取所有字段 * @param Array * @return Array */ function filterContent($content='') { $rs = array(); foreach ($this->field_arr as $field => $fieldinfo){ $rs[$field] = $this->getPregField($fieldinfo,$content); } $this->result[] = $rs; } /** * 相对路径转化为绝对路径 * * @param string $relative * @param string $referer * @return string */ function urlRtoA($relative,$referer) { /** * 去除#后面的部分 */ $pos = strpos($relative,'#'); if($pos >0)$relative = substr($relative,0,$pos); /** * 检测路径如果是绝对地址直接返回 */ if(preg_match("~^(http|ftp)://~i",$relative)) return $relative; /** * 解析引用地址,获得协议,主机等信息 */ preg_match("~((http|ftp)://([^/]*)(.*/))([^/#]*)~i", $referer, $preg_rs); $parentdir = $preg_rs[1]; $petrol = $preg_rs[2].'://'; $host = $preg_rs[3]; /** * 如果以/开头的情况 */ if(preg_match("~^/~i",$relative)) return $petrol.$host.$relative; return $parentdir.$relative; } /** * 根据规则提取一个字段 * * @param string $pattern * @param string $content * @return string */ function getPregField($fieldinfo,$content) { /** * 规则为固定值的情况,直接返回固定值 */ if(strpos($fieldinfo['pattern'],'{'.$fieldinfo['field'].'}') === false) return $fieldinfo['pattern']; if($fieldinfo['isregular'] == 'true'){ $pattern = $fieldinfo['pattern']; $pattern = str_replace('{'.$fieldinfo['field'].'}','(?P<'.$fieldinfo['field'].'>.*?)',$pattern); }else{ $pattern = preg_quote($fieldinfo['pattern']); $pattern = str_replace('\{'.$fieldinfo['field'].'\}','(?P<'.$fieldinfo['field'].'>.*?)',$pattern); } $pattern = "~".$pattern."~is"; preg_match($pattern,$content,$preg_rs); $fieldresult = $preg_rs[$fieldinfo['field']]; /** * 去掉换行符 */ $fieldresult = preg_replace("~[\r\n]*~is",'',$fieldresult); /** * 对采集到的结果根据规则再进行二次替换处理 */ $replace_arr = $fieldinfo['replace']; if(is_array($replace_arr)){ $replace_arr[0] = "~".$replace_arr[0]."~s"; $fieldresult = preg_replace($replace_arr[0],$replace_arr[1],$fieldresult); } /** * 针对有下一页的字段递归采集 */ if($this->pagelimit == 0){ if($fieldinfo['nextpage'] != ''){ $pattern = $fieldinfo['nextpage']; $pattern = str_replace('{nextpage}','(?P<nextpage>[^\'\">]*?)',$pattern); $pattern = "~".$pattern."~is"; if(preg_match($pattern,$content,$preg_rs) && $preg_rs['nextpage'] != ''){ $fieldresult .= $this->getPregField($fieldinfo,$this->getContent($preg_rs['nextpage'],$this->httpreferer)); } } } if(!empty($fieldinfo['callback']))$fieldresult = $fieldinfo['callback']($fieldresult); return $fieldresult; } /** * 添加一个采集字段和规则 * * @param string $field * @param string $pattern */ function addField($field,$pattern,$replace_arr='',$isregular='false',$nextpage = '',$callback='') { $rs = array( 'field' => $field, 'pattern' => $pattern, 'replace' => $replace_arr, 'isregular' => $isregular, 'nextpage' => $nextpage, 'callback'=>$callback ); $this->field_arr[$field] =$rs; } /** * 输出 * */ function output() { echo "The result is:<br/>"; echo "runtime :$this->runtime S<br/><pre class="brush:php;toolbar:false">"; print_r($this->result); echo ""; } /** * 输出到XLS文件 * * @param string $file */ function saveXls($file = 'spider_result.xls') { $fp = fopen($file,'w'); if($fp){ foreach ($this->result as $result) { $line = implode("\t",$result)."\n"; fputs($fp,$line); } } fclose($fp); echo 'The result has been saved to '.$file.'.
{body}
以上就是采集的内容,更多相关内容请关注PHP中文网(www.php.cn)!