최근에 모아서 작성한 초간단하고 실용적인 HTML 파싱 클래스
게시일: 2016-07-25 09:02:04
// Usage example: fetch a page, pick the first node matching dl/dd/a anywhere
// in the document, and print its text content.
$dom = new xf_HtmlDom();
$dom->loadHtml('http://dealer.bitauto.com/100040078/cars.html');
$firstLink = $dom->find('dl/dd/a', 0);
print_r($firstLink->innertext);
// Buffer libxml parse errors internally instead of emitting a PHP warning for
// every piece of malformed markup; the previous mode is kept in $oldSetting.
$oldSetting = libxml_use_internal_errors(true);

// Start from an empty libxml error buffer.
libxml_clear_errors();
/**
 * Minimal HTML parser built on DOMDocument and DOMXPath.
 *
 * An instance is either a document root (empty node path) or a handle to one
 * node, identified by the XPath string that DOMNode::getNodePath() reported
 * for it. All handles created from one document share the same DOMXPath.
 *
 * -----------------------------------
 * PHP5 Framework - 2011
 * Web Site: www.iblue.cc
 * E-mail: mejinke@gmail.com
 * Date: 2012-10-12
 * -----------------------------------
 *
 * @desc HTML parser
 * @author jingke
 */
class XF_HtmlDom
{
    /** @var DOMXPath|null XPath evaluator for the loaded document. */
    private $_xpath = null;

    /** @var string Absolute XPath of the node this handle wraps; '' for the root. */
    private $_nodePath = '';

    /**
     * @param DOMXPath|null $xpath    Evaluator of an already loaded document.
     * @param string        $nodePath Absolute XPath of the wrapped node.
     */
    public function __construct($xpath = null, $nodePath = '')
    {
        $this->_xpath = $xpath;
        $this->_nodePath = $nodePath;
    }

    /**
     * Load an HTML document from an http(s) URL (via cURL) or from a local path.
     *
     * When nothing could be fetched the handle wraps an empty document, so
     * later find() calls simply match nothing.
     *
     * @param string $url http:// or https:// URL, or a path for file_get_contents().
     * @return XF_HtmlDom $this, for chaining.
     */
    public function loadHtml($url)
    {
        // Kept from the original implementation: this changes the process-wide
        // user agent used by PHP's own HTTP stream wrapper.
        ini_set('user_agent', 'Mozilla/5.0 (Linux; U; Android 2.1; en-us; Nexus One Build/ERD62) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17 –Nexus');

        // Only a real http(s) scheme selects cURL. Testing for the substring
        // "http" anywhere would misroute local paths such as /var/www/httpdocs/.
        if (preg_match('#^https?://#i', $url) === 1) {
            $content = $this->_download($url);
        } else {
            $content = file_get_contents($url);
        }

        $document = new DOMDocument();
        // DOMDocument::loadHTML() rejects an empty source, and both fetchers
        // return false on failure, so only parse when there is real content.
        if (is_string($content) && $content !== '') {
            // Real-world markup is rarely valid; keep libxml quiet while
            // parsing and restore the caller's error mode afterwards.
            $previousMode = libxml_use_internal_errors(true);
            $document->loadHTML($content);
            libxml_clear_errors();
            libxml_use_internal_errors($previousMode);
        }

        $this->_xpath = new DOMXPath($document);
        return $this;
    }

    /**
     * Find descendants of this node (or of the whole document for the root).
     *
     * The query is appended to this handle's own path: '//' . $query for the
     * root handle, $nodePath . '/' . $query for a node handle.
     *
     * @param string   $query Relative XPath, e.g. 'dl/dd/a'.
     * @param int|null $index Zero-based position of the single match to return,
     *                        or null to return every match.
     * @return XF_HtmlDom[]|XF_HtmlDom|null Array of handles when $index is null;
     *         otherwise one handle, or null when there is no match at $index.
     */
    public function find($query, $index = null)
    {
        // Same "no index given" test as before: 0 and '0' still select item 0.
        $wantsList = ($index == null && !is_numeric($index));

        if ($this->_xpath === null) {
            return $wantsList ? array() : null;
        }

        // Build the search path locally. Writing it back into $this->_nodePath
        // would corrupt this handle for every later find()/text() call.
        $prefix = ($this->_nodePath == '') ? '//' : $this->_nodePath . '/';
        $nodes = $this->_xpath->query($prefix . $query);

        // query() yields false for a malformed expression.
        if ($nodes === false) {
            return $wantsList ? array() : null;
        }

        if ($wantsList) {
            $handles = array();
            foreach ($nodes as $node) {
                $handles[] = new XF_HtmlDom($this->_xpath, $node->getNodePath());
            }
            return $handles;
        }

        $node = $nodes->item((int) $index);
        if ($node === null) {
            return null;
        }
        return new XF_HtmlDom($this->_xpath, $node->getNodePath());
    }

    /**
     * Get the text content of the wrapped node.
     *
     * @return string|false Text content, or false for a root/unloaded handle
     *                      or when the node can no longer be resolved.
     */
    public function text()
    {
        $node = $this->_currentNode();
        if ($node === null) {
            return false;
        }
        return $node->textContent;
    }

    /**
     * Get an attribute value of the wrapped node.
     *
     * @param string $name Attribute name.
     * @return string|false Attribute value ('' when the attribute is absent),
     *                      or false when this handle does not wrap an element.
     */
    public function getAttribute($name)
    {
        $node = $this->_currentNode();
        // Text and comment nodes have no getAttribute() method.
        if (!($node instanceof DOMElement)) {
            return false;
        }
        return $node->getAttribute($name);
    }

    /**
     * Property-style access: ->innertext maps to text(), any other name is
     * read as an attribute of the wrapped node.
     *
     * @param string $name
     * @return string|false
     */
    public function __get($name)
    {
        if ($name === 'innertext') {
            return $this->text();
        }
        return $this->getAttribute($name);
    }

    /**
     * Resolve this handle's stored path back to its DOM node.
     *
     * @return DOMNode|null Null for a root/unloaded handle or a stale path.
     */
    private function _currentNode()
    {
        if ($this->_nodePath == '' || $this->_xpath == null) {
            return null;
        }
        $matches = $this->_xpath->query($this->_nodePath);
        if ($matches === false || $matches->length === 0) {
            return null;
        }
        return $matches->item(0);
    }

    /**
     * Fetch a remote page with cURL, following redirects.
     *
     * @param string $url
     * @return string|false Response body, or false on transfer failure.
     */
    private function _download($url)
    {
        $ch = curl_init();
        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_HEADER, false);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
        curl_setopt($ch, CURLOPT_REFERER, $url);
        curl_setopt($ch, CURLOPT_USERAGENT, 'Mozilla/5.0 (Windows NT 5.1; rv:6.0) Gecko/20100101 Firefox/6.0');
        curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
        $body = curl_exec($ch);
        curl_close($ch);
        return $body;
    }
}
코드 복사
|
본 웹사이트의 고지 사항
본 글의 내용은 네티즌들의 자발적인 기여로 작성되었으며, 저작권은 원저작자에게 있습니다. 본 사이트는 이에 상응하는 법적 책임을 지지 않습니다. 표절이나 침해가 의심되는 콘텐츠를 발견한 경우 admin@php.cn으로 문의하세요.
저자별 최신 기사
-
2024-10-22 09:46:29
-
2024-10-13 13:53:41
-
2024-10-12 12:15:51
-
2024-10-11 22:47:31
-
2024-10-11 19:36:51
-
2024-10-11 15:50:41
-
2024-10-11 15:07:41
-
2024-10-11 14:21:21
-
2024-10-11 12:59:11
-
2024-10-11 12:17:31