자세한 소개와 사용법을 보시려면 소스코드를 클릭해주세요.
/**
 * HTML tag parser
 *
 * @category  TagParse
 * @package   TagParse
 * @author    this
 * @copyright 2014 this
 * @version   1.0
 * @link      http://www.blogkun.com
 * @since     1.0
 */
namespace TagParse;
-
/**
 * TagDomRoot
 *
 * Root of a parsed tag tree. Constructing it with a markup string strips
 * noise, records the plain text, and builds one TagDomNode per top-level
 * tag (each node recursively parses its own inner content).
 *
 * Parser state (error flag, error list, tag index, search results) lives in
 * static properties and is therefore shared by every tree in the process;
 * call initProperty() before parsing an unrelated document.
 *
 * @category  TagParse
 * @package   TagParse
 * @author    kun
 * @copyright 2014 kun
 * @license   http://www.php.net/license/3_01.txt PHP License 3.01
 * @version   1.0
 * @link      http://www.blogkun.com
 * @since     1.0
 */
class TagDomRoot
{
    /** @var string Tag name; the root uses the pseudo name "root". */
    public $tag = 'root';
    /** @var string|null Text of this subtree with all tags removed. */
    public $plaintext;
    /** @var TagDomNode[] Direct children in document order. */
    public $child = array();
    /** @var int Depth in the tree; the root is 0. */
    public $level = 0;
    /** @var bool Set to true as soon as any parse step reports a problem. */
    public static $TagParseError = false;
    /** @var array Index of nodes: tag name => raw attribute string => list of nodes. */
    protected static $TagSet = array();
    /** @var array Accumulator for the search currently in progress. */
    protected static $FoundNode = array();
    /** @var string[] Offending tags collected while parsing. */
    public static $ErrorTag = array();

    /**
     * Reset every piece of shared (static) parser state.
     *
     * @access public
     *
     * @return null
     */
    public function initProperty()
    {
        // These must be the static properties; plain locals would be
        // discarded on return and leave the shared state untouched.
        self::$TagParseError = false;
        self::$TagSet = array();
        self::$FoundNode = array();
        self::$ErrorTag = array();
    }

    /**
     * Parse a markup string into a tree.
     *
     * @param string $str Markup to parse. Null (or a PCRE failure while
     *                    removing noise) only raises the error flag.
     *
     * @access public
     *
     * @return TagDomRoot
     */
    public function __construct($str)
    {
        if ($str !== null) {
            $str = (string) $str;
            $this->_removeNoise($str);
        }
        if ($str === null) {
            self::$TagParseError = true;

            return;
        }

        // Plain text is the input with every <...> span removed. A lone
        // '<' that never closes is kept as text.
        $this->plaintext = (string) preg_replace('~<[^<>]*>~', '', $str);

        $tagCollect = array();
        $attrCollect = array();
        $innerContentCollect = array();

        if ($this->parseTag($str, $tagCollect, $attrCollect, $innerContentCollect) === false) {
            self::$TagParseError = true;
        }

        foreach ($tagCollect as $index => $tag) {
            $this->child[] = new TagDomNode(
                $tag,
                $this,
                $attrCollect[$index],
                $innerContentCollect[$index],
                $this->level + 1
            );
        }
    }

    /**
     * Split a markup string into its top-level tags.
     *
     * The three output arrays are filled in parallel: entry i of each one
     * describes the i-th top-level tag found in $str.
     *
     * @param string $str                  Markup to scan.
     * @param array  &$tagCollect          Receives the tag names.
     * @param array  &$attrCollect         Receives the raw attribute strings.
     * @param array  &$innerContentCollect Receives the inner markup of each tag.
     *
     * @access protected
     *
     * @return bool False when a stray closing tag or an unclosed opening tag
     *              was encountered (details are appended to self::$ErrorTag).
     */
    protected function parseTag($str, array &$tagCollect, array &$attrCollect, array &$innerContentCollect)
    {
        // Elements that never have a closing tag.
        $selfClosingTags = array(
            'img' => 1, 'br' => 1, 'input' => 1, 'meta' => 1, 'link' => 1,
            'hr' => 1, 'base' => 1, 'embed' => 1, 'spacer' => 1,
        );
        $str = (string) $str;
        $length = strlen($str);
        $cursor = 0;
        $error = false;

        while ($cursor < $length) {
            $l = strpos($str, '<', $cursor);
            if ($l === false) {
                break; // no more tags
            }
            $r = strpos($str, '>', $l);
            if ($r === false) {
                break; // a '<' that never closes is plain text
            }

            if (substr($str, $l + 1, 1) === '/') {
                // Closing tag without a matching opener at this level: record and skip it.
                $error = true;
                self::$ErrorTag[] = substr($str, $l, $r - $l + 1);
                $cursor = $r + 1;
                continue;
            }

            $tag = substr($str, $l + 1, $r - $l - 1);
            if ($tag === '' || !ctype_alpha($tag[0]) || strpos($tag, '<') !== false) {
                // Not a real tag (e.g. "a < b"); resume right after this '<'
                // so that a genuine tag further on is not skipped.
                $cursor = $l + 1;
                continue;
            }

            // Attributes may be spread over several lines.
            $tag = preg_replace('~[\r\n\t]+~', ' ', $tag);
            $space = strpos($tag, ' ');
            if ($space !== false) {
                $attr = (string) substr($tag, $space + 1);
                $name = substr($tag, 0, $space);
            } else {
                $attr = '';
                $name = $tag;
            }
            $name = rtrim($name, '/'); // "<br/>" names the element "br"

            $tagCollect[] = $name;
            $attrCollect[] = $attr;

            if (isset($selfClosingTags[strtolower($name)])) {
                $innerContentCollect[] = '';
                $cursor = $r + 1;
                continue;
            }

            $close = $this->_findClosingTag($str, $name, $r + 1);
            if ($close === false) {
                // Unclosed opening tag: treat the rest of the input as its content.
                $innerContentCollect[] = (string) substr($str, $r + 1);
                $error = true;
                self::$ErrorTag[] = '<'.$name.'>';
                break;
            }

            $innerContentCollect[] = (string) substr($str, $r + 1, $close - $r - 1);
            $closeEnd = strpos($str, '>', $close);
            $cursor = ($closeEnd === false) ? $length : $closeEnd + 1;
        }

        return !$error;
    }

    /**
     * Locate the closing tag that balances an already-consumed opening tag.
     *
     * Nested elements of the same name are counted. The name must be
     * followed by whitespace, '/' or '>' so that "<b" does not also count
     * "<br>" or "<body>".
     *
     * @param string $str  Markup being scanned.
     * @param string $name Tag name to balance.
     * @param int    $from Offset just after the opening tag.
     *
     * @access private
     *
     * @return int|bool Offset of the '<' of the matching closing tag, or false.
     */
    private function _findClosingTag($str, $name, $from)
    {
        $pattern = '~<(/?)'.preg_quote($name, '~').'(?=[\s/>])~i';
        $depth = 1;
        $offset = $from;

        while (preg_match($pattern, $str, $m, PREG_OFFSET_CAPTURE, $offset) === 1) {
            $pos = $m[0][1];
            $offset = $pos + strlen($m[0][0]);
            if ($m[1][0] !== '/') {
                $depth++;
                continue;
            }
            if (--$depth === 0) {
                return $pos;
            }
        }

        return false;
    }

    /**
     * Strip markup that must not be parsed as tags.
     *
     * The string is modified in place; it becomes null if PCRE fails.
     *
     * NOTE(review): the three patterns were lost from the source this file
     * was recovered from (only the delimiters, flags and replacement strings
     * survived). DOCTYPE / comment / script below are a reconstruction -
     * confirm against the upstream library.
     *
     * @param string &$str Markup to clean.
     *
     * @access private
     *
     * @return null
     */
    private function _removeNoise(&$str)
    {
        $rules = array(
            array('~<!DOCTYPE.*?>~is', ''),
            array('~<!--.*?-->~is', ' '),
            array('~<script\b.*?>.*?</script>~is', ''),
        );
        foreach ($rules as $rule) {
            $str = preg_replace($rule[0], $rule[1], $str);
            if ($str === null) {
                return;
            }
        }
    }

    /**
     * Parse a selector chain such as: div[class="a b" id="x"] span
     *
     * @param string $selectors      Selector chain supplied by the caller.
     * @param array  &$selectorsTag  Receives the tag name of each step.
     * @param array  &$selectorsAttr Receives, per step, a map of attribute name => wanted value.
     *
     * @access protected
     *
     * @return null
     */
    protected function parseSelectors($selectors, array &$selectorsTag, array &$selectorsAttr)
    {
        $selectorsTag = array();
        $selectorsAttr = array();

        if (!preg_match_all('~(\w+)(\[[^\]]*\])?~', (string) $selectors, $matches)) {
            return;
        }

        $selectorsTag = $matches[1];
        foreach ($matches[2] as $key => $value) {
            $selectorsAttr[$key] = array();
            if ($value !== '' && preg_match_all('~([\w-]+)="([^"]*)"~', $value, $pairs)) {
                foreach ($pairs[1] as $index => $attr) {
                    $selectorsAttr[$key][$attr] = $pairs[2][$index];
                }
            }
        }
    }

    /**
     * Search this subtree for nodes matching a selector chain.
     *
     * Every step of the chain must match a direct child of the node matched
     * by the previous step; the first step may match at any depth.
     *
     * @param string|null $selectors     Selector chain, or null on internal
     *                                   recursive calls that pass the parsed arrays.
     * @param array       $selectorsTag  Parsed tag names (internal use).
     * @param array       $selectorsAttr Parsed attribute maps (internal use).
     *
     * @access public
     *
     * @return array|null Matching nodes; null on internal recursive calls.
     */
    public function find($selectors, $selectorsTag = array(), $selectorsAttr = array())
    {
        if ($selectors !== null) {
            $this->parseSelectors($selectors, $selectorsTag, $selectorsAttr);
        }

        if (!empty($selectorsTag)) {
            $this->seek($selectorsTag, $selectorsAttr);
            foreach ($this->child as $node) {
                $node->find(null, $selectorsTag, $selectorsAttr);
            }
        }

        if ($selectors !== null) {
            $res = self::$FoundNode;
            self::$FoundNode = array();

            return $res;
        }
    }

    /**
     * Search using the shared tag index for the first step of the chain.
     *
     * The first step is resolved through the index (so it covers every node
     * registered since the last initProperty()); any remaining steps are
     * then matched against the children of those nodes.
     *
     * @param string $selectors Selector chain supplied by the caller.
     *
     * @access public
     *
     * @return array Matching nodes.
     */
    public function findGlobal($selectors)
    {
        $selectors = trim((string) $selectors);

        // Split off the first step at the first whitespace that follows it;
        // spaces inside its [...] part belong to the step itself.
        if (!preg_match('~^(\w+(?:\[[^\]]*\])?)\s+(\S.*)$~s', $selectors, $parts)) {
            return $this->findOneGlobal($selectors);
        }

        $this->findOneGlobal($parts[1], false);
        $candidates = self::$FoundNode;
        self::$FoundNode = array();

        $selectorsTag = array();
        $selectorsAttr = array();
        $this->parseSelectors($parts[2], $selectorsTag, $selectorsAttr);
        if (empty($selectorsTag)) {
            return $candidates;
        }

        foreach ($candidates as $node) {
            $node->seek($selectorsTag, $selectorsAttr);
        }

        $res = self::$FoundNode;
        self::$FoundNode = array();

        return $res;
    }

    /**
     * Match a selector chain against the direct children of this node.
     *
     * Matches are appended to self::$FoundNode.
     *
     * @param array $selectorsTag  Tag name of each remaining step.
     * @param array $selectorsAttr Attribute map of each remaining step.
     *
     * @access protected
     *
     * @return null
     */
    protected function seek($selectorsTag, $selectorsAttr)
    {
        if (empty($selectorsTag)) {
            return;
        }
        $wanted = isset($selectorsAttr[0]) ? $selectorsAttr[0] : array();
        $isLastStep = (count($selectorsTag) === 1);

        foreach ($this->child as $node) {
            if ($node->tag !== $selectorsTag[0] || !$this->_attrMatches($node->attr, $wanted)) {
                continue;
            }
            if ($isLastStep) {
                self::$FoundNode[] = $node;
            } else {
                $node->seek(array_slice($selectorsTag, 1), array_slice($selectorsAttr, 1));
            }
        }
    }

    /**
     * Tell whether a node's attributes satisfy a selector step.
     *
     * A wanted value matches when it equals the whole attribute value or
     * appears in it as a space-delimited token (e.g. one class among many).
     *
     * @param array $nodeAttr Attribute name => value of the node.
     * @param array $wanted   Attribute name => value required by the selector.
     *
     * @access private
     *
     * @return bool
     */
    private function _attrMatches($nodeAttr, $wanted)
    {
        foreach ($wanted as $attrName => $value) {
            if (!isset($nodeAttr[$attrName])) {
                return false;
            }
            // Quote the value: it is caller-supplied text, not a pattern.
            $pattern = '~(?:^| )'.preg_quote((string) $value, '~').'(?: |$)~';
            if (preg_match($pattern, (string) $nodeAttr[$attrName]) !== 1) {
                return false;
            }
        }

        return true;
    }

    /**
     * Resolve a single selector step through the shared tag index.
     *
     * @param string $selector Single step, e.g. div[class="a"].
     * @param bool   $isReturn When false the matches are left in
     *                         self::$FoundNode for the caller and nothing is returned.
     *
     * @access public
     *
     * @return array|null Matching nodes, or null when $isReturn is false.
     */
    public function findOneGlobal($selector, $isReturn = true)
    {
        $selectorsTag = array();
        $selectorsAttr = array();
        $this->parseSelectors($selector, $selectorsTag, $selectorsAttr);

        if (!empty($selectorsTag) && isset(self::$TagSet[$selectorsTag[0]])) {
            foreach (self::$TagSet[$selectorsTag[0]] as $nodeArray) {
                // All nodes in one bucket share the same raw attribute
                // string, so testing the first one decides for the bucket.
                $first = reset($nodeArray);
                if ($first === false || !$this->_attrMatches($first->attr, $selectorsAttr[0])) {
                    continue;
                }
                foreach ($nodeArray as $node) {
                    self::$FoundNode[] = $node;
                }
            }
        }

        if ($isReturn) {
            $res = self::$FoundNode;
            self::$FoundNode = array();

            return $res;
        }
    }
}
-
/**
 * TagDomNode
 *
 * One element of the parsed tree. Construction parses the element's
 * attributes and inner markup, builds its children recursively and
 * registers the node in the shared tag index inherited from TagDomRoot.
 *
 * @uses TagDomRoot
 *
 * @category  TagParse
 * @package   TagParse
 * @author    kun
 * @copyright 2014 kun
 * @license   http://www.php.net/license/3_01.txt PHP License 3.01
 * @version   1.0
 * @link      http://www.blogkun.com
 * @since     1.0
 */
class TagDomNode extends TagDomRoot
{
    /** @var array Attribute name => value, for double-quoted attributes. */
    public $attr = array();
    /** @var TagDomRoot|null Node (or root) that contains this element. */
    public $parent = null;

    /**
     * Build a node and, recursively, its subtree.
     *
     * @param string     $tag          Tag name.
     * @param TagDomRoot $parent       Containing node.
     * @param string     $attr         Raw attribute string from the opening tag.
     * @param string     $innerContent Markup between the opening and closing tag.
     * @param int        $level        Depth of this node in the tree.
     *
     * @access public
     *
     * @return TagDomNode
     */
    public function __construct($tag, $parent, $attr, $innerContent, $level)
    {
        $attr = (string) $attr;
        $innerContent = (string) $innerContent;

        $this->tag = $tag;
        $this->parent = $parent;
        $this->_parseAttr($attr);
        $this->level = $level;

        // Plain text is the inner content with every <...> span removed.
        // Stripping in one pass avoids appending the raw content a second
        // time when it holds a tag but no text between two tags.
        $this->plaintext = (string) preg_replace('~<[^<>]*>~', '', $innerContent);

        $tagCollect = array();
        $attrCollect = array();
        $innerContentCollect = array();

        if ($this->parseTag($innerContent, $tagCollect, $attrCollect, $innerContentCollect) === false) {
            self::$TagParseError = true;
        }

        foreach ($tagCollect as $index => $childTag) {
            $this->child[] = new TagDomNode(
                $childTag,
                $this,
                $attrCollect[$index],
                $innerContentCollect[$index],
                $this->level + 1
            );
        }

        // Register after the children so they precede this node in the
        // index. Objects are handles, so no reference to $this is needed.
        if (!isset(self::$TagSet[$this->tag])) {
            self::$TagSet[$this->tag] = array();
        }
        if (!isset(self::$TagSet[$this->tag][$attr])) {
            self::$TagSet[$this->tag][$attr] = array();
        }
        self::$TagSet[$this->tag][$attr][] = $this;
    }

    /**
     * Parse name="value" pairs into $this->attr.
     *
     * Only double-quoted values are recognised.
     *
     * @param string $str Raw attribute string.
     *
     * @access private
     *
     * @return null
     */
    private function _parseAttr($str)
    {
        if (!preg_match_all('~(?<attrName>[\w-]+)="(?<attrValue>.*?)"~s', (string) $str, $matches)) {
            return;
        }
        foreach ($matches['attrName'] as $key => $value) {
            $this->attr[$value] = $matches['attrValue'][$key];
        }
    }
}
-
复代码
|