지정한 URL의 웹 페이지에서 키워드를 추출할 수 있습니다.
예를 들어 CodePearl(http://www.codepearl.com) 홈페이지를 대상으로 실행하면 아래 그림과 같은 키워드가 추출됩니다.
-
<?php
/*
 * Demo page: reads a page address from the "url" request parameter, runs the
 * keyword extractor from class.keywords.php on it and prints the keywords as
 * a numbered list, followed by a small form for submitting another URL.
 */

// Accept only a scalar string; an array-valued "url" parameter is ignored.
$url = (isset($_REQUEST['url']) && is_string($_REQUEST['url'])) ? trim($_REQUEST['url']) : '';

if ($url !== '') {
    // The URL is untrusted input that ends up in file_get_contents(), so only
    // plain web addresses are allowed (no local paths, php://, file://, ...).
    $scheme = strtolower((string) parse_url($url, PHP_URL_SCHEME));

    if ($scheme !== 'http' && $scheme !== 'https') {
        echo 'http:// 또는 https:// URL만 사용할 수 있습니다.<br>';
    } else {
        require_once 'class.keywords.php';

        $keywords = new keywordsugest();
        $keywords->_lang = 'es';
        $keywords->_encoding = 'iso-8859-1';
        $keywords->_catego = 'telecom';
        $keywords->_keyCount = 100; // %
        $keywords->file($url);

        // readMetaKeyWords() or readHtmlKeyWords() may be called on their own
        // instead; readAll() runs both.
        $keywords->readAll();

        echo '발견된 키워드:<br>';

        $i = 1;

        foreach ($keywords->get() as $word) {
            // Words come from a remote page: escape them before output.
            echo $i++ . '. ' . htmlspecialchars($word, ENT_QUOTES, $keywords->_encoding) . "<br>\n";
        }
    }
}

// url example: http://www.codepearl.com
// NOTE(review): the original markup of this echo was lost (only an empty
// string survived); this minimal form is a reconstruction - confirm.
echo '<form method="get" action=""><input type="text" name="url" size="60"> <input type="submit"></form>';

?>
class.keywords.php 파일의 코드는 다음과 같습니다.
-
<?php

/**
 * Suggests keywords for an HTML page.
 *
 * Keywords are taken from the page's <meta name="keywords"> tag and from the
 * most frequent words of its text. Load markup with file() or html(), analyse
 * it with readAll() (or the two read*() methods individually), then fetch the
 * result with get().
 */
class keywordsugest
{
    /** @var string|false Page markup; FALSE until file()/html() provides it (or when loading failed). */
    public $_html = false;

    /** @var int Tuning value fed into the result-size heuristic of readHtmlKeyWords(). */
    public $_keyCount = 5;

    /** @var array Keywords collected so far. */
    public $_keyWords = array();

    /** @var string Character encoding passed to mb_strtolower() and html_entity_decode(). */
    public $_encoding = 'UTF-8';

    /** @var string Language code; part of the stop-word and glossary file names. */
    public $_lang = 'es';

    /** @var string Category name; part of the glossary file name. */
    public $_catego = 'telecom';

    /** @var string Address last passed to file(). */
    public $_url = '';

    /**
     * Reads the comma-separated list of the <meta name="keywords"> tag.
     *
     * When the tag is found, the keyword list is replaced by its lower-cased,
     * trimmed, de-duplicated entries; otherwise the list is left untouched.
     *
     * @return void
     */
    public function readMetaKeyWords()
    {
        if (!$this->_html) {
            return;
        }

        // Expects the name attribute before content; quotes around values are optional.
        $pattern = '/<meta\s+name\s*=\s*"?keywords"?\s+content\s*=\s*"?([^>"]*)"?\s*\/?\s*>/is';

        if (!preg_match($pattern, $this->_html, $match)) {
            return;
        }

        $list = mb_strtolower($match[1], $this->_encoding);
        $list = preg_replace('/\s+/', ' ', $list);

        // Trim every entry and drop empty ones so "a, ,b" yields just a and b.
        $entries = array_filter(array_map('trim', explode(',', (string) $list)), 'strlen');

        $this->_keyWords = array_values(array_unique($entries));
    }

    /**
     * Strips markup tags from a string.
     *
     * @param string $string Text that may contain HTML tags.
     * @return string The text with each tag replaced by a space, runs of
     *                spaces collapsed and both ends trimmed.
     */
    private function rip_tags($string)
    {
        // Replace tags with a space so neighbouring words do not fuse.
        $string = preg_replace('/<[^>]*>/', ' ', $string);

        return trim(preg_replace('/ {2,}/', ' ', $string));
    }

    /**
     * Runs preg_replace() with a single-space replacement, returning the
     * subject unchanged if the regex engine reports an error (NULL result).
     *
     * @param string $pattern
     * @param string $subject
     * @return string
     */
    private function blankOut($pattern, $subject)
    {
        $result = preg_replace($pattern, ' ', $subject);

        return ($result === null) ? $subject : $result;
    }

    /**
     * Includes a word-list file and returns the array it defines.
     *
     * @param string $file     Bare file name of the list.
     * @param string $variable Name of the array variable the file is expected to define.
     * @return array The list, or an empty array when the name is unsafe, the
     *               file is missing, or it does not define that array.
     */
    private function loadWordList($file, $variable)
    {
        // The name is built from caller-settable properties: refuse anything
        // that could leave the directory or select another stream wrapper.
        if (!preg_match('/^[A-Za-z0-9_.-]+$/', $file) || strpos($file, '..') !== false) {
            return array();
        }

        $path = stream_resolve_include_path($file);

        if ($path === false) {
            $path = __DIR__ . DIRECTORY_SEPARATOR . $file;

            if (!is_file($path)) {
                return array();
            }
        }

        include $path;

        return (isset(${$variable}) && is_array(${$variable})) ? ${$variable} : array();
    }

    /**
     * Derives keywords from word frequencies in the page text.
     *
     * Keywords already collected are appended to the text first so that they
     * take part in the count. The stored markup is reduced to plain lower-case
     * text as a side effect. Stop words (stopwords_<lang>.php, expected to
     * define $stopWords) and punctuation-only tokens are discarded; words found
     * in the glossary (glodic_<catego>_<lang>.php, expected to define $glodic)
     * get a frequency bonus.
     *
     * @return void
     */
    public function readHtmlKeyWords()
    {
        if (!$this->_html) {
            return;
        }

        if (!empty($this->_keyWords)) {
            $this->_html .= ' ' . implode(' ', $this->_keyWords);
            $this->_keyWords = array();
        }

        $this->_html = str_replace('&nbsp;', ' ', $this->_html);

        // Remove elements whose content is not readable page text.
        $toRemove = array('head', 'script', 'style', 'object', 'embed', 'noembed', 'applet', 'noframes', 'noscript');

        foreach ($toRemove as $remove) {
            // \b keeps "head" from also matching <header>.
            $this->_html = $this->blankOut('/<' . $remove . '\b[^>]*>.*?<\/' . $remove . '\s*>/is', $this->_html);
        }

        // Remove comments.
        $this->_html = $this->blankOut('/<!--.*?-->/s', $this->_html);

        // Remove the remaining tags and normalise case.
        $this->_html = mb_strtolower($this->rip_tags($this->_html), $this->_encoding);

        // Decode HTML entities.
        $this->_html = htmlspecialchars_decode($this->_html);
        $this->_html = html_entity_decode($this->_html, ENT_COMPAT, $this->_encoding);

        // Break into words on whitespace and basic punctuation.
        $words = preg_split('/[\s.,:;!?|]+/', $this->_html, -1, PREG_SPLIT_NO_EMPTY);

        if (empty($words)) {
            return;
        }

        $frequency = array_count_values($words);
        unset($frequency['']);

        $stopWords = $this->loadWordList('stopwords_' . $this->_lang . '.php', 'stopWords');
        $glodic = $this->loadWordList('glodic_' . $this->_catego . '_' . $this->_lang . '.php', 'glodic');

        $punct = '~!@#$%^&*()_+|}{[];:\'",<.>/?`-=\\';

        // array_count_values() turns numeric words into integer keys, hence the casts.
        foreach (array_keys($frequency) as $word) {
            $text = (string) $word;

            if (in_array($text, $stopWords, true) || strspn($text, $punct) === strlen($text)) {
                unset($frequency[$word]);
            }
        }

        // Everything may have been filtered out; max() and the divisions
        // below need at least one entry.
        if (empty($frequency)) {
            return;
        }

        $max = max($frequency);
        $count = count($frequency);
        $tot = round(($max * 100) / $count);
        $tot2 = round(($this->_keyCount * 100) / $count);

        if ($tot > $count) {
            $tot = $tot / 2;
        }

        // NOTE(review): this tests $tot2 but halves $tot again; possibly meant
        // to halve $tot2. Kept as found - confirm the intended heuristic.
        if ($tot2 > $count) {
            $tot = $tot / 2;
        }

        $showmax = (int) round(($tot + $tot2) / 2);

        // Glossary words get a bonus so they rank ahead of ordinary words.
        foreach (array_keys($frequency) as $word) {
            if (in_array((string) $word, $glodic, true)) {
                $frequency[$word] += $showmax;
            }
        }

        // Sort by frequency, highest first.
        arsort($frequency, SORT_NUMERIC);

        // Add words to the keyword list until $showmax have been taken.
        $added = 0;

        foreach (array_keys($frequency) as $word) {
            $text = (string) $word;

            if ($text === '' || is_numeric($text) || in_array($text, $this->_keyWords, true)) {
                continue;
            }

            $this->_keyWords[] = $text;
            $added++;

            if ($added === $showmax) {
                break;
            }
        }
    }

    /**
     * Changes the encoding from the default UTF-8.
     *
     * Not called anywhere in this class and, being private, not callable from
     * outside; callers set the public $_encoding property directly instead.
     *
     * @param string|false $enc Encoding name; a falsy value leaves the setting unchanged.
     * @return void
     */
    private function encoding($enc = false)
    {
        if ($enc) {
            $this->_encoding = $enc;
        }
    }

    /**
     * Loads the markup to analyse from a file or URL.
     *
     * @param string|false $fileUrl Path or URL; a falsy value does nothing.
     * @return void
     */
    public function file($fileUrl = false)
    {
        if ($fileUrl) {
            // Deliberately best-effort: on failure $_html becomes FALSE and
            // the read*() methods simply do nothing.
            $this->_html = @file_get_contents($fileUrl);
            $this->_url = $fileUrl;
        }
    }

    /**
     * Returns the collected keywords as an array.
     *
     * @return array
     */
    public function get()
    {
        return $this->_keyWords;
    }

    /**
     * Supplies the markup to analyse as a string.
     *
     * @param string|false $page Markup; a falsy value leaves the current markup unchanged.
     * @return void
     */
    public function html($page = false)
    {
        if ($page) {
            $this->_html = $page;
        }
    }

    /**
     * Reads the meta keywords and then the body text keywords.
     *
     * @return void
     */
    public function readAll()
    {
        if ($this->_html !== false) {
            $this->readMetaKeyWords();
            $this->readHtmlKeyWords();
        }

        $this->_keyWords = array_values(array_unique($this->_keyWords));
    }
}