요청한 페이지에서 키워드 추출-PHP 튜토리얼-php.cn

요청한 페이지에서 키워드 추출

WBOY

게시일: 2016-07-25 08:49:34

원본

조회수: 1159회

}

지정한 URL의 웹 페이지를 읽어 그 페이지의 키워드를 추출할 수 있습니다.

예를 들어 Code Zhuji 홈페이지에서는 아래 그림과 같은 키워드를 추출할 수 있습니다. (그림: 요청한 페이지에서 키워드 추출)

// Demo entry point: when a "url" request parameter is supplied, fetch that page
// and print the keywords that keywordsugest extracts from it.
// Example URL: http://www.codepearl.com
$requestedUrl = isset($_REQUEST['url']) ? $_REQUEST['url'] : '';

// Only accept plain http(s) addresses. The value is handed to file_get_contents()
// by keywordsugest::file(), so without this check a caller could read local files
// or use stream wrappers such as php://.
// NOTE(review): this does not stop requests to internal hosts (SSRF); add an
// allow-list if this script is ever exposed publicly.
$scheme = is_string($requestedUrl) ? parse_url($requestedUrl, PHP_URL_SCHEME) : null;
$isHttpUrl = is_string($scheme) && in_array(strtolower($scheme), array('http', 'https'), true);

if ($isHttpUrl) {
    include 'class.keywords.php';

    $keywords = new keywordsugest();
    $keywords->_lang = 'es';
    $keywords->_encoding = 'iso-8859-1';
    $keywords->_catego = 'telecom';
    $keywords->_keyCount = 100; // %
    $keywords->file($requestedUrl);
    // Either source can also be read on its own:
    // $keywords->readMetaKeyWords();
    // $keywords->readHtmlKeyWords();
    $keywords->readAll();

    echo '발견된 키워드:

';
    $i = 1;
    foreach ($keywords->get() as $word) {
        // Keywords come from a remote page, so escape them before output.
        $safeWord = htmlspecialchars($word, ENT_QUOTES | ENT_SUBSTITUTE, $keywords->_encoding);
        echo $i++ . ". $safeWord
";
    }
}
echo "
";
?>

코드 복사

/**
 * Extracts candidate keywords from an HTML document.
 *
 * Keywords come from two places: the <meta name="keywords"> tag and the visible
 * text of the page, ranked by word frequency. Optional dictionary files can
 * remove stop words and boost glossary terms.
 *
 * Typical use: set the public options, supply the page with file() or html(),
 * call readAll(), then read the result with get().
 */
class keywordsugest
{
    /** @var string|false Page content to analyse; FALSE until html() or file() supplies it. */
    public $_html = false;

    /** @var int|float Weight used when computing how many words to keep (the demo script treats it as a percentage). */
    public $_keyCount = 5;

    /** @var array Keywords collected so far. */
    public $_keyWords = array();

    /** @var string Character encoding passed to the mb_* and entity-decoding functions. */
    public $_encoding = 'UTF-8';

    /** @var string Language code used in the dictionary file names. */
    public $_lang = 'es';

    /** @var string Category used in the glossary file name. */
    public $_catego = 'telecom';

    /** @var string Address last given to file(). */
    public $_url = '';

    /**
     * Reads the keywords listed in the page's <meta name="keywords"> tag.
     *
     * On a match, $_keyWords is replaced by the lower-cased, trimmed,
     * de-duplicated entries of the comma-separated list. Does nothing when no
     * page is loaded or the tag is absent.
     */
    public function readMetaKeyWords()
    {
        if (!$this->_html) {
            return;
        }

        // Expects the "name" attribute before "content"; the content value may
        // be wrapped in single or double quotes.
        $pattern = '/<meta\b[^>]*\bname\s*=\s*["\']?keywords["\']?[^>]*\bcontent\s*=\s*(["\'])(.*?)\1[^>]*>/is';
        if (!preg_match($pattern, $this->_html, $match)) {
            return;
        }

        // Normalise every whitespace character (tabs, newlines) to a plain space.
        $list = preg_replace('/\s/', ' ', mb_strtolower($match[2], $this->_encoding));

        $found = array();
        foreach (explode(',', (string) $list) as $keyword) {
            $keyword = trim($keyword);
            if ($keyword !== '') {
                $found[] = $keyword;
            }
        }

        $this->_keyWords = array_values(array_unique($found));
    }

    /**
     * Strips markup from a string.
     *
     * @param mixed $string Text that may contain HTML tags.
     * @return string The text with each tag replaced by a space, runs of spaces
     *                collapsed and the ends trimmed.
     */
    private function rip_tags($string)
    {
        // Replace each tag with a space so adjacent words do not run together.
        $string = (string) preg_replace('/<[^>]*>/', ' ', (string) $string);

        return trim((string) preg_replace('/ {2,}/', ' ', $string));
    }

    /**
     * Runs preg_replace() but keeps the subject unchanged if PCRE fails.
     *
     * @param string $pattern     Regular expression.
     * @param string $replacement Replacement text.
     * @param string $subject     Text to process.
     * @return string
     */
    private function replacePattern($pattern, $replacement, $subject)
    {
        $result = preg_replace($pattern, $replacement, $subject);

        // preg_replace() returns null on failure (e.g. backtrack limit reached).
        return $result === null ? $subject : $result;
    }

    /**
     * Loads a word list from a dictionary file, if that file exists.
     *
     * @param string $file     File name, looked up next to this class and then
     *                         on the include path.
     * @param string $variable Name of the array variable the file is expected
     *                         to define.
     * @return array The list, or an empty array when the file or variable is missing.
     */
    private function loadWordList($file, $variable)
    {
        $local = __DIR__ . '/' . $file;
        $path = is_file($local) ? $local : stream_resolve_include_path($file);
        if ($path === false) {
            return array();
        }

        include $path;
        $defined = get_defined_vars();

        return (isset($defined[$variable]) && is_array($defined[$variable])) ? $defined[$variable] : array();
    }

    /**
     * Reads keywords from the page body, ranked by how often each word occurs.
     *
     * Side effect: $_html is replaced by the lower-cased plain text of the page.
     * Keywords already collected (e.g. from the meta tag) are appended to that
     * text so they take part in the frequency count, then the list is rebuilt.
     */
    public function readHtmlKeyWords()
    {
        if (!$this->_html) {
            return;
        }

        if (!empty($this->_keyWords)) {
            $this->_html = $this->_html . ' ' . implode(' ', $this->_keyWords);
            $this->_keyWords = array();
        }

        // NOTE(review): the search string was lost in the published listing;
        // a non-breaking-space entity is assumed here.
        $this->_html = str_replace('&nbsp;', ' ', $this->_html);

        // Drop elements whose content is not visible text.
        $toRemove = array('head', 'script', 'style', 'object', 'embed', 'noembed', 'applet', 'noframes', 'noscript');
        foreach ($toRemove as $remove) {
            $this->_html = $this->replacePattern(
                '/<' . $remove . '\b[^>]*>.*?<\/' . $remove . '\s*>/is',
                ' ',
                $this->_html
            );
        }

        // Drop HTML comments.
        $this->_html = $this->replacePattern('/<!--.*?-->/s', ' ', $this->_html);

        // Drop the remaining tags, lower-case, then decode entities.
        $this->_html = mb_strtolower($this->rip_tags($this->_html), $this->_encoding);
        $this->_html = htmlspecialchars_decode($this->_html);
        $this->_html = html_entity_decode($this->_html, ENT_COMPAT, $this->_encoding);

        // Split into words on whitespace and common punctuation.
        $words = preg_split('/[\s.,:;!?|]+/', $this->_html, -1, PREG_SPLIT_NO_EMPTY);
        if (!$words) {
            return;
        }

        $frequency = array_count_values($words);

        // The dictionary file names are built from public properties; accept
        // only simple tokens so they cannot point outside the expected files.
        $lang = (string) $this->_lang;
        $catego = (string) $this->_catego;
        $stopWords = array();
        $glodic = array();
        if (preg_match('/^[A-Za-z0-9_-]+$/', $lang)) {
            $stopWords = $this->loadWordList('stopwords_' . $lang . '.php', 'stopWords');
            if (preg_match('/^[A-Za-z0-9_-]+$/', $catego)) {
                $glodic = $this->loadWordList('glodic_' . $catego . '_' . $lang . '.php', 'glodic');
            }
        }

        // Remove stop words and tokens made up only of punctuation.
        $punct = '~!@#$%^&*()_+|}{[];:\'",<.>/?`-=\\';
        foreach (array_keys($frequency) as $word) {
            $text = (string) $word;
            if (in_array($text, $stopWords, true) || strspn($text, $punct) === strlen($text)) {
                unset($frequency[$word]);
            }
        }

        // Nothing left to rank; also avoids max() on an empty array and a
        // division by zero below.
        if (!count($frequency)) {
            return;
        }

        $max = max($frequency);
        $count = count($frequency);
        $tot = round(($max * 100) / $count);
        $tot2 = round(($this->_keyCount * 100) / $count);
        if ($tot > $count) {
            $tot = $tot / 2;
        }
        // NOTE(review): this halves $tot (not $tot2) when $tot2 is large, as in
        // the published listing; confirm whether $tot2 was intended.
        if ($tot2 > $count) {
            $tot = $tot / 2;
        }
        $showMax = (int) round(($tot + $tot2) / 2);

        // Boost glossary words so they rank above ordinary words.
        foreach (array_keys($frequency) as $word) {
            if (in_array((string) $word, $glodic, true)) {
                $frequency[$word] = $frequency[$word] + $showMax;
            }
        }

        // Most frequent first.
        arsort($frequency, SORT_NUMERIC);

        // Keep the top words, skipping numbers and duplicates.
        $added = 0;
        foreach ($frequency as $word => $occurrences) {
            $word = (string) $word;
            if ($word === '' || is_numeric($word) || in_array($word, $this->_keyWords, true)) {
                continue;
            }
            $this->_keyWords[] = $word;
            $added++;
            if ($added === $showMax) {
                break;
            }
        }
    }

    /**
     * Changes the character encoding (UTF-8 by default).
     *
     * NOTE(review): declared private in the published listing and not called
     * inside the class; callers set $_encoding directly instead.
     *
     * @param mixed $enc Encoding name; a falsy value leaves the setting unchanged.
     */
    private function encoding($enc = false)
    {
        if ($enc) {
            $this->_encoding = $enc;
        }
    }

    /**
     * Loads the page to analyse from a file path or URL.
     *
     * @param mixed $fileUrl Path or URL; a falsy value is ignored.
     */
    public function file($fileUrl = false)
    {
        if ($fileUrl) {
            // Best effort: on failure $_html becomes FALSE and the read methods
            // simply find nothing, so the warning is suppressed on purpose.
            $this->_html = @file_get_contents($fileUrl);
            $this->_url = $fileUrl;
        }
    }

    /**
     * Supplies the page to analyse as a string.
     *
     * @param mixed $page HTML content; a falsy value is ignored.
     */
    public function html($page = false)
    {
        if ($page) {
            $this->_html = $page;
        }
    }

    /**
     * Reads the meta keywords and then the body keywords, and de-duplicates
     * the combined list.
     */
    public function readAll()
    {
        if ($this->_html !== false) {
            $this->readMetaKeyWords();
            $this->readHtmlKeyWords();
        }

        $this->_keyWords = array_values(array_unique($this->_keyWords));
    }

    /**
     * Returns the collected keywords.
     *
     * @return array
     */
    public function get()
    {
        return $this->_keyWords;
    }
}

코드 복사