Rumah > pembangunan bahagian belakang > tutorial php > 从请求的页面提取关键词


Lepaskan: 2016-07-25 08:49:34
1169 orang telah melayarinya

例如，从代码珠玑的首页可以提取出类似下面图片中的关键词：[图片：从请求的页面提取关键词] [图片：从请求的页面提取关键词]
  // Demo entry point: extracts keywords from the page named by the "url"
  // request parameter and prints them as a numbered list.
  if (!empty($_REQUEST['url']) && is_string($_REQUEST['url'])) {
      $url = $_REQUEST['url'];

      // The value is handed to file_get_contents() by keywordsugest::file(),
      // so restrict it to remote http(s) pages; otherwise a visitor could make
      // the server read local files or other stream wrappers.
      if (!preg_match('#^https?://#i', $url)) {
          echo 'Only http:// and https:// URLs are accepted.';
      } else {
          require_once 'class.keywords.php';

          $keywords = new keywordsugest();
          $keywords->_lang = 'es';
          $keywords->_encoding = 'iso-8859-1';
          $keywords->_catego = 'telecom';
          $keywords->_keyCount = 100; // behaves like a percentage (%)
          $keywords->file($url);
          // Alternatives to readAll():
          //   $keywords->readMetaKeyWords();
          //   $keywords->readHtmlKeyWords();
          $keywords->readAll();

          echo 'Keywords found :';
          $i = 1;
          foreach ($keywords->get() as $word) {
              // Keywords originate from an untrusted remote page: escape them
              // before writing them into the HTML response.
              echo $i++ . '. ' . htmlspecialchars($word, ENT_QUOTES, 'ISO-8859-1') . "<br />\n";
          }
      }
  }
  // Example URL hint. NOTE(review): the markup originally echoed here was lost
  // when the listing was extracted; a plain usage line stands in for it.
  echo 'Usage: ?url=http://www.example.com/';
  18. ?>
  /**
   * Suggests keywords for an HTML document.
   *
   * Keywords are taken from the <meta name="keywords"> tag and/or from a word
   * frequency analysis of the visible page text. Stop words and a category
   * dictionary are loaded from "stopwords_<lang>.php" (expected to define
   * $stopWords) and "glodic_<catego>_<lang>.php" (expected to define $glodic).
   */
  class keywordsugest
  {
      /** @var string|false Document to analyse; false until file() or html() supplies content. */
      public $_html = false;

      /** @var int Weight fed into the result-size heuristic of readHtmlKeyWords(). */
      public $_keyCount = 5;

      /** @var array Keywords collected so far. */
      public $_keyWords = array();

      /** @var string Character encoding passed to mb_strtolower() and html_entity_decode(). */
      public $_encoding = 'UTF-8';

      /** @var string Language code used to pick the stop-word and dictionary files. */
      public $_lang = 'es';

      /** @var string Category used to pick the dictionary file. */
      public $_catego = 'telecom';

      /** @var string Last location passed to file(). */
      public $_url = '';

      /**
       * Reads the comma separated list of the <meta name="keywords"> tag.
       *
       * Accepts either attribute order and single or double quotes. The keywords
       * are lower-cased, whitespace-normalised, trimmed and de-duplicated.
       * Leaves the current keywords untouched when no such tag is found.
       */
      public function readMetaKeyWords()
      {
          if (!$this->_html) {
              return;
          }

          // The look-ahead asserts name="keywords" somewhere inside the tag, so the
          // content attribute may come before or after it.
          $pattern = '/<meta(?=[^>]*\sname\s*=\s*["\']?keywords["\']?(?=[\s\/>]))[^>]*?\scontent\s*=\s*(["\'])(.*?)\1/is';
          if (!preg_match($pattern, $this->_html, $match)) {
              return;
          }

          $list = mb_strtolower($match[2], $this->_encoding);
          $list = $this->replaceOrKeep('/\s+/', ' ', $list);

          $found = array();
          foreach (explode(',', $list) as $keyword) {
              $keyword = trim($keyword);
              // Skip empty entries produced by stray commas.
              if ($keyword !== '') {
                  $found[] = $keyword;
              }
          }
          $this->_keyWords = array_values(array_unique($found));
      }

      /**
       * Runs preg_replace() but returns the subject unchanged if PCRE fails
       * (preg_replace() returns null on error), so an oversized page cannot
       * wipe out the document.
       *
       * @param string $pattern
       * @param string $replacement
       * @param string $subject
       * @return string
       */
      private function replaceOrKeep($pattern, $replacement, $subject)
      {
          $result = preg_replace($pattern, $replacement, $subject);
          return $result === null ? $subject : $result;
      }

      /**
       * Tells whether a value is safe to embed in an include file name.
       *
       * @param mixed $value
       * @return bool
       */
      private function isSafeToken($value)
      {
          return is_string($value) && preg_match('/^[A-Za-z0-9_-]+$/', $value) === 1;
      }

      /**
       * Replaces every HTML tag with a space and collapses runs of spaces.
       *
       * @param string $string
       * @return string
       */
      private function rip_tags($string)
      {
          $string = $this->replaceOrKeep('/<[^>]*>/', ' ', $string);
          return trim($this->replaceOrKeep('/ {2,}/', ' ', $string));
      }

      /**
       * Derives keywords from the visible text of the document.
       *
       * Any keywords already collected (e.g. from the meta tag) are appended to
       * the text so they take part in the frequency count, then the keyword
       * list is rebuilt from scratch. As a side effect $_html is replaced by the
       * lower-cased, tag-free text.
       */
      public function readHtmlKeyWords()
      {
          if (!$this->_html) {
              return;
          }

          if (!empty($this->_keyWords)) {
              $this->_html = $this->_html . ' ' . implode(' ', $this->_keyWords);
              $this->_keyWords = array();
          }

          $html = str_replace('&nbsp;', ' ', $this->_html);

          // Drop elements whose content is not visible text, together with that content.
          $html = $this->replaceOrKeep(
              '/<(head|script|style|object|embed|noembed|applet|noframes|noscript)\b[^>]*>.*?<\/\1\s*>/is',
              ' ',
              $html
          );
          // Drop HTML comments.
          $html = $this->replaceOrKeep('/<!--.*?-->/s', ' ', $html);

          // Strip the remaining tags and normalise case.
          $html = mb_strtolower($this->rip_tags($html), $this->_encoding);
          // Decode entities exactly once; decoding twice would turn "&amp;lt;" into "<".
          $html = html_entity_decode($html, ENT_COMPAT, $this->_encoding);
          $this->_html = $html;

          // Break into words on whitespace and common punctuation.
          $words = preg_split('/[\s.,:;!?|]+/', $html, -1, PREG_SPLIT_NO_EMPTY);
          if (!$words) {
              return;
          }
          $frequency = array_count_values($words);

          // Load the word lists. The include files are expected to define
          // $stopWords and $glodic; a missing file is treated as an empty list.
          $stopWords = array();
          $glodic = array();
          if ($this->isSafeToken($this->_lang)) {
              $stopFile = stream_resolve_include_path('stopwords_' . $this->_lang . '.php');
              if ($stopFile !== false) {
                  include $stopFile;
              }
              if ($this->isSafeToken($this->_catego)) {
                  $dicFile = stream_resolve_include_path('glodic_' . $this->_catego . '_' . $this->_lang . '.php');
                  if ($dicFile !== false) {
                      include $dicFile;
                  }
              }
          }
          if (!is_array($stopWords)) {
              $stopWords = array();
          }
          if (!is_array($glodic)) {
              $glodic = array();
          }

          // Delete stop words and tokens made only of punctuation.
          $punct = '~!@#$%^&*()_+|}{[];:\'\",<.>/?`-=\\';
          foreach (array_keys($frequency) as $word) {
              // Numeric tokens come back as integer keys; compare them as strings.
              $text = (string) $word;
              if (in_array($text, $stopWords, true) || strspn($text, $punct) === strlen($text)) {
                  unset($frequency[$word]);
              }
          }
          if (!$frequency) {
              // Nothing left: avoids max() on an empty array and a division by zero.
              return;
          }

          // Heuristic for how many keywords to keep and how much to boost dictionary words.
          $max = max($frequency);
          $count = count($frequency);
          $tot = round(($max * 100) / $count);
          $tot2 = round(($this->_keyCount * 100) / $count);
          if ($tot > $count) {
              $tot = $tot / 2;
          }
          // NOTE(review): this halves $tot although it tests $tot2, which looks like
          // a copy/paste slip; kept as-is to preserve existing results. Confirm intent.
          if ($tot2 > $count) {
              $tot = $tot / 2;
          }
          $showmax = (int) round(($tot + $tot2) / 2);

          // Words from the category dictionary get a head start.
          foreach (array_keys($frequency) as $word) {
              if (in_array((string) $word, $glodic, true)) {
                  $frequency[$word] += $showmax;
              }
          }

          // Sort by frequency, highest first.
          arsort($frequency, SORT_NUMERIC);

          // Keep at most $showmax non-numeric words. The keyword list is empty
          // here and array keys are unique, so no duplicate check is needed.
          $added = 0;
          foreach (array_keys($frequency) as $word) {
              $text = (string) $word;
              if ($text === '' || is_numeric($text)) {
                  continue;
              }
              $this->_keyWords[] = $text;
              $added++;
              if ($added === $showmax) {
                  break;
              }
          }
      }

      /**
       * Changes the encoding from the default UTF-8.
       *
       * @param string|false $enc Encoding name; a falsy value leaves the setting unchanged.
       */
      public function encoding($enc = false)
      {
          if ($enc) {
              $this->_encoding = $enc;
          }
      }

      /**
       * Loads the document from a file or URL.
       *
       * @param string|false $fileUrl Location to read; a falsy value is ignored.
       */
      public function file($fileUrl = false)
      {
          if (!$fileUrl) {
              return;
          }
          $this->_url = $fileUrl;
          // Best effort: on failure $_html becomes false and the readers do nothing.
          // The location is not validated here; callers passing user input must
          // restrict it themselves.
          $this->_html = @file_get_contents($fileUrl);
      }

      /**
       * Supplies the document as a string.
       *
       * @param string|false $page HTML source; a falsy value is ignored.
       */
      public function html($page = false)
      {
          if ($page) {
              $this->_html = $page;
          }
      }

      /**
       * Reads the meta keywords and then the keywords from the page body,
       * leaving a de-duplicated, sequentially indexed list.
       */
      public function readAll()
      {
          if ($this->_html !== false) {
              $this->readMetaKeyWords();
              $this->readHtmlKeyWords();
          }
          $this->_keyWords = array_values(array_unique($this->_keyWords));
      }

      /**
       * Returns the collected keywords.
       *
       * @return array
       */
      public function get()
      {
          return $this->_keyWords;
      }
  }
  145. ?>

Kenyataan Laman Web ini
Kandungan artikel ini disumbangkan secara sukarela oleh netizen, dan hak cipta adalah milik pengarang asal. Laman web ini tidak memikul tanggungjawab undang-undang yang sepadan. Jika anda menemui sebarang kandungan yang disyaki plagiarisme atau pelanggaran, sila hubungi
Tutorial Popular
Muat turun terkini
kesan web
Kod sumber laman web
Bahan laman web
Templat hujung hadapan