RMM word segmentation algorithm class

WBOY
Release: 2016-07-25 08:47:54
Original
970 people have browsed it
RMM word segmentation algorithm class
  1. //RMM word segmentation algorithm
  /**
   * Dictionary-driven word segmenter using reverse (right-to-left) matching.
   *
   * The input is first normalised by UpdateStr() into runs separated by
   * $SplitChar; each run is then either kept as-is (single-byte text, short
   * double-byte runs) or cut into dictionary words by RunRMM().
   *
   * NOTE(review): every byte above 0x80 is treated as the lead byte of a
   * two-byte character (see NotGBK/UpdateStr), so the class presumably expects
   * GBK/GB2312 input rather than UTF-8 - confirm against callers.
   */
  class SplitWord
  {
      /** Dictionary: word => second space-separated field of its dictionary line. */
      public $TagDic = array();

      /** Dictionary: [byte length of word][word] => third field of its dictionary line. */
      public $RankDic = array();

      /** Normalised source text (output of UpdateStr). */
      public $SourceStr = '';

      /** Segmentation result built by SplitRMM. */
      public $ResultStr = '';

      /** Separator placed between tokens. SplitRMM tokenises on a literal space. */
      public $SplitChar = ' ';

      /** Double-byte runs of at most this many bytes are kept whole (not dictionary-split). */
      public $SplitLen = 4;

      /** Highest byte index of the longest dictionary word (longest word = MaxLen + 1 bytes). */
      public $MaxLen = 7;

      /** Highest byte index of the shortest multi-character word (MinLen + 1 bytes). */
      public $MinLen = 3;

      /** Optional stream handle released by Clear(); never opened by this class. */
      public $QuickDic = null;

      /**
       * Legacy PHP 4 style entry point, kept so existing explicit calls still work.
       */
      public function SplitWord()
      {
          $this->__construct();
      }

      /**
       * Preloads the dictionary file "ppldic.csv" located next to this source file.
       *
       * Each line is split on single spaces: field 0 is the word, field 1 is
       * stored in TagDic and field 2 in RankDic. A missing or unreadable file
       * raises an E_USER_WARNING and leaves the dictionaries empty.
       */
      public function __construct()
      {
          $dicfile = dirname(__FILE__) . "/ppldic.csv";
          if (!is_file($dicfile) || !is_readable($dicfile)) {
              trigger_error("SplitWord: dictionary file is not readable: " . $dicfile, E_USER_WARNING);
              return;
          }

          $fp = fopen($dicfile, 'r');
          if ($fp === false) {
              trigger_error("SplitWord: cannot open dictionary file: " . $dicfile, E_USER_WARNING);
              return;
          }

          // Compare against false explicitly: a line consisting of "0" is falsy
          // and would otherwise end the loop early.
          while (($line = fgets($fp, 256)) !== false) {
              // Strip the line ending so it does not end up inside the last field.
              $ws = explode(' ', rtrim($line, "\r\n"));
              $word = $ws[0];
              if ($word === '') {
                  continue;
              }
              $this->TagDic[$word] = isset($ws[1]) ? $ws[1] : '';
              $this->RankDic[strlen($word)][$word] = isset($ws[2]) ? $ws[2] : '';
          }
          fclose($fp);
      }

      /**
       * Releases the optional QuickDic handle if one was attached.
       *
       * Guarded with is_resource() because fclose() on a non-resource throws a
       * TypeError on PHP 8, which the "@" operator cannot silence.
       */
      public function Clear()
      {
          if (is_resource($this->QuickDic)) {
              fclose($this->QuickDic);
          }
          $this->QuickDic = null;
      }

      /**
       * Sets (and normalises) the text to segment and clears any previous result.
       *
       * @param string $str raw input text
       */
      public function SetSource($str)
      {
          $this->SourceStr = $this->UpdateStr($str);
          $this->ResultStr = "";
      }

      /**
       * Tells whether a token starts with a single-byte (<= 0x80) character.
       *
       * @param string $str token to inspect
       * @return bool|string true for single-byte start, false for a high lead
       *                     byte, "" (empty string) when $str is empty
       */
      public function NotGBK($str)
      {
          if ($str == "") {
              return "";
          }
          return !(ord($str[0]) > 0x80);
      }

      /**
       * Segments the given text (or the text previously passed to SetSource).
       *
       * Tokens are processed right to left and prepended to the result, so the
       * output keeps the source order; every token is followed by $SplitChar.
       * Single-byte tokens (words, numbers, symbols) and double-byte runs of at
       * most $SplitLen bytes are emitted unchanged; longer double-byte runs are
       * split by RunRMM().
       *
       * @param string $str text to segment; "" reuses the current source
       * @return string segmented text, "" when there is nothing to segment
       */
      public function SplitRMM($str = "")
      {
          if ($str != "") {
              $this->SetSource($str);
          }
          if ($this->SourceStr == "") {
              return "";
          }

          $this->SourceStr = $this->UpdateStr($this->SourceStr);
          // Start from a clean result so repeated calls do not accumulate output.
          $this->ResultStr = "";

          $spc = $this->SplitChar;
          $spwords = explode(" ", $this->SourceStr);

          for ($i = count($spwords) - 1; $i >= 0; $i--) {
              $token = $spwords[$i];
              if ($token === "") {
                  continue;
              }

              if ($this->NotGBK($token) || strlen($token) <= $this->SplitLen) {
                  // Nothing to look up: keep the token instead of dropping it.
                  $piece = $token;
              } else {
                  $piece = $this->RunRMM($token);
              }
              $this->ResultStr = $piece . $spc . $this->ResultStr;
          }

          return $this->ResultStr;
      }

      /**
       * Splits one double-byte run into words by reverse dictionary matching.
       *
       * Scans from the last byte towards the first, at each position trying the
       * longest candidate (up to MaxLen + 1 bytes) and shrinking by two bytes
       * until IsWord() accepts one. If nothing matches, the trailing two-byte
       * character is emitted on its own so the scan always advances.
       *
       * @param string $str run of two-byte characters
       * @return string words in source order, formatted by otherword()
       */
      public function RunRMM($str)
      {
          $WordArray = array();
          $i = strlen($str) - 1;

          while ($i >= 0) {
              // Few enough bytes left: finish the head of the string in one step.
              if ($i <= $this->MinLen) {
                  $head = substr($str, 0, $i + 1);
                  if ($this->IsWord($head)) {
                      $WordArray[] = $head;
                  } else {
                      // Emit the remaining two-byte characters right to left.
                      for ($p = $i; $p >= 0; $p -= 2) {
                          $start = ($p >= 1) ? $p - 1 : 0;
                          $WordArray[] = substr($str, $start, $p - $start + 1);
                      }
                  }
                  break;
              }

              $maxPos = ($i >= $this->MaxLen) ? $this->MaxLen : $i;
              $isMatch = false;
              for ($j = $maxPos; $j >= 0; $j -= 2) {
                  $w = substr($str, $i - $j, $j + 1);
                  if ($this->IsWord($w)) {
                      $WordArray[] = $w;
                      $i = $i - $j - 1;
                      $isMatch = true;
                      break;
                  }
              }

              if (!$isMatch) {
                  // No dictionary hit at this position: consume one two-byte
                  // character, otherwise $i never changes and the loop never ends.
                  $start = ($i >= 1) ? $i - 1 : 0;
                  $WordArray[] = substr($str, $start, $i - $start + 1);
                  $i = $start - 1;
              }
          }

          return $this->otherword($WordArray);
      }

      /**
       * Joins words collected in reverse order into the output format.
       *
       * Each word is written as "<SplitChar><word>," walking the array from the
       * last element to the first; the leading separator is then replaced by a
       * comma, giving e.g. ",w1, w2, w3,".
       *
       * @param array $WordArray words in reverse source order
       * @return string formatted words, "" for an empty array
       */
      public function otherword($WordArray)
      {
          $spc = $this->SplitChar;
          $rsStr = "";
          for ($i = count($WordArray) - 1; $i >= 0; $i--) {
              $rsStr .= $spc . $WordArray[$i] . ",";
          }
          // Quote the separator so regex metacharacters in SplitChar stay literal.
          return preg_replace("/^" . preg_quote($spc, "/") . "/", ",", $rsStr);
      }

      /**
       * Checks whether a candidate exists in the preloaded dictionary.
       *
       * MaxLen is a maximum byte index, so the longest admissible word is
       * MaxLen + 1 bytes long.
       *
       * @param string $okWord candidate word
       * @return bool
       */
      public function IsWord($okWord)
      {
          $slen = strlen($okWord);
          if ($slen > $this->MaxLen + 1) {
              return false;
          }
          return isset($this->RankDic[$slen][$okWord]);
      }

      /**
       * Normalises text so that runs of different character classes are
       * separated by $SplitChar.
       *
       * Classes tracked in $prechar: 0 blank, 1 single-byte word character,
       * 2 two-byte character, 3 symbol. Control characters and spaces become a
       * single separator, except CR/LF, which are removed without inserting one.
       * Two-byte codes strictly between 0xA13F and 0xAA40 are treated as symbols.
       * A dangling lead byte at the very end of the input is discarded.
       *
       * @param string $str raw text
       * @return string normalised text, '' for empty input
       */
      public function UpdateStr($str)
      {
          $spc = $this->SplitChar;
          $slen = strlen($str);
          if ($slen == 0) {
              return '';
          }

          $okstr = '';
          $prechar = 0; // 0-blank 1-English 2-Chinese 3-Symbol

          for ($i = 0; $i < $slen; $i++) {
              $ch = $str[$i];
              $code = ord($ch);

              if ($code < 0x81) {
                  if ($code < 33) {
                      // Blank / control character.
                      if ($prechar != 0 && $ch != "\r" && $ch != "\n") {
                          $okstr .= $spc;
                      }
                      $prechar = 0;
                  } elseif (preg_match('/[^0-9a-zA-Z@.%#:&_-]/', $ch)) {
                      // Single-byte symbol: always stands alone.
                      if ($prechar != 0) {
                          $okstr .= $spc;
                      }
                      $okstr .= $ch;
                      $prechar = 3;
                  } else {
                      // Single-byte word character.
                      if ($prechar == 2 || $prechar == 3) {
                          $okstr .= $spc;
                      }
                      $okstr .= $ch;
                      $prechar = 1;
                  }
                  continue;
              }

              if (!isset($str[$i + 1])) {
                  // Lead byte without a trail byte: nothing usable is left.
                  break;
              }

              $c = $ch . $str[$i + 1];
              $n = hexdec(bin2hex($c));
              if ($n > 0xA13F && $n < 0xAA40) {
                  // Two-byte symbol / punctuation.
                  if ($prechar != 0) {
                      $okstr .= $spc;
                  }
                  $okstr .= $c;
                  $prechar = 3;
              } else {
                  // Two-byte character: separate it from single-byte text and symbols.
                  if ($prechar == 1 || $prechar == 3) {
                      $okstr .= $spc;
                  }
                  $okstr .= $c;
                  $prechar = 2;
              }
              $i++; // skip the trail byte
          }

          return $okstr;
      }
  }
  // Usage example: build the segmenter (its constructor loads ppldic.csv from
  // the directory of this file) and output the segmented form of a sample string.
  // Dictionary format as stated by the original listing: one entry per line,
  // written as word, a space, a number, then a line break.
  $split = new SplitWord();
  print $split->SplitRMM("php search technology");
Copy code


Related labels:
source:php.cn
Statement of this Website
The content of this article is voluntarily contributed by netizens, and the copyright belongs to the original author. This site does not assume corresponding legal responsibility. If you find any content suspected of plagiarism or infringement, please contact admin@php.cn
Popular Tutorials
More>
Latest Downloads
More>
Web Effects
Website Source Code
Website Materials
Front End Template