RMM word segmentation algorithm class - PHP Tutorial - php.cn

RMM word segmentation algorithm class

WBOY

Released: 2016-07-25 08:47:54

Original

1022 people have browsed it

A PHP class that implements the RMM (reverse maximum matching) word segmentation algorithm.

//RMM word segmentation algorithm
/**
 * Dictionary-driven word segmenter using reverse maximum matching (RMM).
 *
 * The class works on raw bytes and is written for a double-byte encoding
 * (GBK-style): every byte above 0x80 is treated as the lead byte of a
 * two-byte character, and all lengths and offsets below are byte counts.
 * It is therefore not suitable for UTF-8 input.
 */
class SplitWord
{
    /** Map of word => second column of its dictionary line. */
    public $TagDic = array();

    /** Map of byte length => (word => third dictionary column); used for lookups. */
    public $RankDic = array();

    /** Normalised source text (see UpdateStr()). */
    public $SourceStr = '';

    /** Result of the most recent SplitRMM() call. */
    public $ResultStr = '';

    /** Separator inserted between tokens. */
    public $SplitChar = ' ';

    /** Double-byte segments of at most this many bytes are emitted without a dictionary lookup. */
    public $SplitLen = 4;

    /** Highest byte index of the longest dictionary word, i.e. longest word is MaxLen + 1 bytes. */
    public $MaxLen = 7;

    /**
     * Highest byte index of the shortest dictionary word.
     * Kept for backward compatibility; RunRMM() no longer needs a special
     * case for short tails because its main loop handles them.
     */
    public $MinLen = 3;

    /**
     * Legacy PHP 4 style constructor, kept so that existing explicit calls
     * to SplitWord() keep working. It simply (re)loads the dictionary.
     *
     * @param string|null $dicfile Optional dictionary path, see __construct().
     */
    function SplitWord($dicfile = null)
    {
        $this->__construct($dicfile);
    }

    /**
     * Preloads the dictionary so that lookups during segmentation are cheap.
     *
     * Each dictionary line is split on single spaces: column 0 is the word,
     * column 1 is stored in TagDic, column 2 (when present) in RankDic.
     * A missing or unreadable file raises an E_USER_WARNING and leaves the
     * dictionary empty instead of failing on an invalid file handle.
     *
     * @param string|null $dicfile Dictionary path; defaults to ppldic.csv
     *                             next to this file.
     */
    function __construct($dicfile = null)
    {
        if ($dicfile === null) {
            $dicfile = dirname(__FILE__) . "/ppldic.csv";
        }

        $fp = is_readable($dicfile) ? fopen($dicfile, 'r') : false;
        if ($fp === false) {
            trigger_error('SplitWord: cannot read dictionary file ' . $dicfile, E_USER_WARNING);
            return;
        }

        // Compare against false explicitly so a line such as "0" does not end the loop.
        while (($line = fgets($fp)) !== false) {
            // Strip the line break; otherwise it ends up inside the last column.
            $line = trim($line, " \t\r\n");
            if ($line === '') {
                continue;
            }

            $ws = explode(' ', $line);
            $word = $ws[0];
            if ($word === '') {
                continue;
            }

            $this->TagDic[$word] = isset($ws[1]) ? $ws[1] : '';
            // Never store null here: IsWord() relies on isset(), which is
            // false for null values, so a missing third column must still
            // produce a non-null entry.
            $this->RankDic[strlen($word)][$word] = isset($ws[2]) ? $ws[2] : '';
        }

        fclose($fp);
    }

    /**
     * Releases resources.
     *
     * The dictionary file is closed as soon as it has been loaded, so there
     * is no open handle left to release. The method is kept as a no-op so
     * existing callers continue to work.
     */
    function Clear()
    {
    }

    /**
     * Sets the text to segment and clears the previous result.
     *
     * @param string $str Raw input text.
     */
    function SetSource($str)
    {
        $this->SourceStr = $this->UpdateStr($str);
        $this->ResultStr = "";
    }

    /**
     * Tells whether a token starts with a single-byte (ASCII) character.
     *
     * Only the first byte is inspected.
     *
     * @param string $str Token to test.
     * @return bool|string true when the first byte is <= 0x80, false when it
     *                     is a double-byte lead byte, and '' for an empty
     *                     string (historic return value, kept as is).
     */
    function NotGBK($str)
    {
        $str = (string) $str;
        if ($str === '') {
            return "";
        }
        return ord($str[0]) <= 0x80;
    }

    /**
     * Segments text with reverse maximum matching.
     *
     * The text is normalised into SplitChar-separated segments first.
     * Single-byte segments and double-byte segments of at most SplitLen
     * bytes are emitted unchanged; longer double-byte segments are split by
     * RunRMM(). Every emitted piece is followed by SplitChar.
     *
     * @param string $str Text to segment; when empty, the text previously
     *                    supplied through SetSource()/SourceStr is used.
     * @return string The segmented text (also stored in ResultStr).
     */
    function SplitRMM($str = "")
    {
        if ((string) $str !== '') {
            $this->SetSource($str);
        } else {
            // Covers callers that assigned SourceStr directly.
            $this->SourceStr = $this->UpdateStr($this->SourceStr);
        }

        // Start from a clean result so repeated calls do not accumulate output.
        $this->ResultStr = '';
        if ($this->SourceStr === '') {
            return "";
        }

        $spc = (string) $this->SplitChar;
        // UpdateStr() separates segments with SplitChar, so split on the same value.
        $segments = ($spc === '') ? array($this->SourceStr) : explode($spc, $this->SourceStr);

        $result = '';
        foreach ($segments as $segment) {
            if ($segment === '') {
                continue;
            }
            if ($this->NotGBK($segment) || strlen($segment) <= $this->SplitLen) {
                $result .= $segment . $spc;
            } else {
                $result .= $this->RunRMM($segment) . $spc;
            }
        }

        $this->ResultStr = $result;
        return $this->ResultStr;
    }

    /**
     * Splits an all-double-byte string by reverse dictionary matching.
     *
     * Starting at the end of the string, the longest dictionary word (at
     * most MaxLen + 1 bytes, shrinking two bytes at a time) ending at the
     * current position is taken. When nothing matches, the trailing
     * two-byte character becomes a token of its own, which guarantees the
     * scan always moves towards the start of the string.
     *
     * @param string $str Segment to split.
     * @return string Tokens formatted by otherword().
     */
    function RunRMM($str)
    {
        $str = (string) $str;
        $words = array(); // collected right-to-left
        $i = strlen($str) - 1;

        while ($i >= 0) {
            $maxPos = min($i, $this->MaxLen);
            $matched = false;

            for ($j = $maxPos; $j >= 0; $j -= 2) {
                $candidate = substr($str, $i - $j, $j + 1);
                if ($this->IsWord($candidate)) {
                    $words[] = $candidate;
                    $i -= $j + 1;
                    $matched = true;
                    break;
                }
            }

            if (!$matched) {
                // No dictionary word ends here: emit one character (two
                // bytes, or the single leftover byte of an odd-length input).
                $start = max(0, $i - 1);
                $words[] = substr($str, $start, $i - $start + 1);
                $i = $start - 1;
            }
        }

        return $this->otherword($words);
    }

    /**
     * Formats tokens collected right-to-left into left-to-right output.
     *
     * Each token is written as SplitChar + token + ",", and the leading
     * SplitChar of the whole string is then replaced by a comma, e.g.
     * ",first, second,".
     *
     * @param array $WordArray Tokens in reverse (right-to-left) order.
     * @return string Formatted token list; '' for an empty list when
     *                SplitChar is non-empty.
     */
    function otherword($WordArray)
    {
        $spc = (string) $this->SplitChar;
        $rsStr = "";
        foreach (array_reverse($WordArray) as $word) {
            $rsStr .= $spc . $word . ",";
        }
        // Quote the separator so a regex metacharacter in SplitChar is matched literally.
        return preg_replace('/^' . preg_quote($spc, '/') . '/', ",", $rsStr);
    }

    /**
     * Tells whether a byte string is a dictionary word.
     *
     * @param string $okWord Candidate word.
     * @return bool true when the word was loaded from the dictionary.
     */
    function IsWord($okWord)
    {
        $slen = strlen($okWord);
        // MaxLen is a byte index, so the longest valid word has MaxLen + 1 bytes.
        if ($slen > $this->MaxLen + 1) {
            return false;
        }
        return isset($this->RankDic[$slen][$okWord]);
    }

    /**
     * Normalises text into SplitChar-separated segments.
     *
     * Runs of ASCII word characters (digits, letters and @.%#:&_-) stay
     * together, every other ASCII symbol and every double-byte punctuation
     * mark (code 0xA140-0xAA3F) becomes a segment of its own, and runs of
     * other double-byte characters stay together. Control characters and
     * spaces end the current segment; CR and LF are dropped without
     * inserting a separator. A dangling lead byte at the very end of the
     * input is discarded.
     *
     * @param string $str Raw input text.
     * @return string Normalised text.
     */
    function UpdateStr($str)
    {
        $str = (string) $str;
        $spc = $this->SplitChar;
        $len = strlen($str);
        $out = '';
        $prev = 0; // kind of previous character: 0 blank, 1 ASCII word, 2 double-byte, 3 symbol

        for ($i = 0; $i < $len; $i++) {
            $ch = $str[$i];
            $code = ord($ch);

            if ($code < 0x81) {
                if ($code < 33) {
                    // Blank or control character.
                    if ($prev != 0 && $ch !== "\r" && $ch !== "\n") {
                        $out .= $spc;
                    }
                    $prev = 0;
                    continue;
                }

                if (preg_match('/[^0-9a-zA-Z@.%#:&_-]/', $ch)) {
                    // Symbol: always separated from whatever precedes it.
                    if ($prev != 0) {
                        $out .= $spc;
                    }
                    $prev = 3;
                } else {
                    // Word character: separated only from double-byte text and symbols.
                    if ($prev == 2 || $prev == 3) {
                        $out .= $spc;
                    }
                    $prev = 1;
                }
                $out .= $ch;
                continue;
            }

            // Double-byte character: leaving ASCII text or a symbol needs a separator.
            if ($prev != 0 && $prev != 2) {
                $out .= $spc;
            }
            if (!isset($str[$i + 1])) {
                continue; // dangling lead byte
            }

            $next = $str[$i + 1];
            $n = ($code << 8) | ord($next);
            if ($n > 0xA13F && $n < 0xAA40) {
                // Punctuation: separate it from preceding double-byte text
                // (a separator was already added above for kinds 1 and 3).
                if ($prev == 2) {
                    $out .= $spc;
                }
                $prev = 3;
            } else {
                $prev = 2;
            }
            $out .= $ch . $next;
            $i++; // skip the trail byte
        }

        return $out;
    }
}
// Usage example: create the segmenter (this loads ppldic.csv from the
// directory of this file) and output the segmentation of a sample string.
$split = new SplitWord();
print $split->SplitRMM("php search technology");
// Dictionary format: each line of ppldic.csv is a word, a space, a number and a line break (\n).

Copy code