RMM(逆向最大匹配)分词算法代码片段
Jun 08, 2016 pm 05:28 PMfunction SplitRMM($str=""){
if($str!="") $this->SetSource(trim($str));
if($this->SourceString=="") return "";
//对文本进行粗分
$this->SourceString = $this->ReviseString($this->SourceString);
//对特定文本进行分离
$spwords = explode(" ",$this->SourceString);
$spLen = count($spwords);
$spc = $this->SplitChar;
for($i=($spLen-1);$i>=0;$i--){
if(trim($spwords[$i])=="") continue;
if($this->NotGBK($spwords[$i])){
if(ereg("[^0-9.+-]",$spwords[$i]))
{ $this->ResultString = $spwords[$i].$spc.$this->ResultString; }
else
{
$nextword = "";
@$nextword = substr($this->ResultString,0,strpos($this->ResultString," "));
if(ereg("^".$this->CommonUnit,$nextword)){
$this->ResultString = $spwords[$i].$this->ResultString;
}else{
$this->ResultString = $spwords[$i].$spc.$this->ResultString;
}
}
}
else
{
$c = $spwords[$i][0].$spwords[$i][1];
$n = hexdec(bin2hex($c));
if($c=="《") //书名
{ $this->ResultString = $spwords[$i].$spc.$this->ResultString; }
else if($n>0xA13F && $n
{ $this->ResultString = $spwords[$i].$spc.$this->ResultString; }
else //正常短句
{
if(strlen($spwords[$i]) SplitLen)
{
//如果结束符为特殊分割词,分离处理
if(ereg($this->EspecialChar."$",$spwords[$i],$regs)){
$spwords[$i] = ereg_replace($regs[0]."$","",$spwords[$i]).$spc.$regs[0];
}
//是否为常用单位
if(!ereg("^".$this->CommonUnit,$spwords[$i]) || $i==0){
$this->ResultString = $spwords[$i].$spc.$this->ResultString;
}else{
$this->ResultString = $spwords[$i-1].$spwords[$i].$spc.$this->ResultString;
$i--;
}
}
else
{
$this->ResultString = $this->RunRMM($spwords[$i]).$spc.$this->ResultString;
}
}
}
}
return $this->ResultString;
}

热门文章

热门文章

热门文章标签

Notepad++ 7.3.1
好用且免费的代码编辑器

SublimeText3汉化版
中文版,非常好用

Zend Studio 13.0.1
功能强大的PHP集成开发环境

Dreamweaver CS6
视觉化网页开发工具

SublimeText3 Mac版
神级代码编辑软件(SublimeText3)

修复 Windows 11/10 中的 OOBELANGUAGE“出现问题”错误
