This program is a simple Chinese word segmenter based on the RMM (reverse maximum matching) approach. It still has many shortcomings, and I would welcome guidance from more experienced developers. The garbled-text (character encoding) problem has been fixed.
/**
 * Chinese word segmentation based on RMM (reverse maximum matching).
 *
 * SplitRMM() accepts and returns UTF-8. Internally the text is converted to
 * GBK so that every Chinese character occupies exactly two bytes; DealStr(),
 * pregRmmSplit(), resetWord() and IsWord() all operate on GBK byte strings
 * and step through Chinese text two bytes at a time.
 *
 * @author tangpan
 * @date 2013-10-12
 * @version 1.0.0
 */
class SplitWord
{
    /** @var array Dictionary: [byte length of GBK word][GBK word] => true */
    public $Rec_dic = array();

    /** @var string Separator inserted between tokens */
    public $Split_char = ' ';

    /** @var string Pre-processed (GBK) source string produced by SetSourceStr() */
    public $Source_str = '';

    /** @var string Segmentation result (GBK) of the most recent SplitRMM() call */
    public $Result_str = '';

    /** @var int Minimum byte length a Chinese token needs in order to be segmented */
    public $limit_lenght = 2;

    /** @var int Maximum length, in GBK bytes, of a dictionary word */
    public $Dic_maxLen = 28;

    /** @var int Minimum length, in GBK bytes, of a dictionary word */
    public $Dic_minLen = 2;

    /**
     * PHP 4 style constructor kept so existing callers keep working; it simply
     * delegates to __construct().
     *
     * @param string|null $dic_path Optional dictionary path, see __construct()
     */
    public function SplitWord($dic_path = null)
    {
        $this->__construct($dic_path);
    }

    /**
     * Preload the dictionary so lookups during segmentation are array hits.
     *
     * Each line of the dictionary file is read as UTF-8; the word is the text
     * before the first space. Words are stored GBK-encoded in $Rec_dic, indexed
     * by byte length and then by the word itself.
     *
     * @param string|null $dic_path Dictionary file; defaults to words.csv next to this file
     */
    public function __construct($dic_path = null)
    {
        if ($dic_path === null) {
            $dic_path = dirname(__FILE__) . '/words.csv';
        }

        // Best effort: without a dictionary the object still works, it just
        // falls back to single-character segmentation.
        $fp = is_readable($dic_path) ? fopen($dic_path, 'r') : false;
        if ($fp === false) {
            trigger_error('SplitWord: cannot read dictionary file ' . $dic_path, E_USER_WARNING);
            return;
        }

        // Compare against false explicitly: a line containing just "0" is falsy.
        while (($line = fgets($fp, 256)) !== false) {
            $ws = explode(' ', $line);
            $word = iconv('utf-8', 'GBK', trim($ws[0]));
            if ($word === false || $word === '') {
                continue; // blank line or a word that has no GBK representation
            }
            $this->Rec_dic[strlen($word)][$word] = true;
        }
        fclose($fp);
    }

    /**
     * Set the source string.
     *
     * Converts the UTF-8 input to GBK, runs it through DealStr() and stores
     * the result in $Source_str. If the conversion fails, $Source_str is ''.
     *
     * @param string $str The UTF-8 string to be segmented
     */
    public function SetSourceStr($str)
    {
        $gbk = iconv('utf-8', 'GBK', (string) $str);
        $this->Source_str = ($gbk === false) ? '' : $this->DealStr($gbk);
    }

    /**
     * Check whether a token starts with a (GBK) Chinese character.
     *
     * @param string $str Token to inspect
     * @return bool true when the first byte is above 0x80, false otherwise
     *              (including for blank input)
     */
    public function checkStr($str)
    {
        $str = (string) $str;
        if (trim($str) === '') {
            return false;
        }
        return ord($str[0]) > 0x80;
    }

    /**
     * RMM word segmentation entry point.
     *
     * The text is pre-processed into separator-delimited tokens; Chinese
     * tokens are segmented with pregRmmSplit(), other tokens are passed
     * through unchanged. Every token in the result is followed by
     * $Split_char (segmented words additionally carry the padding added by
     * resetWord()).
     *
     * @param string $str UTF-8 string to be processed
     * @return string|false UTF-8 segmentation result, '' for blank input,
     *                      false if the final GBK -> UTF-8 conversion fails
     */
    public function SplitRMM($str = '')
    {
        if (trim((string) $str) === '') {
            return '';
        }

        // Start from a clean slate; otherwise a second call would return the
        // previous call's output appended to the new one.
        $this->Result_str = '';
        $this->SetSourceStr($str);
        if ((string) $this->Source_str === '') {
            return '';
        }

        // DealStr() delimits tokens with $Split_char, so split on the same value.
        $delimiter = ($this->Split_char === '') ? ' ' : $this->Split_char;
        $tokens = explode($delimiter, $this->Source_str);

        // Walk backwards and prepend, so the result ends up in reading order.
        for ($i = count($tokens) - 1; $i >= 0; $i--) {
            $token = $tokens[$i];
            if (trim($token) === '') {
                continue; // consecutive separators produce empty tokens
            }
            if ($this->checkStr($token)) {
                if (strlen($token) >= $this->limit_lenght) {
                    $this->Result_str = $this->pregRmmSplit($token)
                        . $this->Split_char . $this->Result_str;
                }
            } else {
                $this->Result_str = $token . $this->Split_char . $this->Result_str;
            }
        }

        return iconv('GBK', 'utf-8', $this->Result_str);
    }

    /**
     * Segment a GBK Chinese string using reverse maximum matching.
     *
     * Starting at the end of the string, the longest dictionary word ending
     * at the current position is taken (candidates shrink one 2-byte
     * character at a time). When nothing matches, the single character at
     * that position is emitted, so no text is ever lost. A dangling trailing
     * byte (odd length) is an incomplete character and is ignored.
     *
     * @param string $str GBK string consisting of 2-byte characters
     * @return string The segmented words, formatted by resetWord()
     */
    public function pregRmmSplit($str)
    {
        $str = (string) $str;
        $end = strlen($str);
        if ($end % 2 === 1) {
            $end--;
        }
        if ($end === 0) {
            return '';
        }

        $shortest = max(2, (int) $this->Dic_minLen);
        $Split_Result = array(); // words in reverse (right-to-left) order

        while ($end > 0) {
            // Longest even byte count that fits both the text and the dictionary.
            $longest = min($end, (int) $this->Dic_maxLen);
            $longest -= $longest % 2;

            $taken = 2; // fallback: one character
            for ($len = $longest; $len >= $shortest; $len -= 2) {
                if ($this->IsWord(substr($str, $end - $len, $len))) {
                    $taken = $len;
                    break;
                }
            }

            $Split_Result[] = substr($str, $end - $taken, $taken);
            $end -= $taken;
        }

        return $this->resetWord($Split_Result);
    }

    /**
     * Join segmented words back into a single string.
     *
     * The input is expected in reverse order (as produced by pregRmmSplit());
     * it is walked from the last element to the first. Each non-blank word is
     * emitted as $Split_char . word . ' '.
     *
     * @param array $Split_Result Words in reverse order
     * @return string The joined string, or '' when the list is empty or its
     *                first element is blank
     */
    public function resetWord($Split_Result)
    {
        if (!isset($Split_Result[0]) || trim($Split_Result[0]) === '') {
            return '';
        }

        $spc = $this->Split_char;
        $ret_Str = '';
        for ($i = count($Split_Result) - 1; $i >= 0; $i--) {
            if (trim($Split_Result[$i]) !== '') {
                $ret_Str .= $spc . $Split_Result[$i] . ' ';
            }
        }
        return $ret_Str;
    }

    /**
     * Check if a word exists in the dictionary.
     *
     * @param string $okWord The (GBK) word to check
     * @return bool
     */
    public function IsWord($okWord)
    {
        $len = strlen($okWord);
        if ($len > $this->Dic_maxLen + 1) {
            return false;
        }
        // Two-level lookup: byte length first, then the word itself.
        return isset($this->Rec_dic[$len][$okWord]);
    }

    /**
     * Preliminary processing of a GBK string into separator-delimited tokens.
     *
     * - Blank/control bytes and ASCII punctuation outside the symbol set
     *   become a separator.
     * - Symbols in [@.%#:^&_-] stay attached to a preceding alphanumeric but
     *   are separated from a preceding Chinese character or symbol.
     * - Letters and digits are separated from each other and from Chinese text.
     * - 2-byte characters in the GBK symbol area (0xA140-0xAA3F) are treated
     *   as Chinese punctuation and replaced by a separator.
     * - All other 2-byte characters are copied through as Chinese text.
     *
     * @param string $str The GBK source string to be processed
     * @return string The pre-processed string ('' for empty input)
     */
    public function DealStr($str)
    {
        $str = (string) $str;
        $spc = $this->Split_char;
        $slen = strlen($str);
        if ($slen === 0) {
            return '';
        }

        $okstr = '';
        $prechar = 0; // kind of the previous character: 0-blank, 1-alphanumeric, 2-Chinese, 3-symbol

        for ($i = 0; $i < $slen; $i++) {
            $ch = $str[$i];
            $code = ord($ch);

            if ($code < 0x81) { // single-byte character
                if ($code < 33) {
                    // Blank or control character: token boundary.
                    $okstr .= $spc;
                    $prechar = 0;
                } elseif (preg_match('/[@. %#:^&_-]/', $ch)) {
                    // Keep "a.b" / "a_b" together, but split after Chinese text or another symbol.
                    if ($prechar == 2 || $prechar == 3) {
                        $okstr .= $spc;
                    }
                    $okstr .= $ch;
                    $prechar = 3;
                } elseif (preg_match('/[0-9a-zA-Z]/', $ch)) {
                    // $prechar == 1 guarantees $i > 0 and that the previous byte is
                    // really an ASCII alphanumeric (not the trail byte of a GBK char).
                    $kindSwitch = false;
                    if ($prechar == 1) {
                        $prev = $str[$i - 1];
                        $kindSwitch = (preg_match('/[0-9]/', $prev) && preg_match('/[a-zA-Z]/', $ch))
                            || (preg_match('/[a-zA-Z]/', $prev) && preg_match('/[0-9]/', $ch));
                    }
                    if ($prechar == 2 || $kindSwitch) {
                        $okstr .= $spc;
                    }
                    $okstr .= $ch;
                    $prechar = 1;
                } else {
                    // Any other ASCII punctuation: act as a token boundary so the
                    // neighbouring tokens are not glued together.
                    $okstr .= $spc;
                    $prechar = 0;
                }
                continue;
            }

            // Lead byte of a 2-byte GBK character.
            if ($prechar != 0 && $prechar != 2) {
                $okstr .= $spc; // previous character was alphanumeric or a symbol
            }
            if (isset($str[$i + 1])) {
                $next = $str[$i + 1];
                $n = ($code << 8) | ord($next);
                if ($n > 0xA13F && $n < 0xAA40) {
                    // Chinese punctuation: replace with a separator.
                    if ($prechar != 0) {
                        $okstr .= $spc;
                    }
                    $prechar = 3;
                } else {
                    $okstr .= $ch . $next;
                    $prechar = 2;
                }
                $i++; // consume the trail byte as well
            }
        }

        return $okstr;
    }

    /**
     * Release temporary data.
     *
     * Kept for backward compatibility. Because $data is received by value,
     * unset() only removes this method's local copy.
     *
     * @param mixed $data Temporary data
     */
    public function clear($data)
    {
        unset($data);
    }
}
- ?>
Copy code
|