-
/**
 * Banned word filtering.
 *
 * Scans a text for banned words. The banned words are loaded in id-range
 * batches ("pages"); each batch is turned into a character trie that is
 * cached through the client returned by Rds::factory().
 *
 * Original author's benchmark note: about 0.05 seconds per article.
 *
 * NOTE(review): assumes the Rds client returns false on a cache miss and can
 * store/restore PHP arrays via setex()/get() - confirm against the wrapper.
 * NOTE(review): banned words are inserted into the trie as stored, while the
 * scanned text is reduced to ASCII letters, digits and U+4E00..U+9FA5. A
 * banned word containing any other character can therefore never match.
 *
 * @author liuxu
 */
class Logic_BlackWord
{
    // Application identifiers. They are not referenced inside this class.
    const APP_FORUM = 1;
    const APP_BLOG = 2;
    const APP_VOTE = 3;

    /** Number of banned-word ids covered by one trie batch. */
    const PAGE_SIZE = 1000;

    /** Maximum number of characters examined for a single match. */
    const MAX_WORD_LENGTH = 50;

    /** Cache lifetime, in seconds, for the max id and for each trie batch. */
    const CACHE_TTL = 300;

    /**
     * Find every banned word contained in the text, grouped by word type.
     *
     * @param string $txt Text to scan; HTML tags and every character other
     *                    than ASCII letters, digits and U+4E00..U+9FA5 are
     *                    removed before matching.
     * @return array Map of type => list of matched words (as strings).
     *               Empty array when nothing matched.
     */
    public function getHitList($txt)
    {
        // Normalise and split once; the result is the same for every batch.
        $chars = $this->splitChars($this->normalizeText($txt));
        if (!$chars) {
            return array();
        }

        $hitList = array();

        // Filter against the banned words batch by batch.
        $max = $this->getMax();
        if ($max) {
            $size = self::PAGE_SIZE;
            $last = (int) ceil($max / $size);
            for ($page = 1; $page <= $last; $page++) {
                // Assign key by key instead of array_merge(): a purely numeric
                // word such as "12345" is stored under an integer key, and
                // array_merge() would renumber it, losing the word.
                foreach ($this->getHitListByPage($chars, $page, $size) as $hit => $type) {
                    $hitList[$hit] = $type;
                }
            }
        }

        $grouped = array();
        foreach ($hitList as $hit => $type) {
            // Cast back to string: numeric words come out of the array as ints.
            $grouped[$type][] = (string) $hit;
        }

        return $grouped;
    }

    /**
     * Get the highest banned-word id, read through the cache.
     *
     * @return int Highest id, or 0 when the lookup returned no usable row.
     */
    private function getMax()
    {
        $redis = Rds::factory();
        $memKey = 'blackWord_max';
        $max = $redis->get($memKey);
        if ($max === false) {
            $max = 0;
            $blackWord = new Model_BlackWord_BlackWord();
            $para = array('field' => "MAX(id) AS max");
            $result = $blackWord->search($para);
            if (isset($result[0]['max'])) {
                $max = $result[0]['max'];
            }

            $redis->setex($memKey, self::CACHE_TTL, $max);
        }

        return (int) $max;
    }

    /**
     * Match an already normalised character list against one trie batch.
     *
     * @param array $chars Characters of the normalised text, one per element.
     * @param int   $page  1-based batch number.
     * @param int   $size  Number of ids per batch.
     * @return array Map of matched word => type for this batch.
     */
    private function getHitListByPage(array $chars, $page = 1, $size = self::PAGE_SIZE)
    {
        $hitList = array();

        // Fetch the trie for this batch.
        $wordTree = $this->getWordTreeByPage($page, $size);
        if (!is_array($wordTree) || !$wordTree) {
            return $hitList;
        }

        $len = count($chars);
        for ($i = 0; $i < $len; $i++) {
            // Only walk the trie when some banned word starts with this character.
            if (isset($wordTree[$chars[$i]])) {
                foreach ($this->getHitListByTree($chars, $i, $wordTree) as $hit => $type) {
                    $hitList[$hit] = $type;
                }
            }
        }

        return $hitList;
    }

    /**
     * Collect every banned word that starts at a given position.
     *
     * Walks the trie one character at a time and records a hit at each node
     * that carries a 'type' entry, so both a word and a longer word sharing
     * its prefix are reported.
     *
     * @param array $chars    Characters of the normalised text.
     * @param int   $start    Index in $chars where matching begins.
     * @param array $wordTree Trie of banned words for one batch.
     * @return array Map of matched word => type.
     */
    private function getHitListByTree(array $chars, $start, $wordTree)
    {
        $hitList = array();
        $node = $wordTree;
        $hit = '';
        $end = min(count($chars), $start + self::MAX_WORD_LENGTH);

        for ($i = $start; $i < $end; $i++) {
            $char = $chars[$i];
            if (!isset($node[$char])) {
                break;
            }

            $hit .= $char;
            $node = $node[$char];

            if (isset($node['type'])) { // a complete banned word ends here
                $hitList[$hit] = $node['type'];
            }
        }

        return $hitList;
    }

    /**
     * Get the banned-word trie for one id batch, read through the cache.
     *
     * Each trie level is keyed by a single character; a node that ends a
     * banned word additionally holds that word's type under the key 'type'.
     *
     * @param int $page 1-based batch number; covers ids in
     *                  (($page-1)*$size, $page*$size].
     * @param int $size Number of ids per batch.
     * @return array The trie; empty when the batch has no usable words.
     */
    private function getWordTreeByPage($page = 1, $size = self::PAGE_SIZE)
    {
        $page = (int) $page;
        $size = (int) $size;

        $redis = Rds::factory();
        $memKey = 'blackWord_tree_' . $page . '_' . $size;
        $wordTree = $redis->get($memKey);
        if ($wordTree === false) {
            $wordTree = array();
            $blackWord = new Model_BlackWord_BlackWord();
            $start = ($page - 1) * $size;
            $end = $start + $size;
            $para = array('where' => "status=1 AND id>" . $start . " AND id<=" . $end);
            $result = $blackWord->search($para);
            if ($result) {
                foreach ($result as $value) {
                    if (empty($value['word'])) {
                        continue;
                    }

                    $wordChars = $this->splitChars($value['word']);
                    if (!$wordChars) {
                        // Unsplittable word (e.g. invalid UTF-8): skip it so
                        // no 'type' entry is written onto the trie root.
                        continue;
                    }

                    // Descend from the root, creating nodes as needed.
                    $point = &$wordTree;
                    foreach ($wordChars as $char) {
                        $point = &$point[$char];
                    }

                    $point['type'] = $value['type'];
                    unset($point); // drop the reference before the next word
                }
            }

            $redis->setex($memKey, self::CACHE_TTL, $wordTree);
        }

        return $wordTree;
    }

    /**
     * Strip HTML tags and every character that is not an ASCII letter, a
     * digit or in the range U+4E00..U+9FA5.
     *
     * @param string $txt Raw text.
     * @return string Normalised text; empty string when it cannot be cleaned.
     */
    private function normalizeText($txt)
    {
        $pattern = '/[^a-zA-Z0-9\x{4e00}-\x{9fa5}]/iu';

        $txt = strip_tags((string) $txt);
        $clean = preg_replace($pattern, '', $txt);
        if ($clean === null) {
            // preg_replace() with /u fails on invalid UTF-8. Without this
            // retry a single stray byte would make the whole text unscannable
            // and bypass the filter. Re-encoding substitutes the invalid
            // sequences, which the pattern then removes.
            $clean = preg_replace($pattern, '', mb_convert_encoding($txt, 'UTF-8', 'UTF-8'));
        }

        return $clean === null ? '' : $clean;
    }

    /**
     * Split a UTF-8 string into a list of single characters.
     *
     * @param string $txt UTF-8 text.
     * @return array List of characters; empty for '' or unsplittable input.
     */
    private function splitChars($txt)
    {
        $chars = preg_split('//u', (string) $txt, -1, PREG_SPLIT_NO_EMPTY);

        return $chars === false ? array() : $chars;
    }
}
-
-
复制代码
|