python - php查找算法

Question

一个文件有30w条数据！每行一个数据 同行词！比如post stop tops这样的是同行词你用什么办法把里面的所有这样数据找出来 求个思想

phpcn_u1582 · Answer

使用linux命令来完成需求。例子

统计文件夹下包含Action( 数量
grep Action\( ~/www/pms/app/app/controllers/*.php | wc -l

PHPz · Answer

我的建议是写一个特别的排序算法，然后用usort来排序，这样同行词都排在一起了，然后依序输出

排序算法大致的逻辑是

int cmp($left, $right) {
   //如果长度都不一致，直接放弃
   if(strlen($left) != strlen($right))
       return strcmp($left, $right);
   //长度一致的，按照字符切分，统计，判断是否一致
   $arrleft = str_split($left);
   $arrright = str_split($right);
   $leftstat = array();
   $rightstat = array();
   foreach($arrleft as $char) {
        if(array_key_exists($char, $leftstat))
            $leftstat[$char]++;
        else
            $leftstat[$char]=0;
   }
   foreach($arrright as $char) {
        //逻辑类似
   }
   //比较两个数组的统计是否一致
   if(count(array_diff_assoc($leftstat, $rightstat)) == 0)
       return 0;
   else
       return strcmp($left, $right);
}

1、排序30w行的数据，使用usort + 上面cmp函数

2、从第2行到尾遍历排序的数据，判断本行和上一行是否一致，是：输出，不是，向下走。

大概吧。随手写的

PHP中文网 · Answer

isStr = false;
        $this->next = [];
    }
}


///构建Tries Tree
class Helper
{
    public $treeRoot;

    public $debug = false;///此处开启是否以字符为索引

    public function __construct()
    {
        $this->treeRoot = new TreeNode();
    }

    /**
     * @param $str
     */
    public function insert($str)
    {
        $str = strtolower($str);///将所有的字符都作为小写存储

        $node = $this->treeRoot;
        for ($i = 0; $i < strlen($str); ++$i) {
            $index = $this->char2index($str{$i});
//            $index = $str{$i};
            if (empty($node->next[$index])) {
                $node->next[$index] = new TreeNode();
            }
            $node = $node->next[$index];
        }
        $node->isStr = true;
    }

    private function char2index($ch)
    {
        return ($this->debug) ? $ch : intval(ord($ch) - ord('a'));
    }

    private function index2char($index)
    {
        return ($this->debug) ? $index : chr($index + ord('a'));
    }

    /**
     * 查找对应的字符串的同形词
     * @param $str
     * @return array
     */
    public function find($str)
    {
        $result = [];

        $str = strtolower($str);///将所有的字符都作为小写存储

        $nextStr = ''; ///从后向前，逐渐追加字符，查找对应的数据
        for ($i = strlen($str) - 1; $i >= 0; --$i) {
            ///这里可以设置阈值，比如当需要找的字符串长度 > 2
            /// if(strlen($nextStr) < 2) continue;
            $nextStr = $str{$i} . $nextStr;

            $result = array_merge($result, $this->getResult($nextStr));
        }
        return array_unique($result);
    }

    /**
     * 找到对应字符串开头的所有单词
     * @param $str
     * @return array
     */
    private function getResult($str)
    {
        $result = [];
        $root = $this->treeRoot;

        ///先找到 tries 树中，对应的节点，确定节点是否包含子节点
        for ($i = 0; $i < strlen($str); ++$i) {
            if (empty($root)) {
                return $result;
            }
            $index = $this->char2index($str{$i});
            $root = $root->next[$index];
        }

        ///利用队列遍历Tries 树，实现 O(n) 检索
        $queue = new SplQueue();

        ///将节点，和字符起始点，记录到数据中，后续取用
        $next = ['node' => $root, 'str' => $str];
        $queue->push($next);

        while (!$queue->isEmpty()) {
            $next = $queue->pop();

            if ($next['node']->isStr) {///确定找到的是单词后，记录到结果集
                $result[] = $next['str'];
            }

            ///将下一个可能的结果集数组，放入到队列中查找
            if (!empty($next['node']->next)) {
                foreach ($next['node']->next as $index => $item) {
                    $next = ['node' => $item, 'str' => $next['str'] . $this->index2char($index)];
                    $queue->push($next);
                }
            }
        }
        return $result;
    }
}

$helper = new Helper();

$helper->insert("is");
$helper->insert("his");
$helper->insert("her");

$helper->insert('post');
$helper->insert('top');
$helper->insert('stop');


$result = $helper->find('post');
print_r($result);

$result = $helper->find('hi');
print_r($result);