PHP 环境。这里指出分句时的一个常见误区:分句不必考虑小数点,也不必考虑域名,因为规范的英文句子在句号后面一定跟着空格;唯一需要特殊处理的是 "Mr. Li" 这类缩写。之所以先按段落分割,是考虑到有些引用以冒号结尾。
- /*TWWY'S ART*/
/**
 * Split a text into paragraphs on line breaks.
 *
 * Any run of carriage-return / line-feed characters is treated as a single
 * paragraph boundary, so "\r\n", "\r" and "\n" line endings are all handled
 * and blank lines never yield empty paragraphs.
 *
 * @param string $text Text to split.
 * @return array List of non-empty paragraphs in their original order.
 */
function break_passage($text){
    // The pattern must use the escape sequences \r and \n; the bare letters
    // "r" and "n" would split in the middle of ordinary words.
    return preg_split('/[\r\n]+/', $text, -1, PREG_SPLIT_NO_EMPTY);
}
/**
 * Split a paragraph of English text into sentences.
 *
 * A sentence boundary is whitespace that directly follows ".", "!" or "?"
 * (optionally followed by a closing quote). Decimal points and domain names
 * are left intact because no whitespace follows their dots. Whitespace after
 * a known title abbreviation (Mr., Mrs., Ms., Jr., Dr., Prof., Sr.) is not
 * treated as a boundary.
 *
 * @param string $text Paragraph to split.
 * @return array List of non-empty sentences; the separating whitespace is removed.
 */
function break_sentence($text){
    // Extended mode (x) ignores the layout whitespace and "#" comments inside
    // the pattern; matching is case-insensitive (i).
    $re = '/
        (?<=                # Positive lookbehind: the split point must follow
            [.!?]           #   sentence-ending punctuation,
          | [.!?][\'"]      #   or that punctuation plus a closing quote.
        )
        (?<!                # Negative lookbehind: skip known abbreviations.
            \bMr\.          #   The word boundary keeps ordinary words that
          | \bMrs\.         #   merely end in the same letters, such as
          | \bMs\.          #   "items." under the i flag, from being
          | \bJr\.          #   mistaken for an abbreviation.
          | \bDr\.
          | \bProf\.
          | \bSr\.
        )
        \s+                 # Consume the whole whitespace run between sentences.
        /ix';
    $sentences = preg_split($re, $text, -1, PREG_SPLIT_NO_EMPTY);
    return $sentences;
}
/**
 * Split a text into sentences, paragraph by paragraph (recommended entry point).
 *
 * The text is first divided with break_passage(), then each paragraph is
 * divided with break_sentence(); the results are concatenated in order.
 *
 * @param string $text Text to split.
 * @return array Flat list of sentences from all paragraphs.
 */
function get_sentence($text){
    $sentences = array();
    $paragraphs = break_passage($text);
    foreach ($paragraphs as $paragraph) {
        // Append this paragraph's sentences to the running list.
        $sentences = array_merge($sentences, break_sentence($paragraph));
    }
    return $sentences;
}
-
- ?>
复制代码
|
本網站聲明
本文內容由網友自願投稿,版權歸原作者所有。本站不承擔相應的法律責任。如發現涉嫌抄襲或侵權的內容,請聯絡admin@php.cn
作者最新文章
-
2024-10-22 09:46:29
-
2024-10-13 13:53:41
-
2024-10-12 12:15:51
-
2024-10-11 22:47:31
-
2024-10-11 19:36:51
-
2024-10-11 15:50:41
-
2024-10-11 15:07:41
-
2024-10-11 14:21:21
-
2024-10-11 12:59:11
-
2024-10-11 12:17:31