"""Sentence splitting for mixed Chinese/English text (spaCy + regex)."""
import re

import spacy
nlp = None
|
||
# spacy.load("en_core_web_sm")
|
||
|
||
def get_nlp():
    """Return the shared spaCy pipeline, loading it on first use.

    The loaded pipeline is cached in the module-level ``nlp`` variable so
    the (expensive) ``spacy.load`` happens at most once per process.
    """
    global nlp
    if nlp is not None:
        return nlp
    cfg = getConfig()
    nlp = spacy.load(cfg.spacy_model_path)
    return nlp
def split_sentences(text: str) -> list[str]:
    """Split mixed Chinese/English text into sentences.

    English sentence boundaries are detected with the spaCy pipeline
    (which handles abbreviations such as "Mr." / "No."); each spaCy
    sentence is then further subdivided on Chinese terminators.

    Args:
        text: Raw input that may mix Chinese and English.

    Returns:
        List of stripped, non-empty sentence strings, in order.
    """
    # spaCy pass first for English boundaries.  NOTE: the local name is
    # `pipeline`, not `nlp`, to avoid shadowing the module-level `nlp`
    # cache that get_nlp() assigns to.
    pipeline = get_nlp()
    doc = pipeline(text)
    english_sentences = [sent.text.strip() for sent in doc.sents]

    final_sentences: list[str] = []

    # Chinese sentence terminators (full- and half-width), optionally
    # followed by one closing quote/bracket kept with its sentence.
    chinese_splitter = re.compile(r"(.*?[。!?;!?…]+[”’」』》】)]?)")

    for sent in english_sentences:
        parts = chinese_splitter.findall(sent)
        if parts:
            # Generator instead of a throwaway list for extend().
            final_sentences.extend(p.strip() for p in parts if p.strip())
            # Trailing text with no Chinese terminator (unfinished clause).
            rest = chinese_splitter.sub("", sent).strip()
            if rest:
                final_sentences.append(rest)
        else:
            # No Chinese terminator: keep the spaCy sentence as-is.
            final_sentences.append(sent)

    return final_sentences
if __name__ == '__main__':
    # Demo: Chinese quotes plus English abbreviations ("Mr.", "No. 5").
    sample = "小明说:“今天下雨了!”但Mr. Smith说: No. 5 road is dry. 还有一句……OK?"
    print(split_sentences(sample))