filetxt/filetxt/sentences.py
2025-09-19 13:20:39 +08:00

49 lines
1.2 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import re
import spacy
# Module-level cache for the spaCy pipeline. It starts out empty and is filled
# in by get_nlp() on first use, so importing this module does not load a model.
nlp = None
# Disabled eager load, kept for reference: spacy.load("en_core_web_sm")
def get_nlp():
    """Return the shared spaCy pipeline, loading it on the first call.

    The loaded pipeline is stored in the module-level ``nlp`` variable, so
    later calls return the same object without loading the model again.

    Returns:
        The object returned by ``spacy.load`` for the configured model path.
    """
    global nlp
    if nlp is not None:
        # Already loaded by an earlier call.
        return nlp

    # NOTE(review): ``getConfig`` is not imported in the visible part of this
    # file -- confirm where it comes from, otherwise the first call raises
    # NameError. It is expected to expose a ``spacy_model_path`` attribute.
    model_path = getConfig().spacy_model_path
    nlp = spacy.load(model_path)
    return nlp
# Sentence-final punctuation. Non-ASCII characters are written as escapes so
# the full-width forms cannot be confused with, or silently normalised into,
# their ASCII look-alikes.
_SENTENCE_ENDERS = (
    "\u3002"              # ideographic full stop
    "\uff01\uff1f\uff1b"  # full-width exclamation mark, question mark, semicolon
    "!?;"                 # ASCII exclamation mark, question mark, semicolon
    "\u2026"              # horizontal ellipsis
)

# Closing quotes/brackets that may directly follow the final punctuation and
# still belong to the sentence they close.
_SENTENCE_CLOSERS = (
    "\u201d\u2019"        # right double / single curly quote
    "\u300d\u300f"        # right corner bracket / right white corner bracket
    "\u300b\u3011"        # right double angle bracket / right lenticular bracket
    "\uff09)"             # full-width and ASCII right parenthesis
)

# Compiled once at import time instead of on every call. ``.`` deliberately
# does not match newlines, so a piece never spans a line break.
_CHINESE_SENTENCE_RE = re.compile(
    ".*?[" + _SENTENCE_ENDERS + "]+[" + _SENTENCE_CLOSERS + "]?"
)


def _split_on_terminators(sentence):
    """Split one sentence on sentence-final punctuation, keeping text order."""
    pieces = []
    cursor = 0
    for match in _CHINESE_SENTENCE_RE.finditer(sentence):
        # Text the pattern skipped over (for example a line with no final
        # punctuation) is emitted where it occurs rather than being glued
        # together and appended after every matched piece.
        skipped = sentence[cursor:match.start()].strip()
        if skipped:
            pieces.append(skipped)
        # A match always contains at least one terminator, so it is never
        # empty after stripping.
        pieces.append(match.group().strip())
        cursor = match.end()
    remainder = sentence[cursor:].strip()
    if remainder:
        pieces.append(remainder)
    return pieces


def split_sentences(text: str) -> list:
    """Split mixed Chinese/English text into sentences.

    The text is first segmented by the spaCy pipeline from ``get_nlp()``.
    Each resulting sentence is then split further on sentence-final
    punctuation (full-width and ASCII), keeping one optional closing quote
    or bracket attached to the sentence it ends.

    Args:
        text: The text to split.

    Returns:
        A list of non-empty, whitespace-stripped sentences in the order they
        appear in ``text``. Empty or whitespace-only input yields ``[]``.
    """
    if not text or not text.strip():
        # Nothing to segment; also avoids loading the model for blank input.
        return []

    pipeline = get_nlp()
    sentences = []
    for span in pipeline(text).sents:
        sentences.extend(_split_on_terminators(span.text))
    return sentences
if __name__ == "__main__":
    # Demo: a mixed Chinese/English sample with quotes, abbreviations and an
    # ellipsis; prints the list of sentences produced for it.
    sample = "小明说“今天下雨了”但Mr. Smith说: No. 5 road is dry. 还有一句……OK?"
    sentences = split_sentences(sample)
    print(sentences)