This commit is contained in:
yumoqing 2025-09-19 13:20:39 +08:00
parent 5968a49099
commit c4a2979887
3 changed files with 72 additions and 24 deletions

48
filetxt/sentences.py Normal file
View File

@ -0,0 +1,48 @@
import re
import spacy
# Module-wide cache for the spaCy pipeline; stays None until get_nlp() is
# first called. (A stock model would be loaded with
# spacy.load("en_core_web_sm").)
nlp = None


def get_nlp():
    """Return the shared spaCy pipeline, loading it on first use.

    The model location is read from the ``spacy_model_path`` attribute of the
    object returned by ``getConfig()``; the loaded pipeline is stored in the
    module-level ``nlp`` variable and reused by later calls.

    NOTE(review): ``getConfig`` is not imported in the visible part of this
    file -- confirm where it is supposed to come from.
    """
    global nlp
    if nlp is not None:
        return nlp
    model_path = getConfig().spacy_model_path
    nlp = spacy.load(model_path)
    return nlp
def split_sentences(text: str):
    """Split mixed Chinese/English text into a list of sentences.

    The text is first segmented by the spaCy pipeline returned by
    ``get_nlp()``. Each spaCy sentence is then subdivided after every run of
    sentence terminators (full-width or half-width), optionally followed by a
    single closing quote or bracket.

    Args:
        text: The text to split.

    Returns:
        A list of stripped, non-empty sentence strings, in the order in which
        they occur in ``text``.
    """
    pipeline = get_nlp()
    doc = pipeline(text)

    # Sentence terminators: ideographic full stop, ASCII ! ? ;, ellipsis and
    # the full-width forms of ! ? ; (U+FF01, U+FF1F, U+FF1B), optionally
    # followed by one closing quote/bracket (including full-width ")", U+FF09).
    # The full-width characters are written as escapes so they survive any
    # width normalisation of this source file.
    terminator = re.compile(
        r"(.*?[。!?;…\uff01\uff1f\uff1b]+[”’」』》】)\uff09]?)"
    )

    sentences = []
    for span in doc.sents:
        sent = span.text.strip()
        cursor = 0
        for match in terminator.finditer(sent):
            # "." does not cross newlines, so the regex can skip text that
            # precedes a match; emit that text first to keep document order
            # instead of gluing all leftovers together at the end.
            for piece in (sent[cursor:match.start()], match.group(0)):
                piece = piece.strip()
                if piece:
                    sentences.append(piece)
            cursor = match.end()
        # Whatever follows the last terminator (or the whole sentence when no
        # terminator was found); blank remainders are dropped.
        tail = sent[cursor:].strip()
        if tail:
            sentences.append(tail)
    return sentences
if __name__ == '__main__':
    # Demo: split a sample that mixes Chinese and English and print the
    # resulting list of sentences.
    sample = "小明说“今天下雨了”但Mr. Smith说: No. 5 road is dry. 还有一句……OK?"
    result = split_sentences(sample)
    print(result)

View File

@ -1,27 +1,3 @@
[project]
name="filetxt"
version = "0.0.1"
description = "load documents from file and get it's text"
authors = [{ name = "yu moqing", email = "yumoqing@gmail.com" }]
readme = "README.md"
requires-python = ">=3.8"
license = {text = "MIT"}
dependencies = [
"langchain_community",
"pillow",
"mobi",
"html2text",
"python-docx",
"python-pptx",
"openpyxl",
"pypdf",
"ebooklib ",
"beautifulsoup4",
]
[project.optional-dependencies]
dev = ["pytest", "black", "mypy"]
[build-system]
requires = ["setuptools>=61", "wheel"]
build-backend = "setuptools.build_meta"

24
setup.cfg Normal file
View File

@ -0,0 +1,24 @@
[metadata]
# Values in setup.cfg are taken literally, so they are written without quotes.
name = filetxt
version = 0.0.2
description = a document tools for loading document from file and split sentences
author = yu moqing
author_email = yumoqing@gmail.com
# setup.cfg has no "readme" key; the README is supplied as the long description.
long_description = file: README.md
long_description_content_type = text/markdown
license = MIT
[options]
packages = find:
# The setuptools key is "python_requires"; the specifier is written unquoted.
python_requires = >=3.8
# One requirement per indented line, without trailing commas.
install_requires =
    spacy
    langchain_community
    pillow
    mobi
    html2text
    python-docx
    python-pptx
    openpyxl
    pypdf
    ebooklib
    beautifulsoup4