From c4a2979887721f89de99d27e5734154f59f68e69 Mon Sep 17 00:00:00 2001
From: yumoqing
Date: Fri, 19 Sep 2025 13:20:39 +0800
Subject: [PATCH] bugfix

---
Review notes (this commentary area is ignored when the patch is applied):
line breaks and indentation were restored from a whitespace-collapsed copy
of this patch, so re-check them against the tree before applying.  The
"index" hashes below predate the revised content and are stale; regenerate
the patch with git format-patch once the revised files are committed.

 filetxt/sentences.py | 52 ++++++++++++++++++++++++++++++++++++++++++++
 pyproject.toml       | 24 --------------------
 setup.cfg            | 25 +++++++++++++++++++++
 3 files changed, 77 insertions(+), 24 deletions(-)
 create mode 100644 filetxt/sentences.py
 create mode 100644 setup.cfg

diff --git a/filetxt/sentences.py b/filetxt/sentences.py
new file mode 100644
index 0000000..97248cd
--- /dev/null
+++ b/filetxt/sentences.py
@@ -0,0 +1,52 @@
+"""Sentence splitting for mixed Chinese/English text."""
+import re
+
+import spacy
+
+# Shared spaCy pipeline; loaded on first use by get_nlp().
+nlp = None
+
+# One sentence: text up to a run of sentence-final punctuation (full-width
+# and half-width forms) plus an optional closing quote/bracket, or -- when
+# no terminator is left -- the trailing remainder.  DOTALL keeps text that
+# contains a line break in one piece and in its original position.
+_SENTENCE_RE = re.compile(
+    r".*?[。\uff01\uff1f\uff1b!?;…]+[”’」』》】\uff09)]?|.+",
+    re.DOTALL,
+)
+
+
+def get_nlp():
+    """Return the shared spaCy pipeline, loading it on first use.
+
+    The model is loaded from ``config.spacy_model_path``.
+    """
+    global nlp
+    if nlp is None:
+        # NOTE(review): getConfig is not imported in this module; it must be
+        # brought into scope from the project's configuration helper.
+        config = getConfig()
+        nlp = spacy.load(config.spacy_model_path)
+    return nlp
+
+
+def split_sentences(text: str):
+    """Split mixed Chinese/English text into sentences.
+
+    spaCy segments the text first; each spaCy sentence is then subdivided
+    at Chinese sentence-final punctuation, keeping a closing quote or
+    bracket attached to the sentence it ends.
+
+    Returns the non-empty, stripped sentences in document order.
+    """
+    final_sentences = []
+    for sent in get_nlp()(text).sents:
+        pieces = (piece.strip() for piece in _SENTENCE_RE.findall(sent.text))
+        final_sentences.extend(piece for piece in pieces if piece)
+    return final_sentences
+
+
+if __name__ == '__main__':
+    # Example
+    text = "小明说:“今天下雨了!”但Mr. Smith说: No. 5 road is dry. 还有一句……OK?"
+    print(split_sentences(text))
diff --git a/pyproject.toml b/pyproject.toml
index a5b6629..59514a1 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,27 +1,3 @@
-[project]
-name="filetxt"
-version = "0.0.1"
-description = "load documents from file and get it's text"
-authors = [{ name = "yu moqing", email = "yumoqing@gmail.com" }]
-readme = "README.md"
-requires-python = ">=3.8"
-license = {text = "MIT"}
-dependencies = [
-    "langchain_community",
-    "pillow",
-    "mobi",
-    "html2text",
-    "python-docx",
-    "python-pptx",
-    "openpyxl",
-    "pypdf",
-    "ebooklib ",
-    "beautifulsoup4",
-]
-
-[project.optional-dependencies]
-dev = ["pytest", "black", "mypy"]
-
 [build-system]
 requires = ["setuptools>=61", "wheel"]
 build-backend = "setuptools.build_meta"
diff --git a/setup.cfg b/setup.cfg
new file mode 100644
index 0000000..0044f56
--- /dev/null
+++ b/setup.cfg
@@ -0,0 +1,25 @@
+[metadata]
+name=filetxt
+version = 0.0.2
+description = a document tools for loading document from file and split sentences
+author = yu moqing
+author_email = yumoqing@gmail.com
+long_description = file: README.md
+long_description_content_type = text/markdown
+license = MIT
+
+[options]
+packages = find:
+python_requires = >=3.8
+install_requires =
+    spacy
+    langchain_community
+    pillow
+    mobi
+    html2text
+    python-docx
+    python-pptx
+    openpyxl
+    pypdf
+    ebooklib
+    beautifulsoup4