bugfix
This commit is contained in:
parent
5968a49099
commit
c4a2979887
48
filetxt/sentences.py
Normal file
48
filetxt/sentences.py
Normal file
@ -0,0 +1,48 @@
|
|||||||
|
import re
|
||||||
|
import spacy
|
||||||
|
|
||||||
|
nlp = None
|
||||||
|
# spacy.load("en_core_web_sm")
|
||||||
|
|
||||||
|
def get_nlp():
    """Return the shared spaCy pipeline, loading it on first use.

    The pipeline is cached in the module-level ``nlp`` variable; later calls
    return that cached object without calling ``spacy.load`` again. The model
    location is read from ``spacy_model_path`` on the object returned by
    ``getConfig()``.

    NOTE(review): ``getConfig`` is not imported anywhere in the visible part
    of this module -- confirm it is brought into scope, otherwise the first
    call raises ``NameError``.
    """
    global nlp
    # Fast path: the pipeline was already loaded by an earlier call.
    if nlp is not None:
        return nlp
    config = getConfig()
    nlp = spacy.load(config.spacy_model_path)
    return nlp
|
||||||
|
|
||||||
|
# A sentence ends at a run of terminator characters, optionally followed by
# one closing quote/bracket. Half-width terminators are written literally;
# the full-width forms are written as escapes so they survive text
# normalisation: \uff01 = full-width '!', \uff1f = full-width '?',
# \uff1b = full-width ';', \uff09 = full-width ')'.
_SENTENCE_END = re.compile(
    r".*?[。!?;\uff01\uff1f\uff1b…]+[”’」』》】)\uff09]?"
)


def _split_on_terminators(sent):
    """Split one segment after each terminator run, preserving text order."""
    pieces = []
    cursor = 0
    for match in _SENTENCE_END.finditer(sent):
        # `.` does not cross newlines, so a match can start past `cursor`;
        # keep the skipped text at its original position instead of
        # pushing it to the end.
        pieces.append(sent[cursor:match.start()])
        pieces.append(match.group())
        cursor = match.end()
    # Whatever follows the last terminator (or the whole segment when there
    # is no terminator at all).
    pieces.append(sent[cursor:])
    stripped = (piece.strip() for piece in pieces)
    return [piece for piece in stripped if piece]


def split_sentences(text: str):
    """Split mixed Chinese/English text into sentences.

    The text is first segmented by the spaCy pipeline returned by
    ``get_nlp()``. Each resulting segment is then split further after every
    run of sentence-ending punctuation (half-width and full-width), keeping
    one trailing closing quote or bracket attached to its sentence.

    Args:
        text: The text to split.

    Returns:
        A list of non-empty, whitespace-stripped sentences in their original
        order. Empty input yields an empty list without loading the spaCy
        model.
    """
    if not text:
        return []

    doc = get_nlp()(text)

    final_sentences = []
    for sent in doc.sents:
        final_sentences.extend(_split_on_terminators(sent.text))
    return final_sentences
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # Demo: run the splitter on a mixed Chinese/English sample that contains
    # quoted speech, English abbreviations and an ellipsis, then print the
    # resulting list of sentences.
    text = "小明说:“今天下雨了!”但Mr. Smith说: No. 5 road is dry. 还有一句……OK?"
    sentences = split_sentences(text)
    print(sentences)
|
||||||
|
|
||||||
@ -1,27 +1,3 @@
|
|||||||
[project]
|
|
||||||
name="filetxt"
|
|
||||||
version = "0.0.1"
|
|
||||||
description = "load documents from file and get its text"
|
|
||||||
authors = [{ name = "yu moqing", email = "yumoqing@gmail.com" }]
|
|
||||||
readme = "README.md"
|
|
||||||
requires-python = ">=3.8"
|
|
||||||
license = {text = "MIT"}
|
|
||||||
dependencies = [
|
|
||||||
"langchain_community",
|
|
||||||
"pillow",
|
|
||||||
"mobi",
|
|
||||||
"html2text",
|
|
||||||
"python-docx",
|
|
||||||
"python-pptx",
|
|
||||||
"openpyxl",
|
|
||||||
"pypdf",
|
|
||||||
"ebooklib ",
|
|
||||||
"beautifulsoup4",
|
|
||||||
]
|
|
||||||
|
|
||||||
[project.optional-dependencies]
|
|
||||||
dev = ["pytest", "black", "mypy"]
|
|
||||||
|
|
||||||
[build-system]
|
[build-system]
|
||||||
requires = ["setuptools>=61", "wheel"]
|
requires = ["setuptools>=61", "wheel"]
|
||||||
build-backend = "setuptools.build_meta"
|
build-backend = "setuptools.build_meta"
|
||||||
|
|||||||
24
setup.cfg
Normal file
24
setup.cfg
Normal file
@ -0,0 +1,24 @@
|
|||||||
|
# Declarative setuptools configuration for the filetxt package.
# Values are written without surrounding quotes: INI values are taken
# literally, so quotes would become part of the value.
[metadata]
name = filetxt
version = 0.0.2
description = a document tool for loading documents from files and splitting sentences
author = yu moqing
author_email = yumoqing@gmail.com
# setup.cfg has no "readme" key; the long description is read from the file.
long_description = file: README.md
long_description_content_type = text/markdown
license = MIT

[options]
packages = find:
# The option is named python_requires; "requires_python" is not recognised.
python_requires = >=3.8
# One requirement per line, no trailing commas.
install_requires =
    spacy
    langchain_community
    pillow
    mobi
    html2text
    python-docx
    python-pptx
    openpyxl
    pypdf
    ebooklib
    beautifulsoup4
|
||||||
|
|
||||||
Loading…
x
Reference in New Issue
Block a user