bugfix
This commit is contained in:
parent
5968a49099
commit
c4a2979887
48
filetxt/sentences.py
Normal file
48
filetxt/sentences.py
Normal file
@ -0,0 +1,48 @@
|
||||
import re
|
||||
import spacy
|
||||
|
||||
# Process-wide cache for the spaCy pipeline. It starts out empty so that
# importing this module stays cheap: the model is loaded by get_nlp() on first
# use rather than eagerly at import time (e.g. spacy.load("en_core_web_sm")).
nlp = None


def get_nlp():
    """Return the shared spaCy pipeline, loading it on the first call.

    The pipeline is stored in the module-level ``nlp`` variable, so later
    calls return the same object without loading the model again.

    Returns:
        The object returned by ``spacy.load`` for the path held in
        ``config.spacy_model_path``.
    """
    global nlp
    if nlp is None:
        # NOTE(review): getConfig is not imported in the visible part of this
        # file -- confirm it is brought into scope elsewhere, otherwise the
        # first call raises NameError.
        config = getConfig()
        nlp = spacy.load(config.spacy_model_path)
    return nlp
|
||||
|
||||
# A run of sentence-ending punctuation, optionally followed by one closing
# quote or bracket. Both half-width and full-width forms are accepted; the
# full-width ones are written as escapes so they survive re-encoding:
#   \uff01 full-width '!'   \uff1f full-width '?'
#   \uff1b full-width ';'   \uff09 full-width ')'
_SENTENCE_END = re.compile(
    r"[。!?;…\uff01\uff1f\uff1b]+[”’」』》】)\uff09]?"
)


def _split_on_terminators(sentence):
    """Cut *sentence* after every terminator run, keeping source order."""
    pieces = []
    start = 0
    for match in _SENTENCE_END.finditer(sentence):
        piece = sentence[start:match.end()].strip()
        if piece:
            pieces.append(piece)
        start = match.end()
    # Whatever follows the last terminator (or the whole string when there is
    # none) is still a sentence of its own.
    tail = sentence[start:].strip()
    if tail:
        pieces.append(tail)
    return pieces


def split_sentences(text: str):
    """Split mixed Chinese/English text into sentences.

    The text is first segmented by spaCy, which handles the English part.
    Each spaCy sentence is then cut further after Chinese-style sentence
    terminators, including a closing quote or bracket that directly follows
    the terminator.

    Args:
        text: The text to split.

    Returns:
        A list of non-empty, whitespace-stripped sentence strings in their
        original order. Empty or whitespace-only input yields an empty list
        without loading the spaCy model.
    """
    if not text or not text.strip():
        return []

    doc = get_nlp()(text)

    final_sentences = []
    for sent in doc.sents:
        # Slicing by match position (instead of ".*?" plus sub("")) keeps the
        # pieces in order even when the sentence contains line breaks.
        final_sentences.extend(_split_on_terminators(sent.text))
    return final_sentences
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Demo: mixed Chinese/English input with quotes, abbreviations and an
    # ellipsis; prints the list returned by split_sentences.
    text = "小明说:“今天下雨了!”但Mr. Smith说: No. 5 road is dry. 还有一句……OK?"
    print(split_sentences(text))
|
||||
|
||||
@ -1,27 +1,3 @@
|
||||
[project]
|
||||
name="filetxt"
|
||||
version = "0.0.1"
|
||||
description = "load documents from file and get its text"
|
||||
authors = [{ name = "yu moqing", email = "yumoqing@gmail.com" }]
|
||||
readme = "README.md"
|
||||
requires-python = ">=3.8"
|
||||
license = {text = "MIT"}
|
||||
dependencies = [
|
||||
"langchain_community",
|
||||
"pillow",
|
||||
"mobi",
|
||||
"html2text",
|
||||
"python-docx",
|
||||
"python-pptx",
|
||||
"openpyxl",
|
||||
"pypdf",
|
||||
"ebooklib ",
|
||||
"beautifulsoup4",
|
||||
]
|
||||
|
||||
[project.optional-dependencies]
|
||||
dev = ["pytest", "black", "mypy"]
|
||||
|
||||
[build-system]
|
||||
requires = ["setuptools>=61", "wheel"]
|
||||
build-backend = "setuptools.build_meta"
|
||||
|
||||
24
setup.cfg
Normal file
24
setup.cfg
Normal file
@ -0,0 +1,24 @@
|
||||
# Values in setup.cfg are taken literally, so they are written without quotes.
[metadata]
name = filetxt
version = 0.0.2
description = Document tools for loading text from files and splitting it into sentences
author = yu moqing
author_email = yumoqing@gmail.com
long_description = file: README.md
long_description_content_type = text/markdown
license = MIT

[options]
packages = find:
python_requires = >=3.8
# One requirement per line: list values are newline-separated, so a trailing
# comma would become part of the requirement string and make it invalid.
install_requires =
    spacy
    langchain_community
    pillow
    mobi
    html2text
    python-docx
    python-pptx
    openpyxl
    pypdf
    ebooklib
    beautifulsoup4
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user