This commit is contained in:
yumoqing 2025-07-28 18:15:10 +08:00
parent 47a4a2f606
commit cd8e02687d
29 changed files with 1 additions and 1116 deletions

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@ -1,57 +0,0 @@
import os
from datetime import datetime
from langchain_community.document_loaders.csv_loader import CSVLoader
from langchain_community.document_loaders.text import TextLoader
from langchain_community.document_loaders import UnstructuredPDFLoader
from langchain_community.document_loaders import UnstructuredWordDocumentLoader
from langchain_community.document_loaders import UnstructuredExcelLoader
from langchain_community.document_loaders import UnstructuredPowerPointLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from appPublic.log import debug
from appPublic.uniqueID import getID
from get_vector_db import get_vector_db
TEMP_FOLDER = os.getenv('TEMP_FOLDER', './_temp')
# Function to check whether the uploaded file has a supported extension
def allowed_file(filename):
    """Return True if *filename* ends in a supported document extension.

    The extension is whatever follows the last ``.`` in *filename* and is
    compared case-insensitively. Names that contain no ``.`` are rejected.
    """
    allowed_suffixes = frozenset((
        'pdf', 'doc', 'docx', 'xlsx', 'xls', 'ppt', 'pptx', 'csv', 'txt',
    ))
    # rpartition yields an empty separator when there is no '.' at all.
    _, dot, suffix = filename.rpartition('.')
    return bool(dot) and suffix.lower() in allowed_suffixes
# Function to load a supported document and split it into text chunks
def load_and_split_data(file_path, chunk_size=7500, chunk_overlap=100):
    """Load the document at *file_path* and split it into chunks.

    The loader is picked from the file extension (case-insensitive):
    PDF, Word (.docx/.doc), PowerPoint (.pptx/.ppt), Excel (.xlsx/.xls) and
    CSV each get their dedicated loader; anything else is read with
    ``TextLoader``.

    Args:
        file_path: Path of the document to load.
        chunk_size: ``chunk_size`` passed to RecursiveCharacterTextSplitter.
        chunk_overlap: ``chunk_overlap`` passed to
            RecursiveCharacterTextSplitter.

    Returns:
        The result of ``split_documents`` applied to the loaded documents.
    """
    lowered = file_path.lower()
    if lowered.endswith('.pdf'):
        loader = UnstructuredPDFLoader(file_path=file_path)
    elif lowered.endswith(('.docx', '.doc')):
        loader = UnstructuredWordDocumentLoader(file_path=file_path)
    elif lowered.endswith(('.pptx', '.ppt')):
        # Legacy .ppt has to be listed explicitly, otherwise it falls
        # through to the plain-text loader below.
        loader = UnstructuredPowerPointLoader(file_path=file_path)
    elif lowered.endswith(('.xlsx', '.xls')):
        loader = UnstructuredExcelLoader(file_path=file_path)
    elif lowered.endswith('.csv'):
        loader = CSVLoader(file_path=file_path)
    else:
        loader = TextLoader(file_path=file_path)

    documents = loader.load()
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(documents)
# Main function to handle the embedding process
def embed(file_path, userid, kdbname):
    """Chunk the document at *file_path* and add the chunks to a vector db.

    Returns False, without touching the database, when ``allowed_file``
    rejects *file_path*. Otherwise the chunks' text and metadata are passed
    to ``add`` on the object returned by ``get_vector_db(userid, kdbname)``,
    each with a fresh id from ``getID()``, and True is returned.
    """
    if not allowed_file(file_path):
        return False

    chunks = load_and_split_data(file_path)
    debug(f'{chunks=}')
    store = get_vector_db(userid, kdbname)
    texts = [chunk.page_content for chunk in chunks]
    metas = [chunk.metadata for chunk in chunks]
    chunk_ids = [getID() for _ in chunks]
    store.add(documents=texts, metadatas=metas, ids=chunk_ids)
    return True

View File

View File

@ -1,22 +0,0 @@
from ahserver.serverenv import ServerEnv
from ahserver.configuredServer import ConfiguredServer
from ahserver.webapp import webapp
from appPublic.worker import awaitify
from filemgr.init import load_filemgr
from rbac.init import load_rbac
from appbase.init import load_appbase
from rag.init import load_rag
def get_module_dbname(name):
    """Return the database name for module *name*.

    The same name, ``'sage'``, is returned for every module; *name* is
    accepted but not consulted.
    """
    dbname = 'sage'
    return dbname
def init():
    """Run the module loaders and publish ``get_module_dbname``.

    Calls ``load_rag``, ``load_appbase`` and ``load_filemgr`` in that order,
    then stores ``get_module_dbname`` as an attribute on ``ServerEnv()``.
    """
    for load_module in (load_rag, load_appbase, load_filemgr):
        load_module()
    server_env = ServerEnv()
    server_env.get_module_dbname = get_module_dbname


# Script entry point: hand the initialiser to webapp.
if __name__ == '__main__':
    webapp(init)

Binary file not shown.

File diff suppressed because one or more lines are too long

Binary file not shown.

Binary file not shown.

View File

@ -1 +0,0 @@
开元云北京科技有限公司是一家注册于2020年的高科技企业,在上海、南京、深圳、济南等地设有分支机构。创始团队核心成员来自一流的云计算公司及电信运营商,拥有云计算、超算、智算和网络运营专业经验,在企业市场均拥有超过十年的行业经验,服务客户超过2万家。公司以自主研发的业务操作支撑系统KBoss为底座,打造开放算力应用服务平台open-computing,将云计算、算力资源和算力应用进行整合,为高校、科研、大模型、AI等政企客户提供专业算力云服务,形成“云+网+算+应用”的一体化解决方案。在2021年,我们荣幸地成为阿里云计算的合作伙伴,致力于提供算力应用、算力网络、算网一体的产品和服务,同时为芯片、教育科研等企业提供优质的算力服务。2022年,我们与国家超级计算济南中心以及中信网络有限公司签署了战略合作协议,并成功推出了“Kboss”算网平台。在2023年,我们的平台进一步发展,成功引入火山引擎、百度智能云。目前,我们已成为阿里云、江苏未来网络集团的战略合作伙伴。同时,我们深耕“算力+教育”赛道,持续推进高校算力平台项目,积极建设学校算力网络节点,目前已经成功开拓了27所高校。公司提供新一代算力云应用服务模式,通过自主研发的开元算力云应用服务平台,整合算力资源和算法应用,利用创新算力调度化和确定性网络技术,针对现代社会对智能化和数字化需求,形成包括算力云服务、算力网络和算力应用的全场景解决方案,旨在为政府和企业提供“技术+资源+场景+运营”的产业互联网算力云应用服务平台,实现以算力云服务推动数字经济的发展。开元云科技自成立以来,得到了包括工信部、教育部、全国高校学会、国家超算中心以及南京未来网络研究院等政府机构、科研机构的大力支持,合作领域包括“东数西算、大科学计算、存算分离、芯算一体及国产工业软件SaaS化”,覆盖人工智能、芯片仿真、生物制药、工业仿真、材料研发、精尖制造、海洋勘探以及气象监测等高科技领域。

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@ -1,129 +0,0 @@
Metadata-Version: 2.4
Name: rag
Version: 0.0.1
Summary: rag
Home-page: https://github.com/yumoqing/rag
Author: yumoqing
Author-email: yumoqing@gmail.com
Platform: any
Classifier: Operating System :: OS Independent
Classifier: Programming Language :: Python :: 3
Classifier: License :: OSI Approved :: MIT License
Description-Content-Type: text/markdown
Requires-Dist: chromadb
Requires-Dist: langchain
Requires-Dist: langchain_community
Requires-Dist: unstructured
Requires-Dist: langchain-text-splitters
Requires-Dist: unstructured[all-docs]
Requires-Dist: langchain_milvus
Requires-Dist: langchain_huggingface
Requires-Dist: transformers
Requires-Dist: openai
Requires-Dist: torch
Requires-Dist: torchvision
Requires-Dist: pymilvus
Dynamic: author
Dynamic: author-email
Dynamic: classifier
Dynamic: description
Dynamic: description-content-type
Dynamic: home-page
Dynamic: platform
Dynamic: requires-dist
Dynamic: summary
# 知识库服务器
本系统为不同的客户提供自我管理的知识库,并在知识库基础上提供知识检索。
本系统以API形式为注册的服务器提供知识服务支持,不面向最终客户。
## 依赖
依赖[这些模块](requirements.txt)
## 安装部署
1. 创建rag用户
2. 登录rag用户
3. 执行以下命令
```
git clone git@git.kaiyuancloud.cn:yumoqing/rag
cd rag/script
./install.sh
```
注意:请将项目checkout到用户根目录下。
## 功能
管理client系统的客户知识库并提供知识查询
每个客户可以创建一到多个独立的知识库,为不同的业务场景提供知识库知识
知识库之间数据相互独立,互不干扰。
## http API
### add
增加知识库文档
#### path
/api/add
#### method
POST
#### 输入
name: authentication
value: Bearer ${apikey}
score: headers
name: file_name
value: path of uploaded file
score: data
name: userid
value: userid of client system
score: data
name: kdbname
value: rag kdb name
score: data
#### 输出
### query
查询知识库
#### path
/api/query
#### method
POST
#### 输入
name: authentication
value: Bearer ${apikey}
score: headers
name: prompt
value: ${prompt}
score: data
name: userid
value: ${userid}
score: data
name: kdbname
value: ${kdbname}
score: data
#### 输出
```
{
    total: 返回记录条数,
    rows: 返回记录内容
}
rows有以下属性:
content: 文本内容
distances: 距离
source: 文档path
```

View File

@ -1,16 +0,0 @@
README.md
setup.py
rag/__init__.py
rag/deletefile.py
rag/embed.py
rag/init.py
rag/kdb.py
rag/query.py
rag/rag.bak.py
rag/vector.py
rag/version.py
rag.egg-info/PKG-INFO
rag.egg-info/SOURCES.txt
rag.egg-info/dependency_links.txt
rag.egg-info/requires.txt
rag.egg-info/top_level.txt

View File

@ -1 +0,0 @@

View File

@ -1,13 +0,0 @@
chromadb
langchain
langchain_community
unstructured
langchain-text-splitters
unstructured[all-docs]
langchain_milvus
langchain_huggingface
transformers
openai
torch
torchvision
pymilvus

View File

@ -1 +0,0 @@
rag

View File

@ -1,15 +1,7 @@
from appPublic.worker import awaitify from appPublic.worker import awaitify
from ahserver.serverenv import ServerEnv from ahserver.serverenv import ServerEnv
from .kdb import add_kdb, add_dir, add_doc, get_all_docs
from .query import search_query
from .embed import embed
def load_rag(): def load_rag():
env = ServerEnv() env = ServerEnv()
env.add_kdb = add_kdb
env.query = awaitify(search_query)
env.embed = awaitify(embed)
env.add_dir = add_dir
env.add_doc = add_doc
env.get_all_docs = get_all_docs

View File

@ -1,52 +0,0 @@
#!/bin/bash

# Install the rag service definition and its Nginx site configuration.
# Expects rag.service and rag.nginx in the current working directory.

# Check the operating system
OS=$(uname -s)
if [[ "$OS" != "Darwin" && "$OS" != "Linux" ]]; then
    echo "错误:此脚本仅支持 macOS 和 Linux"
    exit 1
fi

# On Linux everything below writes under /etc, which needs root, so the
# commands are prefixed with sudo (as the systemd steps already were).
# NOTE(review): assumes the macOS Nginx prefix is writable by the current
# user, as before — confirm.
SUDO=""
if [[ "$OS" == "Linux" ]]; then
    SUDO="sudo"
fi

# Check the required files
SERVICE_FILE="rag.service"
NGINX_FILE="rag.nginx"
if [[ ! -f "$SERVICE_FILE" || ! -f "$NGINX_FILE" ]]; then
    echo "错误:缺少 $SERVICE_FILE 或 $NGINX_FILE 文件"
    exit 1
fi

# 1. Configure the service
if [[ "$OS" == "Darwin" ]]; then
    # macOS: use launchd
    mkdir -p ~/Library/LaunchAgents
    cp "$SERVICE_FILE" ~/Library/LaunchAgents/
    launchctl load ~/Library/LaunchAgents/"$SERVICE_FILE"
    launchctl start "$SERVICE_FILE"
elif [[ "$OS" == "Linux" ]]; then
    # Linux: use systemd
    sudo cp "$SERVICE_FILE" /etc/systemd/system/
    sudo systemctl daemon-reload
    sudo systemctl enable "$SERVICE_FILE"
    sudo systemctl start "$SERVICE_FILE"
fi

# 2. Configure Nginx (install it first when it is missing)
if ! command -v nginx &> /dev/null; then
    echo "安装 Nginx..."
    if [[ "$OS" == "Darwin" ]]; then
        brew install nginx
    elif [[ "$OS" == "Linux" ]]; then
        sudo apt-get update && sudo apt-get install -y nginx
    fi
fi

# Pick the Nginx configuration directory for this platform
NGINX_CONF_DIR="/etc/nginx/sites-enabled"
if [[ "$OS" == "Darwin" ]]; then
    NGINX_CONF_DIR="/usr/local/etc/nginx/sites-enabled"
fi

$SUDO mkdir -p "$NGINX_CONF_DIR"
$SUDO cp "$NGINX_FILE" "$NGINX_CONF_DIR/"

# Validate the configuration before reloading; a failure is reported but
# does not abort the script.
if ! { $SUDO nginx -t && $SUDO nginx -s reload; }; then
    echo "错误:Nginx 配置重载失败"
fi

echo "安装完成!"

View File

@ -1,20 +0,0 @@
#!/bin/sh

# Kill (SIGKILL) every process whose `ps -ef` line matches the pattern in $1.

if [ -z "$1" ]; then
    echo "错误:请提供进程名称"
    exit 1
fi

# Find the matching processes.
# This script's own command line contains the pattern (it is $1), so its PID
# and anything it spawned (PPID == $$, e.g. the command-substitution
# subshell) are filtered out; otherwise the loop below would kill -9 the
# script itself. In `ps -ef` output column 2 is the PID and column 3 the PPID.
PIDS=$(ps -ef | grep -e "$1" | grep -v grep \
    | awk -v self="$$" '$2 != self && $3 != self {print $2}')

if [ -z "$PIDS" ]; then
    echo "未找到匹配的进程:$1"
    exit 0
fi

for PID in $PIDS; do
    echo "终止进程 $PID"
    kill -9 "$PID"
done

exit 0

View File

@ -1,31 +0,0 @@
# Reverse proxy for the RAG application listening on localhost:10098.
server {
    listen 80;
    server_name rag.opencomputing.cn;
    autoindex on;
    client_max_body_size 20m;

    proxy_set_header X-Forwarded-Host $host;
    proxy_set_header X-Forwarded-server $host;
    proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
    proxy_set_header X-Forwarded-Scheme $scheme;
    proxy_set_header X-Forwarded-Port $server_port;
    proxy_set_header X-Forwarded-Url "$scheme://$host:$server_port$request_uri";
    index index.html index.htm;

    # Echo the caller's address.
    location ~^/ip$ {
        return 200 "$remote_addr";
    }

    location / {
        # Emit the CORS header exactly once; a repeated add_header sends the
        # header twice, which browsers reject as multiple values.
        add_header Access-Control-Allow-Origin *;

        # NOTE(review): nginx inherits proxy_set_header from the server
        # level only when a location defines none of its own, so the
        # X-Forwarded-Url set above is not sent for this location —
        # confirm whether the backend needs it.
        proxy_set_header X-Forwarded-Host $host;
        proxy_set_header X-Forwarded-server $host;
        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
        proxy_set_header X-Forwarded-Scheme $scheme;
        proxy_set_header X-Forwarded-Port $server_port;
        proxy_set_header X-real-ip $remote_addr;

        proxy_send_timeout 600s;
        proxy_read_timeout 600s;
        proxy_pass http://localhost:10098/;
    }
}

View File

@ -1,19 +0,0 @@
[Unit]
Description=RAG Service
# Purpose: RAG service to control RAG application.
# (Not given as Documentation= because that option only accepts URIs.)
After=network.target nginx.service
Requires=nginx.service

[Service]
User=wangmeihua
Group=wangmeihua
# Type=forking
WorkingDirectory=/share/wangmeihua/rag
ExecStart=/bin/bash /share/wangmeihua/rag/script/rag.sh
ExecStop=/bin/bash /share/wangmeihua/rag/script/killname app/ragapp.py
Restart=on-failure
# NOTE(review): presumably /var/log/rag must already exist and be writable
# for these append: targets — confirm on the target host.
StandardOutput=append:/var/log/rag/rag.log
StandardError=append:/var/log/rag/error.log

[Install]
WantedBy=multi-user.target

View File

@ -1,18 +0,0 @@
#!/bin/bash

# (Re)start the RAG application: stop any running instance, then launch a
# new one in the foreground.

PYTHON=python3
RAG_DIR="/d/wangmeihua/rag"
RAG_PY="$RAG_DIR/app/ragapp.py"
# Not referenced below; kept for anything that reads it from this script.
LOG_DIR="$RAG_DIR/logs"

# Make sure the application file exists
if [[ ! -f "$RAG_PY" ]]; then
    echo "错误:$RAG_PY 不存在"
    exit 1
fi

# Stop the old process (quoted so the path is always passed as one argument)
"$RAG_DIR/script/killname" "$RAG_PY"

# Start the new process
"$PYTHON" "$RAG_PY" -w "$RAG_DIR"

View File

@ -1,46 +0,0 @@
#!/bin/bash

# Prepare the Python environment for the RAG application: create a virtual
# environment, hook it into .bashrc, install the requirements and install
# the killname helper into ${HOME_DIR}/bin.

HOME_DIR="/share/wangmeihua"
RAG_DIR="/share/wangmeihua/rag"
PYTHON_VERSION="python3"
VENV_DIR="${HOME_DIR}/py3"
BASHRC="${HOME_DIR}/.bashrc"

# Check that Python 3 is available
if ! command -v "$PYTHON_VERSION" &> /dev/null; then
    echo "错误:未找到 Python3"
    exit 1
fi

# Check requirements.txt
if [[ ! -f "${RAG_DIR}/requirements.txt" ]]; then
    echo "错误:${RAG_DIR}/requirements.txt 不存在"
    exit 1
fi

# Create and activate the virtual environment
mkdir -p "${HOME_DIR}/bin"
if ! "$PYTHON_VERSION" -m venv "$VENV_DIR"; then
    echo "错误:虚拟环境创建失败"
    exit 1
fi
source "${VENV_DIR}/bin/activate"

# Configure the environment in .bashrc, once only. Re-running the script
# must neither append the same lines again nor overwrite the backup with
# an already-modified .bashrc, so both steps are skipped when the
# activation line is already present.
ACTIVATE_LINE="source \"${VENV_DIR}/bin/activate\""
if ! grep -qsF "$ACTIVATE_LINE" "$BASHRC"; then
    # Back up .bashrc
    if [[ -f "$BASHRC" ]]; then
        cp "$BASHRC" "${BASHRC}.bak"
    fi
    cat >> "$BASHRC" << EOF
export PATH="${HOME_DIR}/bin:${HOME_DIR}/py3/bin:\$PATH"
source "${HOME_DIR}/py3/bin/activate"
EOF
fi

# Install the dependencies
if ! pip install -r "${RAG_DIR}/requirements.txt"; then
    echo "错误:依赖安装失败"
    exit 1
fi

# Copy killname (from the current directory) and make it executable
cp killname "${HOME_DIR}/bin"
chmod +x "${HOME_DIR}/bin/killname"

echo "环境配置完成!"

View File

@ -1,41 +0,0 @@
谷歌 industry 搜索引擎 org concept
知识图谱 Web 3.0 万维网 concept media
Web is a list of 网的 unk time
自顶向下 百科类网站 结构化数据源 concept media
结构化数据 <org> 关系数据库 concept media
非结构化数据 subclass of XML concept org
模式层 subclass of 知识图谱 concept media
结构化知识库 subclass of 知识图谱 concept misc
比尔盖茨 employer 微软 per org
5 信息抽取 facet of 数据层 media concept
信息抽取 part of 知识图谱 concept media
实体识别 subclass of 信息抽取 concept media
实体分类体系 part of 112种实体类别 concept misc
分类研究 实体类别 面向开放域的实体识别 concept media
服务器日志 特征建模 搜索引擎 concept org
关系抽取 subclass of Relation Extraction concept unk
模式匹配 实体 语料 concept media
属性抽取 <misc> 统计机器学习 concept media
属性 subclass of 实体 concept misc
数据挖掘 subclass of 结构化数据 concept media
拼图碎片 非结构化 信息抽取 concept media
歧义 used by 实体消歧 concept media
共指消解 自然语言处理 信息检索 concept misc
外部知识库 结构化数据 知识图谱 concept media
数据层的融合 模式层 关系数据库 concept media
资源描述框架 <media> 本体构建本体 concept org
DB2RDF subclass of 结构化的历史数据 cel date
自动化本体构建过程 本体库 数据驱动的自动化方式 concept media
阿里 owned by 阿里巴巴 org media
上下位关系 阿里巴巴 图谱 concept media
腾讯 owned by 阿里巴巴 org concept
知识图谱 location 城市 concept loc
串联 规则 推理策略的一环 concept media
算法 part of 知识库 concept media
知识库的更新 subclass of 概念层 concept media
知识图谱 part of 数据层 concept media
总结 part of 知识图谱 concept media
知识图谱 移动个人助理(Siri 智能语义搜索 concept media
(Sri) subclass of 的知识 eve unk
病毒 知识图谱 埃博拉病毒的症状有哪些 concept media
症状 part of 三元组 concept misc

View File

@ -1,41 +0,0 @@
谷歌 industry 搜索引擎 org concept
知识图谱 Web 3.0 万维网 concept media
Web is a list of 网的 unk time
自顶向下 百科类网站 结构化数据源 concept media
结构化数据 <org> 关系数据库 concept media
非结构化数据 subclass of XML concept org
模式层 subclass of 知识图谱 concept media
结构化知识库 subclass of 知识图谱 concept misc
比尔盖茨 employer 微软 per org
5 信息抽取 facet of 数据层 media concept
信息抽取 part of 知识图谱 concept media
实体识别 subclass of 信息抽取 concept media
实体分类体系 part of 112种实体类别 concept misc
分类研究 实体类别 面向开放域的实体识别 concept media
服务器日志 特征建模 搜索引擎 concept org
关系抽取 subclass of Relation Extraction concept unk
模式匹配 实体 语料 concept media
属性抽取 <misc> 统计机器学习 concept media
属性 subclass of 实体 concept misc
数据挖掘 subclass of 结构化数据 concept media
拼图碎片 非结构化 信息抽取 concept media
歧义 used by 实体消歧 concept media
共指消解 自然语言处理 信息检索 concept misc
外部知识库 结构化数据 知识图谱 concept media
数据层的融合 模式层 关系数据库 concept media
资源描述框架 <media> 本体构建本体 concept org
DB2RDF subclass of 结构化的历史数据 cel date
自动化本体构建过程 本体库 数据驱动的自动化方式 concept media
阿里 owned by 阿里巴巴 org media
上下位关系 阿里巴巴 图谱 concept media
腾讯 owned by 阿里巴巴 org concept
知识图谱 location 城市 concept loc
串联 规则 推理策略的一环 concept media
算法 part of 知识库 concept media
知识库的更新 subclass of 概念层 concept media
知识图谱 part of 数据层 concept media
总结 part of 知识图谱 concept media
知识图谱 移动个人助理(Siri 智能语义搜索 concept media
(Sri) subclass of 的知识 eve unk
病毒 知识图谱 埃博拉病毒的症状有哪些 concept media
症状 part of 三元组 concept misc