rag/app/embed.py
2025-07-16 15:06:59 +08:00

58 lines
2.5 KiB
Python

import os
from datetime import datetime
from langchain_community.document_loaders.csv_loader import CSVLoader
from langchain_community.document_loaders.text import TextLoader
from langchain_community.document_loaders import UnstructuredPDFLoader
from langchain_community.document_loaders import UnstructuredWordDocumentLoader
from langchain_community.document_loaders import UnstructuredExcelLoader
from langchain_community.document_loaders import UnstructuredPowerPointLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from appPublic.log import debug
from appPublic.uniqueID import getID
from get_vector_db import get_vector_db
# Directory path taken from the TEMP_FOLDER environment variable, falling
# back to './_temp' when the variable is unset.
# NOTE(review): presumably a scratch location for uploaded files; confirm against callers.
TEMP_FOLDER = os.environ.get('TEMP_FOLDER', './_temp')
# Function to check whether the uploaded file has an allowed extension (PDF, Word, Excel, PowerPoint, CSV or plain text)
def allowed_file(filename):
    """Return True when *filename* ends with one of the accepted extensions.

    The extension is the text after the last '.' in the name and is compared
    case-insensitively. Names without any '.' are rejected.
    """
    if '.' not in filename:
        return False
    accepted_extensions = (
        'pdf', 'doc', 'docx', 'xlsx', 'xls', 'ppt', 'pptx', 'csv', 'txt',
    )
    extension = filename.rsplit('.', 1)[-1].lower()
    return extension in accepted_extensions
# Function to load a supported document and split its content into chunks
def load_and_split_data(file_path, chunk_size=7500, chunk_overlap=100):
    """Load a document with an extension-appropriate loader and split it.

    The loader is chosen from the (case-insensitive) file extension:
    PDF, Word (.doc/.docx), PowerPoint (.ppt/.pptx), Excel (.xls/.xlsx) and
    CSV each get a dedicated loader; anything else is read with TextLoader.

    Args:
        file_path: Path of the document to load.
        chunk_size: Passed to RecursiveCharacterTextSplitter as chunk_size.
        chunk_overlap: Passed to RecursiveCharacterTextSplitter as
            chunk_overlap.

    Returns:
        The chunks produced by the splitter's split_documents() for the
        loaded document.
    """
    # Lower-case once so every extension test below is case-insensitive.
    lowered_path = file_path.lower()

    if lowered_path.endswith('.pdf'):
        loader = UnstructuredPDFLoader(file_path=file_path)
    elif lowered_path.endswith(('.docx', '.doc')):
        loader = UnstructuredWordDocumentLoader(file_path=file_path)
    elif lowered_path.endswith(('.pptx', '.ppt')):
        # The original tested '.pptx' twice, so legacy '.ppt' files fell
        # through to the plain-text loader below.
        loader = UnstructuredPowerPointLoader(file_path=file_path)
    elif lowered_path.endswith(('.xlsx', '.xls')):
        loader = UnstructuredExcelLoader(file_path=file_path)
    elif lowered_path.endswith('.csv'):
        loader = CSVLoader(file_path=file_path)
    else:
        # Fallback for '.txt' and any other extension.
        loader = TextLoader(file_path=file_path)

    documents = loader.load()
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(documents)
# Main function to handle the embedding process
def embed(file_path, userid, kdbname):
    """Chunk a supported file and add the chunks to a vector database.

    Args:
        file_path: Path of the document; its extension is validated with
            allowed_file() before anything is loaded.
        userid: Forwarded to get_vector_db().
        kdbname: Forwarded to get_vector_db().

    Returns:
        True once the chunks have been passed to the database's add();
        False when the file extension is not allowed.
    """
    if not allowed_file(file_path):
        return False

    chunks = load_and_split_data(file_path)
    debug(f'{chunks=}')

    vector_store = get_vector_db(userid, kdbname)
    texts = [piece.page_content for piece in chunks]
    metadata_records = [piece.metadata for piece in chunks]
    # One freshly generated identifier per chunk.
    chunk_ids = [getID() for _ in chunks]
    vector_store.add(
        documents=texts,
        metadatas=metadata_records,
        ids=chunk_ids,
    )
    return True