import os

from datetime import datetime

from langchain_community.document_loaders.csv_loader import CSVLoader
from langchain_community.document_loaders.text import TextLoader
from langchain_community.document_loaders import UnstructuredPDFLoader
from langchain_community.document_loaders import UnstructuredWordDocumentLoader
from langchain_community.document_loaders import UnstructuredExcelLoader
from langchain_community.document_loaders import UnstructuredPowerPointLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

from appPublic.log import debug
from appPublic.uniqueID import getID

from get_vector_db import get_vector_db

TEMP_FOLDER = os.getenv('TEMP_FOLDER', './_temp')

# Function to check if the uploaded file type is allowed
# (pdf, doc/docx, xls/xlsx, ppt/pptx, csv or txt)
def allowed_file(filename):
    allowed_file_suffixes = ['pdf', 'doc', 'docx', 'xlsx', 'xls', 'ppt', 'pptx', 'csv', 'txt']
    return '.' in filename and filename.rsplit('.', 1)[1].lower() in allowed_file_suffixes
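
# e.g. allowed_file('report.PDF') -> True (the check is case-insensitive),
#      allowed_file('archive.zip') -> False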

# Load the file with a loader matched to its extension, then split
# the loaded documents into chunks
def load_and_split_data(file_path):
    if file_path.lower().endswith('.pdf'):
        loader = UnstructuredPDFLoader(file_path=file_path)
    elif file_path.lower().endswith(('.docx', '.doc')):
        loader = UnstructuredWordDocumentLoader(file_path=file_path)
    elif file_path.lower().endswith(('.pptx', '.ppt')):
        loader = UnstructuredPowerPointLoader(file_path=file_path)
    elif file_path.lower().endswith(('.xlsx', '.xls')):
        loader = UnstructuredExcelLoader(file_path=file_path)
    elif file_path.lower().endswith('.csv'):
        loader = CSVLoader(file_path=file_path)
    else:
        loader = TextLoader(file_path=file_path)
    data = loader.load()
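
    # RecursiveCharacterTextSplitter measures chunk_size/chunk_overlap in
    # characters (not tokens) by default: chunks of up to 7500 characters,
    # with a 100-character overlap to preserve context across boundaries.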
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=7500, chunk_overlap=100)
    chunks = text_splitter.split_documents(data)

    return chunks

# Main function to handle the embedding process
def embed(file_path, userid, kdbname):
    if allowed_file(file_path):
        chunks = load_and_split_data(file_path)
        debug(f'{chunks=}')
        db = get_vector_db(userid, kdbname)
        db.add(
            documents=[c.page_content for c in chunks],
            metadatas=[c.metadata for c in chunks],
            ids=[getID() for c in chunks]
        )
        return True
    return False
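
# Minimal usage sketch (the file path, user id and kdb name below are
# hypothetical; assumes get_vector_db() returns a collection whose add()
# accepts documents/metadatas/ids as used above):
#
#   if embed('./_temp/report.pdf', 'user-001', 'mykb'):
#       debug('file embedded')
#   else:
#       debug('unsupported file type')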