import os

from langchain_community.document_loaders.csv_loader import CSVLoader
from langchain_community.document_loaders.text import TextLoader
from langchain_community.document_loaders import UnstructuredPDFLoader
from langchain_community.document_loaders import UnstructuredWordDocumentLoader
from langchain_community.document_loaders import UnstructuredExcelLoader
from langchain_community.document_loaders import UnstructuredPowerPointLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from appPublic.log import debug
from appPublic.uniqueID import getID

from get_vector_db import get_vector_db

TEMP_FOLDER = os.getenv('TEMP_FOLDER', './_temp')


# Check whether the uploaded file has a supported extension.
def allowed_file(filename):
    allowed_suffixes = ['pdf', 'doc', 'docx', 'xlsx', 'xls', 'ppt', 'pptx', 'csv', 'txt']
    return '.' in filename and filename.rsplit('.', 1)[1].lower() in allowed_suffixes


# Load the file with a loader matched to its extension, then split it into chunks.
def load_and_split_data(file_path):
    lower = file_path.lower()
    if lower.endswith('.pdf'):
        loader = UnstructuredPDFLoader(file_path=file_path)
    elif lower.endswith(('.docx', '.doc')):
        loader = UnstructuredWordDocumentLoader(file_path=file_path)
    elif lower.endswith(('.pptx', '.ppt')):
        loader = UnstructuredPowerPointLoader(file_path=file_path)
    elif lower.endswith(('.xlsx', '.xls')):
        loader = UnstructuredExcelLoader(file_path=file_path)
    elif lower.endswith('.csv'):
        loader = CSVLoader(file_path=file_path)
    else:
        # Fall back to a plain-text loader for anything else (e.g. .txt).
        loader = TextLoader(file_path=file_path)
    data = loader.load()
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=7500, chunk_overlap=100)
    chunks = text_splitter.split_documents(data)
    return chunks


# Main entry point: load and split the file, then store its chunks
# in the vector database for the given user and knowledge base.
def embed(file_path, userid, kdbname):
    if allowed_file(file_path):
        chunks = load_and_split_data(file_path)
        debug(f'{chunks=}')
        db = get_vector_db(userid, kdbname)
        db.add(
            documents=[c.page_content for c in chunks],
            metadatas=[c.metadata for c in chunks],
            ids=[getID() for c in chunks]
        )
        return True
    return False
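

# A minimal usage sketch, run only when the module is executed directly.
# Assumptions: the file path, user id, and knowledge-base name below are
# placeholders, and get_vector_db is expected to return a Chroma-style
# collection exposing the add() method used in embed() above.
if __name__ == '__main__':
    ok = embed('./_temp/example.pdf', 'user-001', 'demo_kdb')
    print('embedded' if ok else 'unsupported file type')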