# rag/app/embed.py

import os
from datetime import datetime
from langchain_community.document_loaders.csv_loader import CSVLoader
from langchain_community.document_loaders.text import TextLoader
from langchain_community.document_loaders import UnstructuredPDFLoader
from langchain_community.document_loaders import UnstructuredWordDocumentLoader
from langchain_community.document_loaders import UnstructuredExcelLoader
from langchain_community.document_loaders import UnstructuredPowerPointLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from appPublic.log import debug
from appPublic.uniqueID import getID

from get_vector_db import get_vector_db

# Folder path taken from the TEMP_FOLDER environment variable; './_temp' when unset.
TEMP_FOLDER = os.environ.get('TEMP_FOLDER', './_temp')

# Check whether a file name carries one of the supported document extensions
def allowed_file(filename):
    """Return True when *filename* ends in a supported extension.

    The text after the last '.' is compared case-insensitively against the
    supported suffixes (pdf, doc, docx, xlsx, xls, ppt, pptx, csv, txt).
    A name without any '.' is rejected.
    """
    if '.' not in filename:
        return False

    supported_suffixes = {
        'pdf', 'doc', 'docx', 'xlsx', 'xls', 'ppt', 'pptx', 'csv', 'txt',
    }
    suffix = filename.rsplit('.', 1)[1].lower()
    return suffix in supported_suffixes

# Load a supported document and split its content into chunks
def load_and_split_data(file_path):
    """Load the document at *file_path* and split it into chunks.

    The loader is selected from the file extension (case-insensitive):

    * ``.pdf``            -> UnstructuredPDFLoader
    * ``.docx`` / ``.doc`` -> UnstructuredWordDocumentLoader
    * ``.pptx`` / ``.ppt`` -> UnstructuredPowerPointLoader
    * ``.xlsx`` / ``.xls`` -> UnstructuredExcelLoader
    * ``.csv``            -> CSVLoader
    * anything else       -> TextLoader

    Args:
        file_path: Path of the file to load.

    Returns:
        The result of ``RecursiveCharacterTextSplitter.split_documents`` on
        the loaded documents (chunk_size=7500, chunk_overlap=100).
    """
    lowered_path = file_path.lower()

    if lowered_path.endswith('.pdf'):
        loader = UnstructuredPDFLoader(file_path=file_path)
    elif lowered_path.endswith(('.docx', '.doc')):
        loader = UnstructuredWordDocumentLoader(file_path=file_path)
    elif lowered_path.endswith(('.pptx', '.ppt')):
        # Legacy '.ppt' must be matched here too; otherwise it would fall
        # through to the plain-text loader below.
        loader = UnstructuredPowerPointLoader(file_path=file_path)
    elif lowered_path.endswith(('.xlsx', '.xls')):
        loader = UnstructuredExcelLoader(file_path=file_path)
    elif lowered_path.endswith('.csv'):
        loader = CSVLoader(file_path=file_path)
    else:
        loader = TextLoader(file_path=file_path)

    # Loading happens once, after the branch chain, so every loader type is
    # actually read (not only the text fallback).
    data = loader.load()

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=7500, chunk_overlap=100)
    return text_splitter.split_documents(data)

# Entry point of the embedding process for a single file
def embed(file_path, userid, kdbname):
    """Split a file into chunks and add them to a vector store.

    Args:
        file_path: Path of the file to embed; its extension must pass
            ``allowed_file``.
        userid: Passed through to ``get_vector_db``.
        kdbname: Passed through to ``get_vector_db``.

    Returns:
        True after the chunks have been added; False when the file
        extension is not allowed (nothing is loaded or stored then).
    """
    if not allowed_file(file_path):
        return False

    chunks = load_and_split_data(file_path)
    debug(f'{chunks=}')

    # NOTE(review): the object returned by get_vector_db is only known here
    # through its add(documents=, metadatas=, ids=) call.
    store = get_vector_db(userid, kdbname)
    texts = [chunk.page_content for chunk in chunks]
    metadata_list = [chunk.metadata for chunk in chunks]
    # One freshly generated id per chunk.
    chunk_ids = [getID() for _ in chunks]
    store.add(documents=texts, metadatas=metadata_list, ids=chunk_ids)
    return True