diff --git a/llmengine/mm_embedding.py b/llmengine/mm_embedding.py
index f788be6..455e9b2 100644
--- a/llmengine/mm_embedding.py
+++ b/llmengine/mm_embedding.py
@@ -9,8 +9,20 @@ Features:
 - Video frame sampling + average pooling
 - Audio resampling + CLAP embedding
 - L2 normalized output for similarity search
+
+model_name='/data/ymq/models/laion/CLIP-ViT-B-32-laion2B-s34B-b79K'
+
+input:
+
+text:
+{
+    "type":"text",
+    "text":"...."
+}
+image:
 """
+
 import os
 from pathlib import Path
 import numpy as np
@@ -35,6 +47,13 @@ except Exception:
 DEVICE = "cuda" if torch.cuda.is_available() else "mps" if getattr(torch.backends, "mps", None) and torch.backends.mps.is_available() else "cpu"
 USE_FP16 = DEVICE == "cuda"
 
+def choose_device():
+    if torch.cuda.is_available():
+        return "cuda"
+    if getattr(torch.backends, "mps", None) and torch.backends.mps.is_available():
+        return "mps"
+    return "cpu"
+
 # Unified model for all modalities
 CLIP_MODEL_NAME = "openai/clip-vit-large-patch14"
 FRAME_SAMPLE_RATE = 1.0  # fps for video
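
For context on the "L2 normalized output for similarity search" feature this diff documents: a minimal sketch of CLIP text/image embedding with L2 normalization, assuming the standard transformers `CLIPModel`/`CLIPProcessor` API and the `CLIP_MODEL_NAME` set in the hunk above. The `embed_text`/`embed_image` names are illustrative, not necessarily this module's actual interface.

```python
# Sketch: L2-normalized CLIP text and image embeddings.
# `embed_text`/`embed_image` are illustrative names, not this module's API.
import torch
from PIL import Image
from transformers import CLIPModel, CLIPProcessor

CLIP_MODEL_NAME = "openai/clip-vit-large-patch14"
model = CLIPModel.from_pretrained(CLIP_MODEL_NAME).eval()
processor = CLIPProcessor.from_pretrained(CLIP_MODEL_NAME)

@torch.no_grad()
def embed_text(text: str) -> torch.Tensor:
    inputs = processor(text=[text], return_tensors="pt", padding=True)
    feats = model.get_text_features(**inputs)
    # L2-normalize so a plain dot product equals cosine similarity
    return feats / feats.norm(dim=-1, keepdim=True)

@torch.no_grad()
def embed_image(path: str) -> torch.Tensor:
    inputs = processor(images=Image.open(path).convert("RGB"), return_tensors="pt")
    feats = model.get_image_features(**inputs)
    return feats / feats.norm(dim=-1, keepdim=True)
```

With normalized vectors, similarity search reduces to a dot product, e.g. `(embed_text("a dog") @ embed_image("dog.jpg").T).item()`.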
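
The "video frame sampling + average pooling" feature could look like the sketch below: decode at `FRAME_SAMPLE_RATE` fps, embed each frame with CLIP, mean-pool across frames, and re-normalize (pooling breaks unit length). This assumes OpenCV (`cv2`) for decoding; `sample_frames`/`embed_video` are hypothetical names.

```python
# Sketch: frame sampling at FRAME_SAMPLE_RATE fps + average pooling of CLIP
# features. Assumes OpenCV for decoding; function names are illustrative.
import cv2
import torch
from PIL import Image
from transformers import CLIPModel, CLIPProcessor

FRAME_SAMPLE_RATE = 1.0  # fps, as configured in the diff
model = CLIPModel.from_pretrained("openai/clip-vit-large-patch14").eval()
processor = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14")

def sample_frames(video_path: str) -> list:
    cap = cv2.VideoCapture(video_path)
    fps = cap.get(cv2.CAP_PROP_FPS) or FRAME_SAMPLE_RATE
    step = max(int(round(fps / FRAME_SAMPLE_RATE)), 1)
    frames, idx = [], 0
    while True:
        ok, frame = cap.read()
        if not ok:
            break
        if idx % step == 0:
            # cv2 decodes BGR; CLIP preprocessing expects RGB
            frames.append(Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)))
        idx += 1
    cap.release()
    return frames

@torch.no_grad()
def embed_video(video_path: str) -> torch.Tensor:
    frames = sample_frames(video_path)
    if not frames:
        raise ValueError(f"no frames decoded from {video_path}")
    inputs = processor(images=frames, return_tensors="pt")
    per_frame = model.get_image_features(**inputs)  # (num_frames, dim)
    pooled = per_frame.mean(dim=0, keepdim=True)    # average pooling over frames
    return pooled / pooled.norm(dim=-1, keepdim=True)  # re-normalize to unit L2
```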
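
For the "audio resampling + CLAP embedding" feature, one plausible shape, assuming the transformers `ClapModel`/`ClapProcessor` API and librosa for loading. The `laion/clap-htsat-unfused` checkpoint, the 48 kHz target rate, and `embed_audio` are assumptions for illustration, not confirmed by this diff.

```python
# Sketch: resample audio to CLAP's expected rate, then embed and L2-normalize.
# Checkpoint name, sample rate, and `embed_audio` are assumptions.
import librosa
import torch
from transformers import ClapModel, ClapProcessor

CLAP_MODEL_NAME = "laion/clap-htsat-unfused"  # assumed checkpoint
CLAP_SAMPLE_RATE = 48_000  # HTSAT-based CLAP models take 48 kHz input

clap_model = ClapModel.from_pretrained(CLAP_MODEL_NAME).eval()
clap_processor = ClapProcessor.from_pretrained(CLAP_MODEL_NAME)

@torch.no_grad()
def embed_audio(path: str) -> torch.Tensor:
    # librosa resamples to the requested rate on load
    waveform, _ = librosa.load(path, sr=CLAP_SAMPLE_RATE, mono=True)
    inputs = clap_processor(audios=waveform, sampling_rate=CLAP_SAMPLE_RATE,
                            return_tensors="pt")
    feats = clap_model.get_audio_features(**inputs)
    return feats / feats.norm(dim=-1, keepdim=True)  # L2-normalize
```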