bugfix
This commit is contained in:
parent
542e5da159
commit
f2f6390608
240
llmengine/mm_embedding.py
Normal file
@@ -0,0 +1,240 @@
# mm_embedding.py
"""
Unified multimodal embedder (text, image, video, audio)

Features:
- All modalities mapped to the same CLIP embedding space
- GPU/CPU/MPS auto detection
- FP16 autocast for speed
- Batch processing
- Video frame sampling + average pooling
- Audio resampling + mel-spectrogram CLIP embedding
- L2-normalized output for similarity search
"""

import os
from pathlib import Path
import numpy as np
import torch
from PIL import Image
import av
import librosa
from concurrent.futures import ThreadPoolExecutor
from math import ceil
from sklearn.cluster import DBSCAN          # used by deduplicate_faces
from sklearn.preprocessing import normalize  # used by deduplicate_faces
from appPublic.jsonConfig import getConfig
from appPublic.worker import awaitify
from ahserver.webapp import webapp
from ahserver.serverenv import ServerEnv

try:
    import face_recognition
    FACE_LIB_AVAILABLE = True
except Exception:
    FACE_LIB_AVAILABLE = False

# ------------------- Configuration -------------------
DEVICE = "cuda" if torch.cuda.is_available() else "mps" if getattr(torch.backends, "mps", None) and torch.backends.mps.is_available() else "cpu"
USE_FP16 = DEVICE == "cuda"

# Unified model for all modalities
CLIP_MODEL_NAME = "openai/clip-vit-large-patch14"
FRAME_SAMPLE_RATE = 1.0  # fps for video
FRAME_LIMIT = 64
AUDIO_SR = 16000  # resample audio

# ------------------- Load model -------------------
from transformers import CLIPProcessor, CLIPModel

# ------------------- Utils -------------------

def deduplicate_faces(face_embeddings, eps=0.4, min_samples=2):
    """Cluster face embeddings with DBSCAN and return one mean vector per cluster."""
    emb_norm = normalize(face_embeddings)
    clustering = DBSCAN(eps=eps, min_samples=min_samples, metric="cosine").fit(emb_norm)
    unique_faces = []
    for label in set(clustering.labels_):
        if label == -1:  # noise
            continue
        cluster_embs = emb_norm[clustering.labels_ == label]
        unique_faces.append(np.mean(cluster_embs, axis=0))
    return np.array(unique_faces)
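
# Usage sketch (not part of the pipeline above; file names are hypothetical).
# deduplicate_faces expects a 2-D array of per-face vectors, e.g. collected from
# MM_Embedding.embed_faces over several images, and returns one mean vector per person:
#
#   embedder = MM_Embedding(CLIP_MODEL_NAME)
#   all_faces = []
#   for path in ["a.jpg", "b.jpg"]:          # hypothetical inputs
#       all_faces.extend(embedder.embed_faces(Image.open(path).convert("RGB")))
#   if all_faces:
#       people = deduplicate_faces(np.array(all_faces))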

def l2_normalize(v):
    norm = np.linalg.norm(v)
    return v / norm if norm > 1e-10 else v


def chunked(lst, n):
    for i in range(0, len(lst), n):
        yield lst[i:i+n]


class MM_Embedding:
    def __init__(self, model_name):
        self.model = CLIPModel.from_pretrained(model_name).to(DEVICE)
        self.processor = CLIPProcessor.from_pretrained(model_name)
        if USE_FP16:
            self.model.half()

    # ------------------- Image -------------------
    def embed_images(self, paths, batch_size=16):
        results = {}
        for batch in chunked(paths, batch_size):
            imgs = [Image.open(p).convert("RGB") for p in batch]
            inputs = self.processor(images=imgs, return_tensors="pt", padding=True).to(DEVICE)
            with torch.no_grad():
                if USE_FP16:
                    with torch.cuda.amp.autocast():
                        feats = self.model.get_image_features(**inputs)
                else:
                    feats = self.model.get_image_features(**inputs)
            feats = feats.cpu().numpy()
            faces_list = []
            for img in imgs:
                faces = self.extract_faces(img)
                face_vecs = self.embed_faces(img)
                faces_list.append([faces, face_vecs])

            for p, v, fs in zip(batch, feats, faces_list):
                results[p] = {
                    'type': 'image',
                    'path': p,
                    'faces': fs[0],
                    'face_vecs': fs[1],
                    'face_count': len(fs[0]),
                    'vector': l2_normalize(v)
                }
        return results

    # ------------------- Text -------------------
    def embed_texts(self, texts, batch_size=64):
        results = {}
        for batch in chunked(texts, batch_size):
            inputs = self.processor(text=batch, return_tensors="pt", padding=True, truncation=True).to(DEVICE)
            with torch.no_grad():
                if USE_FP16:
                    with torch.cuda.amp.autocast():
                        feats = self.model.get_text_features(**inputs)
                else:
                    feats = self.model.get_text_features(**inputs)
            feats = feats.cpu().numpy()
            for t, v in zip(batch, feats):
                results[t] = l2_normalize(v)
        return results

    # ------------------- Video -------------------
    def embed_videos(self, paths, frame_rate=FRAME_SAMPLE_RATE, frame_limit=FRAME_LIMIT):
        results = {}
        for p in paths:
            container = av.open(p)
            frames = []
            fps = float(container.streams.video[0].average_rate) if container.streams.video else 30.0
            step = max(1, int(fps / max(1, frame_rate)))
            count = 0
            for i, frame in enumerate(container.decode(video=0)):
                if i % step == 0:
                    frames.append(frame.to_image().convert("RGB"))
                    count += 1
                    if count >= frame_limit:
                        break
            container.close()
            if not frames:
                results[p] = None
                continue
            # batch embed
            emb_list = []
            for batch in chunked(frames, 16):
                inputs = self.processor(images=batch, return_tensors="pt", padding=True).to(DEVICE)
                with torch.no_grad():
                    if USE_FP16:
                        with torch.cuda.amp.autocast():
                            feats = self.model.get_image_features(**inputs)
                    else:
                        feats = self.model.get_image_features(**inputs)
                emb_list.append(feats.cpu().numpy())
            emb_array = np.vstack(emb_list)
            # average-pool frame embeddings into a single clip-level vector
            video_vec = l2_normalize(emb_array.mean(axis=0))
            results[p] = video_vec
        return results

    # ------------------- Audio -------------------
    def embed_audios(self, paths, batch_size=4):
        results = {}
        for p in paths:
            y, sr = librosa.load(p, sr=AUDIO_SR, mono=True)
            # convert to mel spectrogram image
            S = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=224)
            S_db = librosa.power_to_db(S, ref=np.max)
            img = Image.fromarray(np.uint8((S_db - S_db.min()) / (S_db.max() - S_db.min() + 1e-9) * 255)).convert("RGB").resize((224, 224))
            inputs = self.processor(images=img, return_tensors="pt").to(DEVICE)
            with torch.no_grad():
                if USE_FP16:
                    with torch.cuda.amp.autocast():
                        feat = self.model.get_image_features(**inputs)
                else:
                    feat = self.model.get_image_features(**inputs)
            results[p] = l2_normalize(feat.cpu().numpy()[0])
        return results

    def extract_faces(self, img: Image.Image):
        """Return a list of cropped face regions."""
        if not FACE_LIB_AVAILABLE:
            return []
        arr = np.array(img)
        face_locs = face_recognition.face_locations(arr)
        faces = []
        for (top, right, bottom, left) in face_locs:
            face = arr[top:bottom, left:right]
            faces.append(Image.fromarray(face))
        return faces

    def embed_faces(self, img: Image.Image):
        """Extract face vectors (face_recognition encodings)."""
        if not FACE_LIB_AVAILABLE:
            return []
        arr = np.array(img)
        encodings = face_recognition.face_encodings(arr)
        if not encodings:
            return []
        return [l2_normalize(np.array(e)) for e in encodings]

    # ------------------- Dispatcher -------------------
    def embed_batch(self, inputs):
        groups = {"image": [], "video": [], "audio": [], "text": []}
        for item in inputs:
            p = Path(item)
            ext = item.lower()
            if p.exists():
                if any(ext.endswith(e) for e in [".jpg", ".jpeg", ".png", ".bmp", ".webp", ".heic"]):
                    groups["image"].append(item)
                elif any(ext.endswith(e) for e in [".mp4", ".mov", ".avi", ".mkv"]):
                    groups["video"].append(item)
                elif any(ext.endswith(e) for e in [".mp3", ".wav", ".flac"]):
                    groups["audio"].append(item)
                else:
                    groups["text"].append(item)
            else:
                groups["text"].append(item)
        outputs = {}
        if groups["image"]:
            outputs.update(self.embed_images(groups["image"]))
        if groups["video"]:
            outputs.update(self.embed_videos(groups["video"]))
        if groups["audio"]:
            outputs.update(self.embed_audios(groups["audio"]))
        if groups["text"]:
            outputs.update(self.embed_texts(groups["text"]))
        return outputs
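
# Dispatcher usage sketch (hypothetical inputs): existing files are routed by
# extension, anything else is embedded as text. Note that image entries come
# back as dicts (vector + face info) while text/video/audio entries come back
# as plain vectors.
#
#   embedder = MM_Embedding(CLIP_MODEL_NAME)
#   out = embedder.embed_batch(["photo.jpg", "clip.mp4", "song.mp3", "a red car"])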

def init():
    env = ServerEnv()
    config = getConfig()
    env.mm_model = MM_Embedding(config.model_name)
    env.embeded_batch = awaitify(env.mm_model.embed_batch)

# ------------------- CLI -------------------
if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("inputs", nargs="+", help="file paths or text strings")
    parser.add_argument("--out", default="embeddings.npy")
    args = parser.parse_args()

    embedder = MM_Embedding(CLIP_MODEL_NAME)
    embeddings = embedder.embed_batch(args.inputs)
    # save dict of name -> vector (image entries are dicts, so only plain arrays are converted)
    out_dict = {k: v.tolist() if isinstance(v, np.ndarray) else v for k, v in embeddings.items()}
    np.save(args.out, out_dict)
    print(f"Saved embeddings to {args.out}")
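
# Example invocation (file names are hypothetical):
#   python llmengine/mm_embedding.py photo.jpg clip.mp4 "a red car" --out vectors.npy
# The saved file is a pickled dict; load it back with:
#   np.load("vectors.npy", allow_pickle=True).item()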