bugfix
parent 542e5da159
commit f2f6390608
llmengine/mm_embedding.py (new file, 240 lines)
@@ -0,0 +1,240 @@
# mm_embedding.py
"""
Unified multimodal embedder (text, image, video, audio)
Features:
- All modalities mapped to the same CLIP embedding space
- GPU/CPU/MPS auto detection
- FP16 autocast for speed on CUDA
- Batch processing
- Video frame sampling + average pooling
- Audio resampled to a mel-spectrogram image and embedded with CLIP
- L2-normalized output for similarity search
"""

import os
from pathlib import Path
import numpy as np
import torch
from PIL import Image
import av
import librosa
from concurrent.futures import ThreadPoolExecutor
from math import ceil
from sklearn.cluster import DBSCAN            # used by deduplicate_faces
from sklearn.preprocessing import normalize   # used by deduplicate_faces
from appPublic.jsonConfig import getConfig
from appPublic.worker import awaitify
from ahserver.webapp import webapp
from ahserver.serverenv import ServerEnv

try:
    import face_recognition
    FACE_LIB_AVAILABLE = True
except Exception:
    FACE_LIB_AVAILABLE = False

# ------------------- Configuration -------------------
DEVICE = "cuda" if torch.cuda.is_available() else "mps" if getattr(torch.backends, "mps", None) and torch.backends.mps.is_available() else "cpu"
USE_FP16 = DEVICE == "cuda"

# Unified model for all modalities
CLIP_MODEL_NAME = "openai/clip-vit-large-patch14"
FRAME_SAMPLE_RATE = 1.0  # fps for video
FRAME_LIMIT = 64
AUDIO_SR = 16000  # resample audio

# ------------------- Load model -------------------
from transformers import CLIPProcessor, CLIPModel

# ------------------- Utils -------------------

def deduplicate_faces(face_embeddings, eps=0.4, min_samples=2):
    """Cluster near-duplicate face embeddings with DBSCAN and return one mean embedding per cluster."""
    emb_norm = normalize(face_embeddings)
    clustering = DBSCAN(eps=eps, min_samples=min_samples, metric="cosine").fit(emb_norm)
    unique_faces = []
    for label in set(clustering.labels_):
        if label == -1:  # noise
            continue
        cluster_embs = emb_norm[clustering.labels_ == label]
        unique_faces.append(np.mean(cluster_embs, axis=0))
    return np.array(unique_faces)

def l2_normalize(v):
    norm = np.linalg.norm(v)
    return v / norm if norm > 1e-10 else v

def chunked(lst, n):
    for i in range(0, len(lst), n):
        yield lst[i:i+n]

class MM_Embedding:
    def __init__(self, model_name):
        self.model = CLIPModel.from_pretrained(model_name).to(DEVICE)
        self.processor = CLIPProcessor.from_pretrained(model_name)
        if USE_FP16:
            self.model.half()

    # ------------------- Image -------------------
    def embed_images(self, paths, batch_size=16):
        results = {}
        for batch in chunked(paths, batch_size):
            imgs = [Image.open(p).convert("RGB") for p in batch]
            inputs = self.processor(images=imgs, return_tensors="pt", padding=True).to(DEVICE)
            with torch.no_grad():
                if USE_FP16:
                    with torch.cuda.amp.autocast():
                        feats = self.model.get_image_features(**inputs)
                else:
                    feats = self.model.get_image_features(**inputs)
            feats = feats.cpu().numpy()
            faces_list = []
            for img in imgs:
                faces = self.extract_faces(img)
                face_vecs = self.embed_faces(img)
                faces_list.append([faces, face_vecs])

            for p, v, fs in zip(batch, feats, faces_list):
                results[p] = {
                    'type': 'image',
                    'path': p,
                    'faces': fs[0],
                    'face_vecs': fs[1],
                    'face_count': len(fs[0]),
                    'vector': l2_normalize(v)
                }
        return results

    # ------------------- Text -------------------
    def embed_texts(self, texts, batch_size=64):
        results = {}
        for batch in chunked(texts, batch_size):
            inputs = self.processor(text=batch, return_tensors="pt", padding=True, truncation=True).to(DEVICE)
            with torch.no_grad():
                if USE_FP16:
                    with torch.cuda.amp.autocast():
                        feats = self.model.get_text_features(**inputs)
                else:
                    feats = self.model.get_text_features(**inputs)
            feats = feats.cpu().numpy()
            for t, v in zip(batch, feats):
                results[t] = l2_normalize(v)
        return results

    # ------------------- Video -------------------
    def embed_videos(self, paths, frame_rate=FRAME_SAMPLE_RATE, frame_limit=FRAME_LIMIT):
        results = {}
        for p in paths:
            container = av.open(p)
            frames = []
            fps = float(container.streams.video[0].average_rate) if container.streams.video else 30.0
            step = max(1, int(fps / max(1, frame_rate)))
            count = 0
            for i, frame in enumerate(container.decode(video=0)):
                if i % step == 0:
                    frames.append(frame.to_image().convert("RGB"))
                    count += 1
                    if count >= frame_limit:
                        break
            container.close()
            if not frames:
                results[p] = None
                continue
            # batch embed
            emb_list = []
            for batch in chunked(frames, 16):
                inputs = self.processor(images=batch, return_tensors="pt", padding=True).to(DEVICE)
                with torch.no_grad():
                    if USE_FP16:
                        with torch.cuda.amp.autocast():
                            feats = self.model.get_image_features(**inputs)
                    else:
                        feats = self.model.get_image_features(**inputs)
                emb_list.append(feats.cpu().numpy())
            emb_array = np.vstack(emb_list)
            video_vec = l2_normalize(emb_array.mean(axis=0))
            results[p] = video_vec
        return results

    # ------------------- Audio -------------------
    def embed_audios(self, paths, batch_size=4):
        results = {}
        for p in paths:
            y, sr = librosa.load(p, sr=AUDIO_SR, mono=True)
            # convert to mel spectrogram image
            S = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=224)
            S_db = librosa.power_to_db(S, ref=np.max)
            img = Image.fromarray(np.uint8((S_db - S_db.min()) / (S_db.max() - S_db.min() + 1e-9) * 255)).convert("RGB").resize((224, 224))
            inputs = self.processor(images=img, return_tensors="pt").to(DEVICE)
            with torch.no_grad():
                if USE_FP16:
                    with torch.cuda.amp.autocast():
                        feat = self.model.get_image_features(**inputs)
                else:
                    feat = self.model.get_image_features(**inputs)
            results[p] = l2_normalize(feat.cpu().numpy()[0])
        return results

    def extract_faces(self, img: Image.Image):
        """Return a list of cropped face regions as PIL images."""
        if not FACE_LIB_AVAILABLE:
            return []
        arr = np.array(img)
        face_locs = face_recognition.face_locations(arr)
        faces = []
        for (top, right, bottom, left) in face_locs:
            face = arr[top:bottom, left:right]
            faces.append(Image.fromarray(face))
        return faces

    def embed_faces(self, img: Image.Image):
        """Extract L2-normalized face vectors (face_recognition encodings)."""
        if not FACE_LIB_AVAILABLE:
            return []
        arr = np.array(img)
        encodings = face_recognition.face_encodings(arr)
        if not encodings:
            return []
        return [l2_normalize(np.array(e)) for e in encodings]

    # ------------------- Dispatcher -------------------
    def embed_batch(self, inputs):
        groups = {"image": [], "video": [], "audio": [], "text": []}
        for item in inputs:
            p = Path(item)
            ext = item.lower()
            if p.exists():
                if any(ext.endswith(e) for e in [".jpg", ".jpeg", ".png", ".bmp", ".webp", ".heic"]):
                    groups["image"].append(item)
                elif any(ext.endswith(e) for e in [".mp4", ".mov", ".avi", ".mkv"]):
                    groups["video"].append(item)
                elif any(ext.endswith(e) for e in [".mp3", ".wav", ".flac"]):
                    groups["audio"].append(item)
                else:
                    groups["text"].append(item)
            else:
                groups["text"].append(item)
        outputs = {}
        if groups["image"]:
            outputs.update(self.embed_images(groups["image"]))
        if groups["video"]:
            outputs.update(self.embed_videos(groups["video"]))
        if groups["audio"]:
            outputs.update(self.embed_audios(groups["audio"]))
        if groups["text"]:
            outputs.update(self.embed_texts(groups["text"]))
        return outputs
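
# ------------------- Example: similarity search (sketch) -------------------
# Illustrative only, not used by the service. Because every returned vector is
# L2-normalized, a plain dot product equals cosine similarity. The function
# name and its arguments (query_image, candidate_texts) are hypothetical.
def example_similarity_search(query_image, candidate_texts):
    embedder = MM_Embedding(CLIP_MODEL_NAME)
    text_vecs = embedder.embed_texts(candidate_texts)  # {text: unit vector}
    query_vec = embedder.embed_images([query_image])[query_image]["vector"]
    scores = {t: float(np.dot(query_vec, v)) for t, v in text_vecs.items()}
    return max(scores, key=scores.get)  # best-matching caption for the image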

def init():
    env = ServerEnv()
    config = getConfig()
    env.mm_model = MM_Embedding(config.model_name)
    env.embeded_batch = awaitify(env.mm_model.embed_batch)

# ------------------- CLI -------------------
if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("inputs", nargs="+", help="file paths or text strings")
    parser.add_argument("--out", default="embeddings.npy")
    args = parser.parse_args()

    embedder = MM_Embedding(CLIP_MODEL_NAME)
    embeddings = embedder.embed_batch(args.inputs)
    # save dict of name->vector (image entries are info dicts, other entries are vectors)
    out_dict = {k: (v.tolist() if isinstance(v, np.ndarray) else v) for k, v in embeddings.items()}
    np.save(args.out, out_dict)
    print(f"Saved embeddings to {args.out}")