"""Gradio service that turns an image URL into a DINOv2 embedding."""

import io
from typing import Union

import gradio as gr
import requests
import torch
from PIL import Image
from transformers import AutoImageProcessor, AutoModel

# ======== Use DINOv2-Large (recommended) ========
model_name = "facebook/dinov2-large"
device = "cuda" if torch.cuda.is_available() else "cpu"

# Loaded once at import time and shared by every request.
processor = AutoImageProcessor.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name).to(device)

# Browser-like User-Agent sent with every image download.
_REQUEST_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
}
# Timeout, in seconds, passed to `requests.get`.
_REQUEST_TIMEOUT_S = 10


def _embed_image(image: Image.Image) -> list[float]:
    """Return the L2-normalized CLS-token embedding of *image* as a list."""
    inputs = processor(images=image, return_tensors="pt").to(device)
    with torch.no_grad():
        outputs = model(**inputs)
    # Dinov2Model outputs last_hidden_state: [1, num_patches+1, 1024];
    # the CLS token sits at the first position.
    emb = outputs.last_hidden_state[:, 0, :]
    emb = torch.nn.functional.normalize(emb, p=2, dim=-1)
    return emb[0].cpu().numpy().tolist()


def image_to_embedding(url: str) -> Union[list[float], dict[str, str]]:
    """Download the image at *url* and return its DINOv2 embedding.

    Args:
        url: Address of the image to fetch.

    Returns:
        The L2-normalized CLS-token embedding as a list of floats, or a
        ``{"error": message}`` dict when the image cannot be downloaded or
        decoded. Errors raised by the model itself are not caught.
    """
    # NOTE(security): `url` is caller-supplied and fetched as-is, so this
    # service will request any address reachable from the host (SSRF risk).
    # Restrict schemes/hosts here if the endpoint is exposed publicly.
    try:
        # The body is read inside the `try` because, with stream=True, network
        # errors can surface while reading `content`, not only in `get()`.
        # The `with` block releases the connection on every path.
        with requests.get(
            url,
            timeout=_REQUEST_TIMEOUT_S,
            headers=_REQUEST_HEADERS,
            stream=True,
        ) as response:
            response.raise_for_status()
            payload = response.content
    except Exception as e:
        # API boundary: report any download failure as an error payload.
        return {"error": f"Failed to download image: {e}"}

    try:
        image = Image.open(io.BytesIO(payload)).convert("RGB")
    except Exception as e:
        # A successful HTTP response is not necessarily an image (e.g. an
        # HTML page or a truncated file); report it instead of crashing.
        return {"error": f"Failed to decode image: {e}"}

    return _embed_image(image)


def process(image):
    """Embed *image* and report the embedding length.

    NOTE(review): despite its name, the argument is forwarded unchanged to
    `image_to_embedding`, which treats it as a URL; this function is also not
    wired into the UI below. Confirm the intended input type.

    Returns:
        ``(embedding, len(embedding))`` on success, or ``(None, message)``
        when no input was given or the embedding could not be produced.
    """
    if image is None:
        return None, "No image uploaded"

    result = image_to_embedding(image)
    if isinstance(result, dict):
        # `image_to_embedding` signals failure with an {"error": ...} dict;
        # surface its message rather than returning the dict and its length.
        return None, result["error"]
    return result, len(result)


with gr.Blocks(title="DINOv2 Image Embedding") as demo:
    gr.Markdown("# DINOv2 Image Embedding Service")
    # NOTE(review): the text below says "Upload", but the interface actually
    # takes an image URL as text.
    gr.Markdown("Upload an image to extract visual embedding")

    # Image interface
    gr.Interface(
        fn=image_to_embedding,
        inputs="text",  # input: image URL
        outputs="json",
        api_name="imageEmbedding"
    )

demo.launch()