Spaces:
Runtime error
Runtime error
| import os | |
| import gradio as gr | |
| import spaces | |
| import torch | |
| import tempfile | |
| import imageio | |
| from decord import VideoReader, cpu | |
| from transformers import pipeline | |
| from PIL import Image | |
# Hub access token read from the environment; None when the variable is unset.
hf_token = os.getenv("HUGGINGFACE_TOKEN")

# Hugging Face Hub id of the checkpoint handed to the inference pipeline.
model_id = "google/gemma-3-27b-it"

# Presumably the number of frames sampled per video; it matches the default
# of sample_video_frames — TODO confirm it is meant to drive that call.
NUM_FRAMES = 8
# Sample N evenly spaced frames from a video.
def sample_video_frames(video_path, num_frames=8):
    """Return evenly spaced frames of a video as PIL images.

    Args:
        video_path: Path of the video file, handed to decord's ``VideoReader``.
        num_frames: Maximum number of frames to sample; must be at least 1.
            A video with fewer frames yields one image per available frame
            rather than repeating frames.

    Returns:
        A list of ``PIL.Image.Image`` objects in playback order. The first
        sampled frame is frame 0 and, when more than one frame is sampled,
        the last one is the final frame of the video.

    Raises:
        ValueError: If ``num_frames`` is less than 1 or the video has no
            frames.
    """
    if num_frames < 1:
        raise ValueError(f"num_frames must be at least 1, got {num_frames}")

    vr = VideoReader(video_path, ctx=cpu(0))
    total_frames = len(vr)
    if total_frames == 0:
        raise ValueError(f"Video contains no frames: {video_path!r}")

    # Never request more frames than exist, otherwise the same frame would be
    # returned several times.
    count = min(num_frames, total_frames)
    if count == 1:
        indices = [0]
    else:
        # Exact integer spacing from frame 0 to the last frame; avoids the
        # float rounding of a linspace followed by truncation.
        last = total_frames - 1
        indices = [i * last // (count - 1) for i in range(count)]

    # Key point: convert every decoded frame explicitly to a PIL.Image.
    return [Image.fromarray(vr[i].asnumpy()) for i in indices]
# Lazily created inference pipeline, reused by every call in this process.
_PIPELINE = None


def _get_pipeline():
    """Build the image-text-to-text pipeline on first use and reuse it afterwards."""
    global _PIPELINE
    if _PIPELINE is None:
        # Constructing the pipeline loads the model weights, so do it once
        # instead of on every request.
        # NOTE(review): on ZeroGPU the documented pattern is to load the model
        # at import time — confirm whether this cache survives between calls
        # there.
        _PIPELINE = pipeline(
            "image-text-to-text",
            model=model_id,
            token=hf_token,
            torch_dtype=torch.bfloat16,
            model_kwargs={"device_map": "auto"},
        )
    return _PIPELINE


# Inference function: sample video frames, load the model, run inference.
def analyze_video(video_file):
    """Describe how well AR objects blend into the scene of a video.

    Args:
        video_file: Path string of the uploaded video. ``None`` or an empty
            string (nothing uploaded) is answered with a short hint instead
            of an error.

    Returns:
        The ``content`` of the last message in the ``generated_text`` of the
        first pipeline result, or a hint asking for a video when
        ``video_file`` is empty.
    """
    # The video input delivers None when the form is submitted without a file.
    if not video_file:
        return "Please upload a video before running the analysis."

    # video_file is a path string.
    frames = sample_video_frames(video_file, num_frames=NUM_FRAMES)

    # Build the prompt.
    system_prompt = (
        "You are a helpful AI assistant that analyzes AR effects in videos. "
        "Evaluate the realism and placement of virtual objects in the provided video frames."
    )
    user_prompt = "Based on the frames, describe how well the AR objects blend into the real environment."

    # Chat-style input: one text part followed by one image part per frame.
    history = [
        {
            "role": "system",
            "content": [{"type": "text", "text": system_prompt}],
        },
        {
            "role": "user",
            "content": [{"type": "text", "text": user_prompt}]
            + [{"type": "image", "image": frame} for frame in frames],
        },
    ]

    result = _get_pipeline()(text=history, max_new_tokens=512)
    return result[0]["generated_text"][-1]["content"]
# Gradio UI.
demo = gr.Interface(
    # ZeroGPU only attaches a GPU to functions wrapped with spaces.GPU, and a
    # ZeroGPU Space needs at least one such function at startup. Outside
    # ZeroGPU the wrapper is documented to have no effect.
    # NOTE(review): duration=120 is a guess at the time one request needs —
    # tune it to the real model load plus generation time.
    fn=spaces.GPU(duration=120)(analyze_video),
    inputs=gr.Video(label="Upload an AR Video (.mp4 only)"),
    outputs=gr.Textbox(label="Gemma Analysis Result"),
    title="Gemma-3-27B Video Analysis (ZeroGPU)",
    description="Uploads a video, extracts 8 frames, and uses Gemma-3-27B to analyze AR realism.",
)

# Start the web server only when run as a script, so importing this module
# does not launch the app.
if __name__ == "__main__":
    demo.launch()