import * as React from "react";
import { useState, useRef, useEffect } from "react";
import { useVLMContext } from "../context/useVLMContext";
import { drawBoundingBoxesOnCanvas } from "./BoxAnnotator";

const MODES = ["File"] as const;
type Mode = typeof MODES[number];

const EXAMPLE_VIDEO_URL = "https://huggingface.co/Quazim0t0/yolov8-onnx/resolve/main/sample.mp4";
const EXAMPLE_PROMPT = "Describe the video";

function isImageFile(file: File) {
  return file.type.startsWith("image/");
}

function isVideoFile(file: File) {
  return file.type.startsWith("video/");
}

function denormalizeBox(box: number[], width: number, height: number) {
  // If all values are between 0 and 1, treat the box as normalized coordinates
  if (box.length === 4 && box.every(v => v >= 0 && v <= 1)) {
    return [
      box[0] * width,
      box[1] * height,
      box[2] * width,
      box[3] * height
    ];
  }
  return box;
}
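// Example (illustrative only): boxes in the [0, 1] range are scaled to pixels,
// anything else is passed through untouched.
//   denormalizeBox([0.25, 0.5, 0.75, 1], 640, 480) -> [160, 240, 480, 480]
//   denormalizeBox([120, 40, 300, 200], 640, 480)  -> [120, 40, 300, 200]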
// Robust fallback parser: accepts either a JSON array of { label, bbox_2d } objects
// or free-form text containing [x1, y1, x2, y2] arrays.
function extractAllBoundingBoxes(output: string): { label: string, bbox_2d: number[] }[] {
  // Try to parse the output as JSON first
  try {
    const parsed = JSON.parse(output);
    if (Array.isArray(parsed)) {
      const result: { label: string, bbox_2d: number[] }[] = [];
      for (const obj of parsed) {
        if (obj && obj.label && Array.isArray(obj.bbox_2d)) {
          if (Array.isArray(obj.bbox_2d[0])) {
            // bbox_2d holds several boxes for this label
            for (const arr of obj.bbox_2d) {
              if (Array.isArray(arr) && arr.length === 4) {
                result.push({ label: obj.label, bbox_2d: arr });
              }
            }
          } else if (obj.bbox_2d.length === 4) {
            result.push({ label: obj.label, bbox_2d: obj.bbox_2d });
          }
        }
      }
      if (result.length > 0) return result;
    }
  } catch {
    // Not valid JSON; fall through to the regex fallback
  }
  // Fallback: extract every [x1, y1, x2, y2] array found in the string
  const boxRegex = /\[\s*([0-9.]+)\s*,\s*([0-9.]+)\s*,\s*([0-9.]+)\s*,\s*([0-9.]+)\s*\]/g;
  const boxes: { label: string, bbox_2d: number[] }[] = [];
  let match;
  while ((match = boxRegex.exec(output)) !== null) {
    const arr = [parseFloat(match[1]), parseFloat(match[2]), parseFloat(match[3]), parseFloat(match[4])];
    boxes.push({ label: '', bbox_2d: arr });
  }
  return boxes;
}
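// Example (illustrative only): both input styles yield the same result shape.
//   extractAllBoundingBoxes('[{"label":"person","bbox_2d":[10,20,110,220]}]')
//     -> [{ label: "person", bbox_2d: [10, 20, 110, 220] }]
//   extractAllBoundingBoxes("I can see a dog at [0.1, 0.2, 0.4, 0.9].")
//     -> [{ label: "", bbox_2d: [0.1, 0.2, 0.4, 0.9] }]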
// NOTE: onnxruntime-web must be installed:
//   npm install onnxruntime-web
// @ts-ignore
import * as ort from 'onnxruntime-web';
// If you still get type errors, add a global.d.ts with: declare module 'onnxruntime-web';

// YOLOv8 ONNX model URL (swap in your own exported model if needed):
const YOLOV8_ONNX_URL = "https://huggingface.co/Quazim0t0/yolov8-onnx/resolve/main/yolov8n.onnx";

// Input size (width x height) the YOLOv8 model expects; also used to scale boxes back to the canvas
const YOLOV8_INPUT_WIDTH = 640;
const YOLOV8_INPUT_HEIGHT = 480;

// Load the ONNX model once and reuse the cached session
let yoloSession: ort.InferenceSession | null = null;
// Busy flag to prevent concurrent YOLOv8 inferences
let isYoloBusy = false;

async function loadYoloModel() {
  if (!yoloSession) {
    yoloSession = await ort.InferenceSession.create(YOLOV8_ONNX_URL);
  }
  return yoloSession;
}
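// Depending on the bundler, onnxruntime-web may also need to be told where its .wasm
// binaries are served from before the first InferenceSession.create call. A minimal
// sketch (the CDN path below is deployment-specific and only illustrative):
//   ort.env.wasm.wasmPaths = "https://cdn.jsdelivr.net/npm/onnxruntime-web/dist/";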
// COCO class names for YOLOv8
const YOLO_CLASSES: string[] = [
  "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light",
  "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow",
  "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee",
  "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard", "tennis racket", "bottle",
  "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple", "sandwich", "orange",
  "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch", "potted plant", "bed",
  "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone", "microwave", "oven",
  "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear", "hair drier", "toothbrush"
];
// Preprocess a video frame into a YOLOv8 input tensor [1, 3, 480, 640] (NCHW, RGB, normalized to [0, 1])
function preprocessFrameToTensor(video: HTMLVideoElement): ort.Tensor {
  const width = YOLOV8_INPUT_WIDTH;
  const height = YOLOV8_INPUT_HEIGHT;
  const canvas = document.createElement('canvas');
  canvas.width = width;
  canvas.height = height;
  const ctx = canvas.getContext('2d');
  if (!ctx) throw new Error('Could not get 2D context');
  ctx.drawImage(video, 0, 0, width, height);
  const imageData = ctx.getImageData(0, 0, width, height);
  const { data } = imageData;
  // Convert RGBA pixel data to a planar Float32Array [1, 3, height, width], normalized to [0, 1]
  const floatData = new Float32Array(1 * 3 * height * width);
  for (let i = 0; i < width * height; i++) {
    floatData[i] = data[i * 4] / 255;                          // R
    floatData[i + width * height] = data[i * 4 + 1] / 255;     // G
    floatData[i + 2 * width * height] = data[i * 4 + 2] / 255; // B
  }
  return new ort.Tensor('float32', floatData, [1, 3, height, width]);
}
// Convert raw YOLOv8 output into labeled, scored boxes
function postprocessYoloOutput(output: ort.Tensor) {
  // Assumes output.dims = [1, num_detections, 6] with rows of [x1, y1, x2, y2, score, classId]
  const data = output.data as Float32Array;
  const numDetections = output.dims[1];
  const results = [];
  for (let i = 0; i < numDetections; i++) {
    const offset = i * 6;
    const x1 = data[offset];
    const y1 = data[offset + 1];
    const x2 = data[offset + 2];
    const y2 = data[offset + 3];
    const score = data[offset + 4];
    const classId = Math.round(data[offset + 5]);
    if (score < 0.2) continue; // confidence threshold; adjust as needed
    results.push({
      bbox: [x1, y1, x2, y2],
      label: YOLO_CLASSES[classId] || `class_${classId}`,
      score
    });
  }
  return results;
}
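// Note: this parser assumes an ONNX export whose output already includes NMS and is laid
// out as [1, num_detections, 6]. Stock Ultralytics YOLOv8 exports without NMS emit
// [1, 84, 8400] instead and would need a different decode step; verify the output shape
// of the model behind YOLOV8_ONNX_URL before relying on this.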
// Helper type guard for annotation
function hasAnnotation(obj: any): obj is { annotation: string } {
  return typeof obj === 'object' && obj !== null && 'annotation' in obj && typeof obj.annotation === 'string';
}
export default function MultiSourceCaptioningView() {
  const [mode, setMode] = useState<Mode>("File");
  const [videoUrl] = useState<string>(EXAMPLE_VIDEO_URL);
  const [prompt, setPrompt] = useState<string>(EXAMPLE_PROMPT);
  const [processing, setProcessing] = useState(false);
  const [error, setError] = useState<string | null>(null);
  const [uploadedFile, setUploadedFile] = useState<File | null>(null);
  const [uploadedUrl, setUploadedUrl] = useState<string>("");
  const [videoProcessing, setVideoProcessing] = useState(false);
  const [imageProcessed, setImageProcessed] = useState(false);
  const [exampleProcessing, setExampleProcessing] = useState(false);
  const [debugOutput, setDebugOutput] = useState<string>("");
  const [canvasDims, setCanvasDims] = useState<{w:number,h:number}|null>(null);
  const [videoDims, setVideoDims] = useState<{w:number,h:number}|null>(null);
  const [inferenceStatus, setInferenceStatus] = useState<string>("");
  const [showProcessingVideo, setShowProcessingVideo] = useState(false);

  const videoRef = useRef<HTMLVideoElement | null>(null);
  const overlayVideoRef = useRef<HTMLVideoElement | null>(null);
  const processingVideoRef = useRef<HTMLVideoElement | null>(null);
  const canvasRef = useRef<HTMLCanvasElement | null>(null);
  const imageRef = useRef<HTMLImageElement | null>(null);
  const boxHistoryRef = useRef<any[]>([]);
  // Latest YOLOv8 results (each entry may optionally carry a FastVLM annotation)
  const lastYoloBoxesRef = useRef<any[]>([]);

  const { isLoaded, isLoading, error: modelError, runInference } = useVLMContext();

  // Single flag that gates the frame-processing loop (true = run, false = stop)
  const processingLoopRef = useRef(false);
  // Kick off the self-scheduling YOLOv8 detection loop. yoloDetectionLoop re-schedules
  // itself via requestAnimationFrame until processingLoopRef is cleared, so it only
  // needs to be started once per toggle; re-invoking it here would multiply the loops.
  const processVideoLoop = () => {
    if (!processingLoopRef.current) return;
    yoloDetectionLoop();
  };
  const processExampleLoop = () => {
    if (!processingLoopRef.current) return;
    yoloDetectionLoop();
  };
  // Optional: a remote YOLOv8 detection endpoint could be used instead of the in-browser ONNX model
  // const YOLOV8_API_URL = "https://YOUR_YOLOV8_BACKEND_URL_HERE/detect";

  // Keep the overlay video in sync with the main video element (play/pause and playback position)
  useEffect(() => {
    const main = videoRef.current;
    const overlay = overlayVideoRef.current;
    if (!main || !overlay) return;
    // Sync play/pause
    const onPlay = () => { if (overlay.paused) overlay.play(); };
    const onPause = () => { if (!overlay.paused) overlay.pause(); };
    // Sync seeking and time
    const onSeekOrTime = () => {
      if (Math.abs(main.currentTime - overlay.currentTime) > 0.05) {
        overlay.currentTime = main.currentTime;
      }
    };
    main.addEventListener('play', onPlay);
    main.addEventListener('pause', onPause);
    main.addEventListener('seeked', onSeekOrTime);
    main.addEventListener('timeupdate', onSeekOrTime);
    // Clean up
    return () => {
      main.removeEventListener('play', onPlay);
      main.removeEventListener('pause', onPause);
      main.removeEventListener('seeked', onSeekOrTime);
      main.removeEventListener('timeupdate', onSeekOrTime);
    };
  }, [videoRef, overlayVideoRef, uploadedUrl, videoUrl, mode]);

  // Make sure the hidden processing video starts playing in File mode
  useEffect(() => {
    if ((mode === "File") && processingVideoRef.current) {
      processingVideoRef.current.play().catch(() => {});
    }
  }, [mode, videoUrl, uploadedUrl]);
  const handleFileChange = (e: React.ChangeEvent<HTMLInputElement>) => {
    const file = e.target.files?.[0] || null;
    setUploadedFile(file);
    setUploadedUrl(file ? URL.createObjectURL(file) : "");
    setError(null);
    setImageProcessed(false);
    setVideoProcessing(false);
    setExampleProcessing(false);
  };

  // Uploaded video: start the processing loop once the model is loaded and processing is enabled
  useEffect(() => {
    if (mode !== "File" || !isLoaded || !uploadedFile || !isVideoFile(uploadedFile) || !videoProcessing) return;
    processVideoLoop();
  }, [mode, isLoaded, prompt, runInference, uploadedFile, videoProcessing]);

  // Example video: start the processing loop when no file is uploaded and processing is enabled
  useEffect(() => {
    if (mode !== "File" || uploadedFile || !isLoaded || !exampleProcessing) return;
    processExampleLoop();
  }, [mode, isLoaded, prompt, runInference, uploadedFile, exampleProcessing]);
  // File mode: process an uploaded image (runs only on button click)
  const handleProcessImage = async () => {
    if (!isLoaded || !uploadedFile || !isImageFile(uploadedFile) || !imageRef.current || !canvasRef.current) return;
    const img = imageRef.current;
    const canvas = canvasRef.current;
    canvas.width = img.naturalWidth;
    canvas.height = img.naturalHeight;
    setCanvasDims({ w: canvas.width, h: canvas.height });
    setVideoDims({ w: img.naturalWidth, h: img.naturalHeight });
    const ctx = canvas.getContext("2d");
    if (!ctx) return;
    ctx.drawImage(img, 0, 0, canvas.width, canvas.height);
    setProcessing(true);
    setError(null);
    setInferenceStatus("Running inference...");
    await runInference(img, prompt, (output: string) => {
      setDebugOutput(output);
      setInferenceStatus("Inference complete.");
      ctx.drawImage(img, 0, 0, canvas.width, canvas.height);
      const boxes = extractAllBoundingBoxes(output);
      console.log("Model output:", output);
      console.log("Parsed boxes:", boxes);
      console.log("Canvas size:", canvas.width, canvas.height);
      if (boxes.length > 0) {
        const [x1, y1, x2, y2] = boxes[0].bbox_2d;
        console.log("First box coords:", x1, y1, x2, y2);
      }
      if (boxes.length === 0) setInferenceStatus("No boxes detected or model output invalid.");
      if (Array.isArray(boxes) && boxes.length > 0) {
        const scaleX = canvas.width / img.naturalWidth;
        const scaleY = canvas.height / img.naturalHeight;
        drawBoundingBoxesOnCanvas(ctx, boxes, { scaleX, scaleY });
      }
      setImageProcessed(true);
    });
    setProcessing(false);
  };
  // File mode: process uploaded video frames (start/stop)
  const handleToggleVideoProcessing = () => {
    setVideoProcessing((prev: boolean) => {
      const next = !prev;
      // Always stop all loops before starting
      processingLoopRef.current = false;
      setTimeout(() => {
        if (next) {
          processingLoopRef.current = true;
          processVideoLoop();
        }
      }, 50);
      return next;
    });
  };

  // Handle start/stop for example video processing
  const handleToggleExampleProcessing = () => {
    setExampleProcessing((prev: boolean) => {
      const next = !prev;
      // Always stop all loops before starting
      processingLoopRef.current = false;
      setTimeout(() => {
        if (next) {
          processingLoopRef.current = true;
          processVideoLoop();
        }
      }, 50);
      return next;
    });
  };

  // Draw a fixed test box to verify the canvas overlay is visible and aligned
  const handleTestDrawBox = () => {
    if (!canvasRef.current) return;
    const canvas = canvasRef.current;
    const ctx = canvas.getContext("2d");
    if (!ctx) return;
    ctx.clearRect(0, 0, canvas.width, canvas.height);
    ctx.strokeStyle = "#FF00FF";
    ctx.lineWidth = 4;
    ctx.strokeRect(40, 40, Math.max(40, canvas.width / 4), Math.max(40, canvas.height / 4));
    ctx.font = "20px Arial";
    ctx.fillStyle = "#FF00FF";
    ctx.fillText("Test Box", 50, 35);
  };
  // Periodically redraw recent boxes from the box history onto the overlay canvas
  useEffect(() => {
    const draw = () => {
      const overlayVideo = overlayVideoRef.current;
      const canvas = canvasRef.current;
      if (!overlayVideo || !canvas) return;
      const displayWidth = overlayVideo.clientWidth;
      const displayHeight = overlayVideo.clientHeight;
      canvas.width = displayWidth;
      canvas.height = displayHeight;
      const ctx = canvas.getContext("2d");
      if (!ctx) return;
      ctx.clearRect(0, 0, canvas.width, canvas.height);
      const now = Date.now();
      const boxHistory = boxHistoryRef.current.filter((b: any) => now - b.timestamp < 2000);
      if (boxHistory.length > 0) {
        // Draw all boxes, even when bbox_2d is an array of arrays
        const denormalizedBoxes: any[] = [];
        for (const b of boxHistory) {
          if (Array.isArray(b.bbox_2d) && Array.isArray(b.bbox_2d[0])) {
            // Multiple boxes per label
            for (const arr of b.bbox_2d) {
              if (Array.isArray(arr) && arr.length === 4) {
                denormalizedBoxes.push({
                  ...b,
                  bbox_2d: denormalizeBox(arr, displayWidth, displayHeight)
                });
              }
            }
          } else if (Array.isArray(b.bbox_2d) && b.bbox_2d.length === 4) {
            // Single box
            denormalizedBoxes.push({
              ...b,
              bbox_2d: denormalizeBox(b.bbox_2d, displayWidth, displayHeight)
            });
          }
        }
        drawBoundingBoxesOnCanvas(ctx, denormalizedBoxes, { color: "#FF00FF", lineWidth: 4, font: "20px Arial", scaleX: 1, scaleY: 1 });
      }
    };
    draw();
    const interval = setInterval(draw, 100);
    // Redraw on window resize
    const handleResize = () => draw();
    window.addEventListener('resize', handleResize);
    return () => {
      clearInterval(interval);
      window.removeEventListener('resize', handleResize);
    };
  }, [overlayVideoRef, canvasRef]);
  // Drawing loop: draws the latest YOLOv8 boxes every frame
  useEffect(() => {
    let running = true;
    function drawLoop() {
      if (!running) return;
      const overlayVideo = overlayVideoRef.current;
      const canvas = canvasRef.current;
      const processingVideo = processingVideoRef.current;
      if (canvas && overlayVideo && processingVideo) {
        // Set canvas size to match the visible video
        canvas.width = overlayVideo.clientWidth;
        canvas.height = overlayVideo.clientHeight;
        const ctx = canvas.getContext('2d');
        if (ctx) {
          ctx.clearRect(0, 0, canvas.width, canvas.height);
          // Draw all YOLOv8 boxes from the last detection
          const yoloBoxes = lastYoloBoxesRef.current;
          yoloBoxes.forEach((obj: any) => {
            // Scale from YOLOv8 input size to canvas size
            const scaleX = canvas.width / YOLOV8_INPUT_WIDTH;
            const scaleY = canvas.height / YOLOV8_INPUT_HEIGHT;
            const [x1, y1, x2, y2] = obj.bbox;
            const drawX = x1 * scaleX;
            const drawY = y1 * scaleY;
            const drawW = (x2 - x1) * scaleX;
            const drawH = (y2 - y1) * scaleY;
            ctx.strokeStyle = '#00FFFF';
            ctx.lineWidth = 5;
            ctx.strokeRect(drawX, drawY, drawW, drawH);
            ctx.font = 'bold 22px Arial';
            // Draw the YOLOv8 label and confidence
            const yoloLabel = obj.label || '';
            const yoloScore = obj.score !== undefined ? ` ${(obj.score * 100).toFixed(1)}%` : '';
            const yoloText = `${yoloLabel}${yoloScore}`;
            ctx.fillStyle = 'rgba(0,0,0,0.7)';
            const yoloTextWidth = ctx.measureText(yoloText).width + 8;
            ctx.fillRect(drawX - 4, drawY - 24, yoloTextWidth, 26);
            ctx.fillStyle = '#00FFFF';
            ctx.fillText(yoloText, drawX, drawY - 4);
            // Draw the FastVLM annotation below the box if available
            if (hasAnnotation(obj)) {
              ctx.font = 'bold 18px Arial';
              ctx.fillStyle = 'rgba(0,0,0,0.7)';
              const annTextWidth = ctx.measureText(obj.annotation).width + 8;
              ctx.fillRect(drawX - 4, drawY + drawH + 4, annTextWidth, 24);
              ctx.fillStyle = '#00FFFF';
              ctx.fillText(obj.annotation, drawX, drawY + drawH + 22);
            }
          });
        }
      }
      requestAnimationFrame(drawLoop);
    }
    drawLoop();
    return () => { running = false; };
  }, [overlayVideoRef, canvasRef, processingVideoRef]);
  // YOLOv8 detection loop: runs as fast as possible, updates lastYoloBoxesRef, and then
  // runs FastVLM on the full frame to produce the debug caption
  const yoloDetectionLoop = async () => {
    if (!processingLoopRef.current) return;
    if (isYoloBusy) {
      requestAnimationFrame(yoloDetectionLoop);
      return;
    }
    isYoloBusy = true;
    try {
      const processingVideo = processingVideoRef.current;
      if (!processingVideo || processingVideo.paused || processingVideo.ended || processingVideo.videoWidth === 0) {
        // The finally block below resets the busy flag and schedules the next frame
        return;
      }
      // Run YOLOv8 detection on the current frame
      const session = await loadYoloModel();
      const inputTensor = preprocessFrameToTensor(processingVideo);
      const feeds: Record<string, ort.Tensor> = {};
      feeds[session.inputNames[0]] = inputTensor;
      const results = await session.run(feeds);
      const output = results[session.outputNames[0]];
      const detections = postprocessYoloOutput(output);
      lastYoloBoxesRef.current = detections;
      // Run FastVLM on the full frame (after YOLOv8 has finished)
      await runInference(processingVideo, prompt, (output: string) => {
        setDebugOutput(output);
      });
    } catch (err) {
      console.error('YOLOv8+FastVLM error:', err);
    } finally {
      isYoloBusy = false;
      requestAnimationFrame(yoloDetectionLoop);
    }
  };
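  // Note: the drawing loop above renders obj.annotation when present, but nothing in this
  // loop currently attaches one; per-box FastVLM captions would require cropping each
  // detection and running runInference on the crop (not implemented here).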
  // Restart the processing loop whenever the video source or a processing toggle changes
  useEffect(() => {
    // Stop any running loop first
    processingLoopRef.current = false;
    // Start the loop for the appropriate video once refs have updated
    setTimeout(() => {
      if (videoProcessing && uploadedFile && isVideoFile(uploadedFile)) {
        processingLoopRef.current = true;
        yoloDetectionLoop();
      } else if (exampleProcessing && !uploadedFile) {
        processingLoopRef.current = true;
        yoloDetectionLoop();
      }
    }, 100);
    // eslint-disable-next-line
  }, [uploadedFile, videoProcessing, exampleProcessing]);
  return (
    <div className="absolute inset-0 text-white">
      <div className="fixed top-0 left-0 w-full bg-gray-900 text-white text-center py-2 z-50">
        {isLoading ? "Loading model..." : isLoaded ? "Model loaded" : modelError ? `Model error: ${modelError}` : "Model not loaded"}
      </div>
      <div className="text-center text-sm text-blue-300 mt-2">{inferenceStatus}</div>
      <div className="flex flex-col items-center justify-center h-full w-full">
        {/* Mode Selector */}
        <div className="mb-6">
          <div className="flex space-x-4">
            {MODES.map((m) => (
              <button
                key={m}
                className={`px-6 py-2 rounded-lg font-semibold transition-all duration-200 ${
                  mode === m ? "bg-blue-600 text-white" : "bg-gray-700 text-gray-300 hover:bg-blue-500"
                }`}
                onClick={() => setMode(m)}
              >
                {m}
              </button>
            ))}
          </div>
        </div>
        {/* Mode Content */}
        <div className="w-full max-w-2xl flex-1 flex flex-col items-center justify-center">
          {mode === "File" && (
            <div className="w-full text-center flex flex-col items-center">
              <div className="mb-4 w-full max-w-xl">
                <label className="block text-left mb-2 font-medium">Detection Prompt:</label>
                <textarea
                  className="w-full p-2 rounded-lg text-black"
                  rows={3}
                  value={prompt}
                  onChange={(e) => setPrompt(e.target.value)}
                />
              </div>
              <div className="mb-4 w-full max-w-xl">
                <input
                  type="file"
                  accept="image/*,video/*"
                  onChange={handleFileChange}
                  className="block w-full text-sm text-gray-300 file:mr-4 file:py-2 file:px-4 file:rounded-lg file:border-0 file:text-sm file:font-semibold file:bg-blue-600 file:text-white hover:file:bg-blue-700"
                />
              </div>
              {/* Toggle button above the video area */}
              <div className="mb-2 w-full max-w-xl flex justify-end">
                <button
                  className={`px-4 py-1 rounded bg-gray-700 text-white text-xs font-semibold ${showProcessingVideo ? 'bg-blue-600' : ''}`}
                  onClick={() => setShowProcessingVideo(v => !v)}
                  type="button"
                >
                  {showProcessingVideo ? 'Hide' : 'Show'} Processed Video
                </button>
              </div>
              {/* Show uploaded image */}
              {uploadedFile && isImageFile(uploadedFile) && (
                <div className="relative w-full max-w-xl">
                  <img
                    ref={imageRef}
                    src={uploadedUrl}
                    alt="Uploaded"
                    className="w-full rounded-lg shadow-lg mb-2"
                    style={{ background: "#222" }}
                  />
                  <canvas
                    ref={canvasRef}
                    className="absolute top-0 left-0 w-full h-full pointer-events-none"
                    style={{ zIndex: 10, pointerEvents: "none" }}
                  />
                  <button
                    className="mt-4 px-6 py-2 rounded-lg bg-blue-600 text-white font-semibold"
                    onClick={handleProcessImage}
                    disabled={processing}
                  >
                    {processing ? "Processing..." : imageProcessed ? "Reprocess Image" : "Process Image"}
                  </button>
                </div>
              )}
              {/* Show uploaded video */}
              {uploadedFile && isVideoFile(uploadedFile) && (
                <div className="relative w-full max-w-xl" style={{ position: 'relative' }}>
                  {/* Visible overlay video for user */}
                  <video
                    ref={overlayVideoRef}
                    src={uploadedUrl}
                    controls
                    autoPlay
                    loop
                    muted
                    playsInline
                    className="w-full rounded-lg shadow-lg mb-2"
                    style={{ background: "#222", display: "block" }}
                    crossOrigin="anonymous"
                    onLoadedMetadata={(e: React.SyntheticEvent<HTMLVideoElement, Event>) => {
                      if (canvasRef.current) {
                        canvasRef.current.width = e.currentTarget.clientWidth;
                        canvasRef.current.height = e.currentTarget.clientHeight;
                      }
                    }}
                    onResize={() => {
                      if (canvasRef.current && overlayVideoRef.current) {
                        canvasRef.current.width = overlayVideoRef.current.clientWidth;
                        canvasRef.current.height = overlayVideoRef.current.clientHeight;
                      }
                    }}
                  />
                  {/* Canvas overlay */}
                  <canvas
                    ref={canvasRef}
                    style={{
                      position: "absolute",
                      top: 0,
                      left: 0,
                      width: "100%",
                      height: "100%",
                      zIndex: 100,
                      pointerEvents: "none",
                      display: "block"
                    }}
                    width={overlayVideoRef.current?.clientWidth || 640}
                    height={overlayVideoRef.current?.clientHeight || 480}
                  />
                  {/* Hidden or visible processing video for FastVLM/canvas */}
                  <video
                    ref={processingVideoRef}
                    src={uploadedUrl}
                    autoPlay
                    loop
                    muted
                    playsInline
                    crossOrigin="anonymous"
                    style={{ display: showProcessingVideo ? "block" : "none", width: "100%", marginTop: 8, borderRadius: 8, boxShadow: '0 2px 8px #0004' }}
                    onLoadedData={e => { e.currentTarget.play().catch(() => {}); }}
                  />
                  <button
                    className="mt-4 px-6 py-2 rounded-lg bg-blue-600 text-white font-semibold"
                    onClick={handleToggleVideoProcessing}
                  >
                    {videoProcessing ? "Stop Processing" : "Start Processing"}
                  </button>
                </div>
              )}
              {/* Show example video if no file uploaded */}
              {!uploadedFile && (
                <div className="relative w-full max-w-xl" style={{ position: 'relative' }}>
                  {/* Visible overlay video for user */}
                  <video
                    ref={overlayVideoRef}
                    src={EXAMPLE_VIDEO_URL}
                    controls
                    autoPlay
                    loop
                    muted
                    playsInline
                    className="w-full rounded-lg shadow-lg mb-2"
                    style={{ background: "#222", display: "block" }}
                    crossOrigin="anonymous"
                  />
                  {/* Canvas overlay */}
                  <canvas
                    ref={canvasRef}
                    style={{
                      position: "absolute",
                      top: 0,
                      left: 0,
                      width: "100%",
                      height: "100%",
                      zIndex: 100,
                      pointerEvents: "none",
                      display: "block"
                    }}
                    width={overlayVideoRef.current?.clientWidth || 640}
                    height={overlayVideoRef.current?.clientHeight || 480}
                  />
                  {/* Hidden or visible processing video for FastVLM/canvas */}
                  <video
                    ref={processingVideoRef}
                    src={EXAMPLE_VIDEO_URL}
                    autoPlay
                    loop
                    muted
                    playsInline
                    crossOrigin="anonymous"
                    style={{ display: showProcessingVideo ? "block" : "none", width: "100%", marginTop: 8, borderRadius: 8, boxShadow: '0 2px 8px #0004' }}
                    onLoadedData={e => { e.currentTarget.play().catch(() => {}); }}
                  />
                  <button
                    className="mt-4 px-6 py-2 rounded-lg bg-blue-600 text-white font-semibold"
                    onClick={handleToggleExampleProcessing}
                  >
                    {exampleProcessing ? "Stop Processing" : "Start Processing"}
                  </button>
                </div>
              )}
              {processing && <div className="text-blue-400 mt-2">Processing frame...</div>}
              {error && <div className="text-red-400 mt-2">Error: {error}</div>}
              <button
                className="mt-4 px-6 py-2 rounded-lg bg-gray-600 text-white font-semibold"
                onClick={handleTestDrawBox}
              >
                Test Draw Box
              </button>
              <div className="mt-2 p-2 bg-gray-800 rounded text-xs">
                <div>Canvas: {canvasDims ? `${canvasDims.w}x${canvasDims.h}` : "-"} | Video: {videoDims ? `${videoDims.w}x${videoDims.h}` : "-"}</div>
                <div>Raw Model Output:</div>
                <pre className="overflow-x-auto max-h-32 whitespace-pre-wrap">{debugOutput}</pre>
              </div>
            </div>
          )}
        </div>
      </div>
    </div>
  );
}