import React, { useState, useRef, useEffect, useCallback } from "react"; import { useVLMContext } from "../context/useVLMContext"; import { extractJsonFromMarkdown, drawBoundingBoxesOnCanvas } from "./BoxAnnotator"; const MODES = ["Webcam", "URL", "File"] as const; type Mode = typeof MODES[number]; const EXAMPLE_VIDEO_URL = "/videos/1.mp4"; // Ensure this path is correct const EXAMPLE_PROMPT = "Detect all people in the image. For each person, output a JSON array of objects with fields: 'label' (string) and 'bbox_2d' ([x1, y1, x2, y2]) where coordinates are in pixel values. Example: [{\"label\": \"person\", \"bbox_2d\": [100, 50, 200, 300]}]"; // Helper functions (remain the same) function parseFlatBoxArray(arr: any[]): { label: string, bbox_2d: number[] }[] { if (typeof arr[0] === "string" && Array.isArray(arr[1])) { const label = arr[0]; return arr.slice(1).map(bbox => ({ label, bbox_2d: bbox })); } return []; } function normalizeBoxes(raw: any): { label: string, bbox_2d: number[] }[] { if (!raw) return []; let boxes = []; if (typeof raw === "object" && raw !== null && Array.isArray(raw.image)) { boxes = raw.image; } else if (Array.isArray(raw)) { boxes = raw; } else if (typeof raw === "object" && raw !== null) { boxes = [raw]; } return boxes .map((obj: any) => { if (!obj || !obj.bbox_2d) return null; let bbox = obj.bbox_2d; if ( Array.isArray(bbox) && bbox.length === 2 && Array.isArray(bbox[0]) && Array.isArray(bbox[1]) && bbox[0].length === 2 && bbox[1].length === 2 ) { bbox = [bbox[0][0], bbox[0][1], bbox[1][0], bbox[1][1]]; } if ( Array.isArray(bbox) && bbox.length === 4 && bbox.every((v: any) => typeof v === "number") ) { return { ...obj, bbox_2d: bbox }; } return null; }) .filter((obj: any) => obj); } function isImageFile(file: File) { return file.type.startsWith("image/"); } function isVideoFile(file: File) { return file.type.startsWith("video/"); } export default function MultiSourceCaptioningView() { const [mode, setMode] = useState("File"); const [currentUrlInput, setCurrentUrlInput] = useState(EXAMPLE_VIDEO_URL); const [prompt, setPrompt] = useState(EXAMPLE_PROMPT); const [processingState, setProcessingState] = useState(false); // General processing indicator const [error, setError] = useState(null); const [mediaStream, setMediaStream] = useState(null); // For webcam stream const [latestBoxes, setLatestBoxes] = useState([]); // State for boxes to draw const [inferenceStatus, setInferenceStatus] = useState(""); const [debugOutput, setDebugOutput] = useState(""); // Refs for the two video elements and the canvas const displayVideoRef = useRef(null); // The visible video const vlmVideoRef = useRef(null); // The hidden video for VLM processing const canvasRef = useRef(null); // The canvas overlay for drawing boxes const imageRef = useRef(null); // For image file processing const { isLoaded, isLoading, error: modelError, runInference } = useVLMContext(); // --- Drawing Loop for the Visible Display --- // This loop runs constantly to draw the latest boxes on the display video const drawDisplayCanvas = useCallback(() => { const displayVideo = displayVideoRef.current; const canvas = canvasRef.current; const ctx = canvas?.getContext('2d'); if (!displayVideo || !canvas || !ctx) { return; } // Adjust canvas size to match the display video's dimensions if (canvas.width !== displayVideo.videoWidth || canvas.height !== displayVideo.videoHeight) { canvas.width = displayVideo.videoWidth; canvas.height = displayVideo.videoHeight; } // Clear the canvas each frame ctx.clearRect(0, 0, canvas.width, canvas.height); // Draw the latest bounding boxes const scaleX = canvas.width / (displayVideo.videoWidth || 1); // Avoid division by zero const scaleY = canvas.height / (displayVideo.videoHeight || 1); drawBoundingBoxesOnCanvas(ctx, latestBoxes, { color: "#FF00FF", lineWidth: 4, font: "20px Arial", scaleX, scaleY }); // Only request next frame if video is playing to avoid unnecessary redraws when paused/ended if (!displayVideo.paused && !displayVideo.ended) { requestAnimationFrame(drawDisplayCanvas); } }, [latestBoxes]); // Re-create if latestBoxes changes // Effect to start the display drawing loop when the display video is ready useEffect(() => { const displayVideo = displayVideoRef.current; if (displayVideo) { const handleVideoReady = () => { // Start the requestAnimationFrame loop once the video has loaded metadata if (displayVideo.readyState >= 1) { // HAVE_METADATA requestAnimationFrame(drawDisplayCanvas); } }; displayVideo.addEventListener('loadedmetadata', handleVideoReady); // Also check if video is already ready (e.g., on component re-mount) if (displayVideo.readyState >= 1) { requestAnimationFrame(drawDisplayCanvas); } return () => { displayVideo.removeEventListener('loadedmetadata', handleVideoReady); }; } }, [drawDisplayCanvas]); // --- FastVLM Processing Loop (from hidden video/image) --- // This interval loop controls when FastVLM processes a frame useEffect(() => { const vlmVideo = vlmVideoRef.current; const isVideoMode = (mode === "Webcam" || (mode === "URL" && vlmVideo?.src) || (mode === "File" && vlmVideo?.src && isVideoFile(uploadedFile || null))); if (!isLoaded || !vlmVideo || !isVideoMode) { // If not in a video mode or VLM/video not ready, ensure processing stops setProcessingState(false); return; } let interval: ReturnType | null = null; const startVLMProcessing = () => { if (interval) clearInterval(interval); // Clear any old interval interval = setInterval(async () => { if (!vlmVideo || vlmVideo.paused || vlmVideo.ended || vlmVideo.videoWidth === 0 || processingState) { return; // Skip if video not ready, paused, ended, or already processing } setProcessingState(true); setInferenceStatus("Running inference..."); setError(null); // Clear previous errors try { // Create a temporary offscreen canvas to get image data from the VLM video const tempCanvas = document.createElement('canvas'); tempCanvas.width = vlmVideo.videoWidth; tempCanvas.height = vlmVideo.videoHeight; const tempCtx = tempCanvas.getContext('2d', { willReadFrequently: true }); if (tempCtx && vlmVideo.readyState >= 2) { // HAVE_CURRENT_DATA tempCtx.drawImage(vlmVideo, 0, 0, tempCanvas.width, tempCanvas.height); const imageData = tempCtx.getImageData(0, 0, tempCanvas.width, tempCanvas.height); const modelOutput = await runInference(imageData, prompt); // Pass ImageData setDebugOutput(modelOutput); // Update raw model output let boxes = extractJsonFromMarkdown(modelOutput) || []; if (boxes.length === 0 && Array.isArray(modelOutput)) { // Fallback for direct array output // This condition `Array.isArray(modelOutput)` is unlikely if modelOutput is string, // so ensure `extractJsonFromMarkdown` is robust or `runInference` returns expected string } boxes = normalizeBoxes(boxes); setLatestBoxes(boxes); // Update state, triggers display canvas redraw setInferenceStatus(boxes.length > 0 ? "Inference complete. Boxes detected." : "Inference complete. No boxes detected."); } else { setInferenceStatus("Video not ready for processing."); } } catch (e) { setError("Inference error: " + (e instanceof Error ? e.message : String(e))); setLatestBoxes([]); setInferenceStatus("Inference failed."); } finally { setProcessingState(false); // Processing finished } }, 200); // Inference interval (e.g., 5 frames per second) }; const stopVLMProcessing = () => { if (interval) clearInterval(interval); interval = null; setProcessingState(false); setInferenceStatus("Stopped processing."); }; // Start/stop processing based on video playback events vlmVideo.addEventListener('play', startVLMProcessing); vlmVideo.addEventListener('pause', stopVLMProcessing); vlmVideo.addEventListener('ended', stopVLMProcessing); // Initial check if video is already playing (e.g., after initial load/autoplay) if (vlmVideo.readyState >= 2 && !vlmVideo.paused && !vlmVideo.ended) { startVLMProcessing(); } // Cleanup function for useEffect return () => { stopVLMProcessing(); vlmVideo.removeEventListener('play', startVLMProcessing); vlmVideo.removeEventListener('pause', stopVLMProcessing); vlmVideo.removeEventListener('ended', stopVLMProcessing); }; }, [mode, isLoaded, prompt, runInference, processingState, uploadedFile]); // Added uploadedFile for file mode re-trigger // --- Media Source Handling --- // Cleanup for media stream and object URLs const cleanupMediaSource = useCallback(() => { if (mediaStream) { mediaStream.getTracks().forEach(track => track.stop()); setMediaStream(null); } // Revoke any created blob URLs (for file inputs) if (displayVideoRef.current?.src.startsWith('blob:')) { URL.revokeObjectURL(displayVideoRef.current.src); displayVideoRef.current.src = ""; } if (vlmVideoRef.current?.src.startsWith('blob:')) { URL.revokeObjectURL(vlmVideoRef.current.src); vlmVideoRef.current.src = ""; } setLatestBoxes([]); // Clear boxes when source changes setError(null); setInferenceStatus(""); setDebugOutput(""); }, [mediaStream]); // Handle changing the mode (Webcam, URL, File) useEffect(() => { cleanupMediaSource(); // Clean up previous source const displayVideo = displayVideoRef.current; const vlmVideo = vlmVideoRef.current; if (!displayVideo || !vlmVideo) return; // Reset srcObject/src to ensure fresh start displayVideo.srcObject = null; vlmVideo.srcObject = null; displayVideo.src = ""; vlmVideo.src = ""; setLatestBoxes([]); // Clear boxes on mode change setError(null); setInferenceStatus(""); setDebugOutput(""); // Special handling for initial file mode to load example video if (mode === "File" && !uploadedFile) { displayVideo.src = EXAMPLE_VIDEO_URL; vlmVideo.src = EXAMPLE_VIDEO_URL; displayVideo.load(); vlmVideo.load(); // Load the video displayVideo.play().catch(e => console.error("Error playing example display video:", e)); vlmVideo.play().catch(e => console.error("Error playing example VLM video:", e)); } }, [mode, uploadedFile, cleanupMediaSource]); // Added uploadedFile to ensure re-trigger for file mode // Handle Webcam Input const handleWebcamInput = useCallback(async () => { cleanupMediaSource(); // Clean up any active stream try { const stream = await navigator.mediaDevices.getUserMedia({ video: true }); setMediaStream(stream); // Store stream to manage it if (displayVideoRef.current && vlmVideoRef.current) { displayVideoRef.current.srcObject = stream; vlmVideoRef.current.srcObject = stream; // Programmatically play both videos displayVideoRef.current.play().catch(e => console.error("Error playing display video:", e)); vlmVideoRef.current.play().catch(e => console.error("Error playing VLM video:", e)); } setMode("Webcam"); } catch (e) { setError("Could not access webcam: " + (e instanceof Error ? e.message : String(e))); setMediaStream(null); setLatestBoxes([]); setInferenceStatus("Webcam access denied or failed."); } }, [cleanupMediaSource]); // Handle URL Input (when Load button is clicked) const handleLoadUrl = useCallback(() => { cleanupMediaSource(); // Clean up any active stream const url = currentUrlInput; if (!url) { setError("Please enter a valid URL."); return; } if (displayVideoRef.current && vlmVideoRef.current) { displayVideoRef.current.src = url; vlmVideoRef.current.src = url; displayVideoRef.current.load(); vlmVideoRef.current.load(); // Load the video displayVideoRef.current.play().catch(e => console.error("Error playing display video:", e)); vlmVideoRef.current.play().catch(e => console.error("Error playing VLM video:", e)); setMode("URL"); } }, [currentUrlInput, cleanupMediaSource]); // Handle File Input const handleFileChange = useCallback((e: React.ChangeEvent) => { cleanupMediaSource(); // Clean up any active stream const file = e.target.files?.[0] || null; if (file) { const fileUrl = URL.createObjectURL(file); // Create blob URL for the file // Store the file to differentiate image/video and manage its URL setUploadedFile(file); if (isImageFile(file)) { // For images, we handle processing on a button click, not a continuous loop // The imageRef will display the image // The canvas will be used for processing and drawing setError(null); setMode("File"); } else if (isVideoFile(file)) { if (displayVideoRef.current && vlmVideoRef.current) { displayVideoRef.current.src = fileUrl; vlmVideoRef.current.src = fileUrl; displayVideoRef.current.load(); vlmVideoRef.current.load(); displayVideoRef.current.play().catch(e => console.error("Error playing display video:", e)); vlmVideoRef.current.play().catch(e => console.error("Error playing VLM video:", e)); setMode("File"); } } else { setError("Unsupported file type. Please upload an image or video."); setUploadedFile(null); if (fileUrl) URL.revokeObjectURL(fileUrl); // Clean up invalid file URL } } else { setUploadedFile(null); // Clear file if nothing selected // If no file selected, revert to example video if in File mode if (mode === "File") { if (displayVideoRef.current && vlmVideoRef.current) { displayVideoRef.current.src = EXAMPLE_VIDEO_URL; vlmVideoRef.current.src = EXAMPLE_VIDEO_URL; displayVideoRef.current.load(); vlmVideoRef.current.load(); displayVideoRef.current.play().catch(e => console.error("Error playing example display video:", e)); vlmVideoRef.current.play().catch(e => console.error("Error playing example VLM video:", e)); } } } }, [cleanupMediaSource, mode]); // Handler for processing an uploaded image file (one-time inference) const handleProcessImage = async () => { if (!isLoaded || !uploadedFile || !isImageFile(uploadedFile) || !imageRef.current || !canvasRef.current) { setError("Image or model not ready for processing."); return; } const img = imageRef.current; const canvas = canvasRef.current; const ctx = canvas.getContext("2d"); if (!ctx) return; // Ensure canvas dimensions match image for processing and display canvas.width = img.naturalWidth; canvas.height = img.naturalHeight; setProcessingState(true); setError(null); setInferenceStatus("Running image inference..."); try { // Draw image to canvas to get ImageData for inference ctx.drawImage(img, 0, 0, canvas.width, canvas.height); const imageData = ctx.getImageData(0, 0, canvas.width, canvas.height); const modelOutput = await runInference(imageData, prompt); setDebugOutput(modelOutput); setInferenceStatus("Image inference complete."); // Clear canvas and redraw image before drawing boxes ctx.clearRect(0, 0, canvas.width, canvas.height); ctx.drawImage(img, 0, 0, canvas.width, canvas.height); let boxes = extractJsonFromMarkdown(modelOutput) || []; boxes = normalizeBoxes(boxes); setLatestBoxes(boxes); // Update latestBoxes for display if (boxes.length === 0) setInferenceStatus("Image inference complete. No boxes detected."); } catch (e) { setError("Image inference error: " + (e instanceof Error ? e.message : String(e))); setLatestBoxes([]); setInferenceStatus("Image inference failed."); } finally { setProcessingState(false); } }; // --- Rendered UI --- return (
{isLoading ? "Loading model..." : isLoaded ? "Model loaded" : modelError ? `Model error: ${modelError}` : "Model not loaded"}
{inferenceStatus}
{/* Adjusted top margin */}
{/* Added padding */} {/* Mode Selector */}
{/* Increased margin-top for selector */}
{MODES.map((m) => ( ))}
{/* Dynamic Content Area */}
{/* Prompt Input (Common to all modes) */}