import torch
import gradio as gr
import cv2
import numpy as np
from transformers import VideoMAEForVideoClassification, VideoMAEImageProcessor

# Use a checkpoint fine-tuned for classification; the plain "MCG-NJU/videomae-base"
# checkpoint has no trained classification head, so its predictions would be random.
model_name = "MCG-NJU/videomae-base-finetuned-kinetics"

# Load model and processor
model = VideoMAEForVideoClassification.from_pretrained(model_name)
processor = VideoMAEImageProcessor.from_pretrained(model_name)
model.eval()

# Extract a fixed number of RGB frames, evenly spaced across the video
def extract_frames(video_path, num_frames=16):
    cap = cv2.VideoCapture(video_path)
    frames = []
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    for i in np.linspace(0, max(total_frames - 1, 0), num_frames, dtype=int):
        cap.set(cv2.CAP_PROP_POS_FRAMES, int(i))
        ret, frame = cap.read()
        if not ret:
            break
        frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
    cap.release()
    if not frames:
        raise ValueError(f"Could not read any frames from {video_path}")
    # Pad by duplicating the last frame so exactly `num_frames` frames are returned
    while len(frames) < num_frames:
        frames.append(frames[-1])
    return frames

# Run the classifier on an uploaded video and return the predicted label
def process_video(video):
    frames = extract_frames(video)
    # The processor handles resizing and normalization to the model's expected input
    inputs = processor(frames, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
    predicted_class = outputs.logits.argmax(dim=-1).item()
    label = model.config.id2label.get(predicted_class, str(predicted_class))
    return f"Predicted class: {label}"

# Gradio UI
iface = gr.Interface(
    fn=process_video,
    inputs=gr.Video(label="Upload a video"),
    outputs=gr.Textbox(label="Prediction"),
)

# Launch app
if __name__ == "__main__":
    iface.launch(server_name="0.0.0.0", server_port=7860, share=True)
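
# A minimal sketch of sanity-checking the pipeline without the Gradio UI.
# "sample.mp4" below is a placeholder path, not part of the app; point it at
# any local video clip before trying this by hand:
#
#   frames = extract_frames("sample.mp4")           # list of 16 RGB frame arrays
#   inputs = processor(frames, return_tensors="pt")
#   print(inputs["pixel_values"].shape)             # expected: (1, 16, 3, 224, 224)
#   print(process_video("sample.mp4"))              # e.g. "Predicted class: ..."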