import gradio as gr
import torch
import cv2
import os
import numpy as np
from transformers import VideoMAEForVideoClassification, VideoMAEImageProcessor

# Load a lighter pretrained model.
# Note: "facebook/videomae-base" is not fine-tuned for classification, so the
# classification head is randomly initialized; swap in a fine-tuned checkpoint for real predictions.
model_name = "facebook/videomae-base"
model = VideoMAEForVideoClassification.from_pretrained(model_name)
processor = VideoMAEImageProcessor.from_pretrained(model_name)
model.eval()

# Reduce frames for faster processing
def preprocess_video(video_path, num_frames=16, frame_skip=5):
    """Sample every `frame_skip`-th frame, convert to RGB, resize, and pad/truncate to `num_frames`."""
    cap = cv2.VideoCapture(video_path)
    frames = []
    count = 0
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        if count % frame_skip == 0:
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)  # OpenCV decodes BGR; the processor expects RGB
            frame = cv2.resize(frame, (224, 224))  # Resize to match model input
            frames.append(frame)
        count += 1
    cap.release()

    # VideoMAE expects a fixed-length clip (16 frames): pad by repeating the last frame, or truncate
    if frames:
        while len(frames) < num_frames:
            frames.append(frames[-1])
        frames = frames[:num_frames]
    return frames

# Function to predict sign language words
def predict(video_path):
    frames = preprocess_video(video_path)
    if len(frames) == 0:
        return "No frames detected, try a different video."

    inputs = processor(images=frames, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
    logits = outputs.logits
    predicted_class_idx = logits.argmax(-1).item()

    # Mapping to common words (placeholder; replace with the label set of a sign-language fine-tuned model)
    labels = ["Hello", "Thanks", "Yes", "No", "Goodbye", "Please", "Sorry"]
    predicted_label = labels[predicted_class_idx % len(labels)]  # Placeholder mapping
    return predicted_label

iface = gr.Interface(
    fn=predict,
    inputs=gr.Video(),
    outputs=gr.Textbox(label="Predicted Sign"),
    title="Sign Language to Text Converter",
    description="Upload a video of a hand gesture and get the predicted word.",
)

if __name__ == "__main__":
    iface.launch()