import gradio as gr
import cv2
import numpy as np
import torch
from transformers import ViTForImageClassification, ViTImageProcessor
import mediapipe as mp

# Load pretrained model (Sign Language Recognition).
# Both from_pretrained() calls run at import time, so importing this module
# loads the checkpoint (fetching it first if it is not cached locally).
# NOTE(review): the checkpoint name mentions "xclip" while it is loaded with
# the ViT classification/processor classes -- confirm that the checkpoint
# architecture really matches ViTForImageClassification.
model_name = "sokaina55/xclip-base-patch32-finetuned-ssl-sign-language-recognition"
model = ViTForImageClassification.from_pretrained(model_name)
processor = ViTImageProcessor.from_pretrained(model_name)

# MediaPipe Hands setup.
# A single detector instance is created once and shared by every call to
# extract_hand_landmarks(). It is configured for still images, reports at
# most one hand, and uses a 0.5 minimum detection confidence.
mp_hands = mp.solutions.hands
# mp_drawing is not referenced anywhere else in the visible code.
mp_drawing = mp.solutions.drawing_utils
hands = mp_hands.Hands(static_image_mode=True, max_num_hands=1, min_detection_confidence=0.5)

# Get the model's label mappings.
# classify_asl_word() looks up the predicted class index in this mapping.
asl_words = model.config.id2label  # Mapping from class index to label text

def extract_hand_landmarks(image):
    """Return the flattened landmarks of the first detected hand, or None.

    The image is treated as BGR and converted to RGB before it is handed to
    the shared MediaPipe ``hands`` detector. Each landmark contributes its
    ``x``, ``y`` and ``z`` attributes, in that order, to a 1-D NumPy array.
    """
    detection = hands.process(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
    detected_hands = detection.multi_hand_landmarks

    # Guard clause: nothing was detected in this image.
    if not detected_hands:
        return None

    # Only the first hand reported by the detector is used.
    first_hand = next(iter(detected_hands), None)
    if first_hand is None:
        return None

    coordinates = [[point.x, point.y, point.z] for point in first_hand.landmark]
    return np.array(coordinates).flatten()

def classify_asl_word(image):
    """Predict the ASL word shown in a single frame.

    Args:
        image: Frame in OpenCV's BGR channel order (the same convention
            extract_hand_landmarks() assumes for its input).

    Returns:
        "No hand detected" when no hand landmarks are found in the frame;
        otherwise the label stored in ``asl_words`` for the highest-scoring
        class, or "Unknown sign" when that class index has no label.
    """
    # Hand detection is used purely as a gate; the landmarks themselves are
    # not fed to the classifier.
    if extract_hand_landmarks(image) is None:
        return "No hand detected"

    # The frame arrives in BGR order, while the image processor expects RGB.
    # Without this conversion the red and blue channels are swapped before
    # the model sees the image.
    image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    inputs = processor(images=image_rgb, return_tensors="pt")

    # Inference only: no gradients are needed.
    with torch.no_grad():
        outputs = model(**inputs)
    predicted_class_idx = outputs.logits.argmax(-1).item()

    return asl_words.get(predicted_class_idx, "Unknown sign")

def process_video(video_path):
    """Classify every frame of a video and list the distinct words found.

    Args:
        video_path: Path of the video file to read. A missing or empty value
            (for example when nothing was uploaded) is treated as a video
            with no detections.

    Returns:
        The distinct predicted words joined by ", " in order of first
        appearance, or "No ASL words detected" when no frame yields a word.
    """
    if not video_path:
        return "No ASL words detected"

    cap = cv2.VideoCapture(video_path)
    detected_words = []  # keeps first-appearance order for the output
    seen = set()         # constant-time duplicate check
    try:
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                # End of stream or a read failure.
                break
            word = classify_asl_word(frame)
            if word != "No hand detected" and word not in seen:
                seen.add(word)
                detected_words.append(word)
    finally:
        # Release the capture even if classification raises mid-video.
        cap.release()

    return ", ".join(detected_words) if detected_words else "No ASL words detected"

# Gradio Interface: the uploaded video is passed to process_video and the
# string it returns is shown in the output textbox.
iface = gr.Interface(
    fn=process_video,
    # `type` is not a gr.Video constructor argument; passing it raises a
    # TypeError on Gradio releases that reject unknown keywords. The
    # component's default is used instead; process_video treats the value it
    # receives as a file path.
    inputs=gr.Video(),
    outputs=gr.Textbox(label="Detected ASL Words"),
    title="ASL Sign Language to Text",
    description="Upload a video of ASL signs, and the model will translate them into text."
)

# Start the web UI only when the file is run as a script, not on import.
if __name__ == "__main__":
    iface.launch()