import gradio as gr
import cv2
import numpy as np
import torch
from transformers import ViTForImageClassification, ViTImageProcessor
import mediapipe as mp
# Load pretrained model (Sign Language Recognition)
model_name = "sokaina55/xclip-base-patch32-finetuned-ssl-sign-language-recognition"
model = ViTForImageClassification.from_pretrained(model_name)
processor = ViTImageProcessor.from_pretrained(model_name)
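# NOTE (assumption from the checkpoint name alone): "xclip-base-patch32-..."
# suggests an X-CLIP *video* model, which ViTForImageClassification may not
# load correctly. If loading fails, substitute a ViT image-classification
# checkpoint fine-tuned on sign-language frames.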
# MediaPipe Hands setup
mp_hands = mp.solutions.hands
mp_drawing = mp.solutions.drawing_utils
hands = mp_hands.Hands(static_image_mode=True, max_num_hands=1, min_detection_confidence=0.5)
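# static_image_mode=True treats each video frame independently (no temporal
# hand tracking), which matches the per-frame classification below;
# max_num_hands=1 assumes single-handed signs. mp_drawing is unused here but
# kept for optional landmark visualization.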
# Label mapping from the model config: class index -> ASL word.
asl_words = model.config.id2label
def extract_hand_landmarks(image):
    """Extracts hand landmarks from a frame, or returns None if no hand is found."""
    # MediaPipe expects RGB input, while OpenCV frames are BGR.
    image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    results = hands.process(image_rgb)
    if results.multi_hand_landmarks:
        for hand_landmarks in results.multi_hand_landmarks:
            # 21 landmarks x (x, y, z), flattened into a (63,) array.
            landmarks = np.array([[lm.x, lm.y, lm.z] for lm in hand_landmarks.landmark]).flatten()
            return landmarks
    return None
def classify_asl_word(image):
    """Predicts the ASL word shown in a single frame."""
    # MediaPipe is used only as a gate here: skip frames with no visible hand.
    landmarks = extract_hand_landmarks(image)
    if landmarks is None:
        return "No hand detected"
    # The image processor expects RGB; OpenCV frames arrive as BGR.
    image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    inputs = processor(images=image_rgb, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
    predicted_class_idx = outputs.logits.argmax(-1).item()
    return asl_words.get(predicted_class_idx, "Unknown sign")
def process_video(video_path):
    """Processes the uploaded video and returns the detected ASL words."""
    cap = cv2.VideoCapture(video_path)
    detected_words = []
    frame_idx = 0
    frame_stride = 5  # Classify every 5th frame to keep inference time reasonable.
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        if frame_idx % frame_stride == 0:
            word = classify_asl_word(frame)
            if word != "No hand detected" and word not in detected_words:
                detected_words.append(word)
        frame_idx += 1
    cap.release()
    return ", ".join(detected_words) if detected_words else "No ASL words detected"
# Gradio interface: the Video component passes the upload to process_video as a file path.
# (gr.Video takes no `type` argument; the original `type="file"` would raise a TypeError.)
iface = gr.Interface(
    fn=process_video,
    inputs=gr.Video(),
    outputs=gr.Textbox(label="Detected ASL Words"),
    title="ASL Sign Language to Text",
    description="Upload a video of ASL signs, and the model will translate them into text.",
)
if __name__ == "__main__":
    iface.launch()
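# To run locally (a typical setup; versions not pinned here):
#   pip install gradio torch transformers opencv-python mediapipe numpy
#   python app.py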