Spaces:
Sleeping
Sleeping
File size: 2,483 Bytes
4d449f1 ae4a851 4d449f1 ae4a851 4d449f1 ae4a851 4d449f1 ae4a851 4d449f1 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 |
import gradio as gr
import cv2
import numpy as np
import torch
from transformers import ViTForImageClassification, ViTImageProcessor
import mediapipe as mp
# Load pretrained model (Sign Language Recognition)
# Both the classifier weights and the matching image preprocessor are pulled
# from the same Hugging Face Hub checkpoint at import time.
# NOTE(review): the checkpoint name contains "xclip", yet it is loaded with the
# ViT image-classification classes — confirm the architectures actually match.
model_name = "sokaina55/xclip-base-patch32-finetuned-ssl-sign-language-recognition"
model = ViTForImageClassification.from_pretrained(model_name)
processor = ViTImageProcessor.from_pretrained(model_name)
# MediaPipe Hands setup
# A single module-level detector is shared by every call; it is configured to
# report at most one hand and to require a detection confidence of at least 0.5.
mp_hands = mp.solutions.hands
mp_drawing = mp.solutions.drawing_utils
hands = mp_hands.Hands(static_image_mode=True, max_num_hands=1, min_detection_confidence=0.5)
# Get the model's label mappings
asl_words = model.config.id2label # Dictionary mapping index to ASL words
def extract_hand_landmarks(image):
    """Return the flattened landmark coordinates of the first detected hand.

    The image is converted from BGR to RGB before being handed to the shared
    MediaPipe ``hands`` detector.

    Args:
        image: Frame in BGR channel order (as produced by OpenCV).

    Returns:
        A 1-D NumPy array holding ``x, y, z`` for each landmark of the first
        hand found, or ``None`` when no hand is detected.
    """
    rgb_frame = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    detection = hands.process(rgb_frame)

    detected_hands = detection.multi_hand_landmarks
    if not detected_hands:
        return None

    # Only the first reported hand is used.
    first_hand = next(iter(detected_hands), None)
    if first_hand is None:
        return None

    coords = [[point.x, point.y, point.z] for point in first_hand.landmark]
    return np.array(coords).flatten()
def classify_asl_word(image):
    """Predict the ASL word shown in a single BGR frame.

    The frame is first checked for a hand with ``extract_hand_landmarks``; the
    landmarks themselves are used only as a presence gate. If a hand is found,
    the frame is converted to RGB, preprocessed, and classified by the
    module-level ``model``.

    Args:
        image: Frame in BGR channel order (as produced by OpenCV).

    Returns:
        The label from ``asl_words`` for the highest-scoring class,
        ``"Unknown sign"`` if that index has no label, or
        ``"No hand detected"`` when no hand is visible.
    """
    if extract_hand_landmarks(image) is None:
        return "No hand detected"

    # The frame is BGR (the landmark helper converts it with COLOR_BGR2RGB),
    # but the image processor expects RGB input. Convert first so the
    # classifier does not see swapped red/blue channels.
    image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    inputs = processor(images=image_rgb, return_tensors="pt")

    # Inference only: skip gradient tracking.
    with torch.no_grad():
        outputs = model(**inputs)

    predicted_class_idx = outputs.logits.argmax(-1).item()
    return asl_words.get(predicted_class_idx, "Unknown sign")
def process_video(video_path):
    """Process an uploaded video and return the ASL words detected in it.

    Every frame is classified with ``classify_asl_word``. Each distinct word
    is reported once, in order of first appearance; frames without a hand are
    ignored.

    Args:
        video_path: Path of the video file to read. ``None`` or an empty
            value (nothing uploaded) is treated as a video with no detections.

    Returns:
        A comma-separated string of the detected words, or
        ``"No ASL words detected"`` when nothing was recognised.
    """
    no_result = "No ASL words detected"
    if not video_path:
        # Nothing was uploaded; avoid handing an invalid path to OpenCV.
        return no_result

    cap = cv2.VideoCapture(video_path)
    detected_words = []
    seen = set()  # O(1) duplicate check; the list keeps first-seen order.
    try:
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break
            word = classify_asl_word(frame)
            if word == "No hand detected" or word in seen:
                continue
            seen.add(word)
            detected_words.append(word)
    finally:
        # Always free the capture handle, even if classification raises.
        cap.release()

    return ", ".join(detected_words) if detected_words else no_result
# Gradio Interface
# `gr.Video` has no `type` parameter: the old `type="file"` argument is rejected
# by current Gradio releases and ignored (with a warning) by older 3.x ones, so
# it is omitted. The uploaded clip is handed to `process_video` as its single
# argument.
iface = gr.Interface(
    fn=process_video,
    inputs=gr.Video(),
    outputs=gr.Textbox(label="Detected ASL Words"),
    title="ASL Sign Language to Text",
    description="Upload a video of ASL signs, and the model will translate them into text.",
)

# Start the web UI only when the file is run as a script.
if __name__ == "__main__":
    iface.launch()
|