Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| import cv2 | |
| import numpy as np | |
| import torch | |
| from transformers import ViTForImageClassification, ViTImageProcessor | |
| import mediapipe as mp | |
# Load pretrained model (Sign Language Recognition)
# The classifier and its image processor are both created from the same
# checkpoint identifier, once, when this module is imported.
# NOTE(review): the checkpoint name suggests an X-CLIP fine-tune, yet it is
# loaded through the ViT classes -- confirm the weights/config are compatible.
model_name = "sokaina55/xclip-base-patch32-finetuned-ssl-sign-language-recognition"
model = ViTForImageClassification.from_pretrained(model_name)
processor = ViTImageProcessor.from_pretrained(model_name)

# MediaPipe Hands setup
# A single shared detector, configured with static_image_mode=True, at most
# one hand per image, and a minimum detection confidence of 0.5.
mp_hands = mp.solutions.hands
# NOTE(review): mp_drawing is not referenced in the visible part of this file.
mp_drawing = mp.solutions.drawing_utils
hands = mp_hands.Hands(static_image_mode=True, max_num_hands=1, min_detection_confidence=0.5)

# Get the model's label mappings
# Looked up with the predicted class index in classify_asl_word().
asl_words = model.config.id2label  # Dictionary mapping index to ASL words
def extract_hand_landmarks(image):
    """Return the flattened landmark coordinates of the first detected hand.

    The image is converted from BGR to RGB and passed to the module-level
    MediaPipe ``hands`` detector.

    Args:
        image: Image array in BGR channel order (it is converted with
            ``cv2.COLOR_BGR2RGB`` before detection).

    Returns:
        A 1-D NumPy array containing the ``x, y, z`` values of every landmark
        of the first detected hand, or ``None`` when no hand is detected.
    """
    detection = hands.process(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
    detected_hands = detection.multi_hand_landmarks
    if not detected_hands:
        return None

    # Only the first reported hand is used.
    first_hand = detected_hands[0]
    coords = [[point.x, point.y, point.z] for point in first_hand.landmark]
    return np.array(coords).flatten()
def classify_asl_word(image):
    """Predict the ASL word shown in a single frame.

    The frame is first checked for a hand with ``extract_hand_landmarks``;
    frames without a detected hand are not sent to the classifier.

    Args:
        image: Frame as an array in BGR channel order (the order
            ``extract_hand_landmarks`` assumes, and the order OpenCV uses
            for decoded video frames).

    Returns:
        ``"No hand detected"`` when no hand is found; otherwise the label
        that ``asl_words`` maps the predicted class index to, or
        ``"Unknown sign"`` if the index has no entry.
    """
    if extract_hand_landmarks(image) is None:
        return "No hand detected"

    # The frame is BGR, but the image processor's per-channel normalisation
    # is defined for RGB input, so swap the channels before preprocessing.
    image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    # Convert image into a format suitable for the model
    inputs = processor(images=image_rgb, return_tensors="pt")
    with torch.no_grad():  # inference only, no gradients needed
        outputs = model(**inputs)

    predicted_class_idx = outputs.logits.argmax(-1).item()
    return asl_words.get(predicted_class_idx, "Unknown sign")
def process_video(video_path):
    """Process the uploaded video and return the detected ASL words.

    Every frame is classified with ``classify_asl_word``. Each distinct word
    is reported once, in order of first appearance; frames classified as
    ``"No hand detected"`` are ignored.

    Args:
        video_path: Path of the video file to read. An empty value or
            ``None`` (nothing uploaded) is treated as a video with no signs.

    Returns:
        The distinct detected words joined with ``", "``, or
        ``"No ASL words detected"`` when there are none.
    """
    if not video_path:
        return "No ASL words detected"

    cap = cv2.VideoCapture(video_path)
    detected_words = []
    seen = set()  # constant-time duplicate check; the list keeps the order
    try:
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break
            word = classify_asl_word(frame)
            if word == "No hand detected" or word in seen:
                continue
            seen.add(word)
            detected_words.append(word)
    finally:
        # Release the capture even if classification raises part-way through.
        cap.release()

    return ", ".join(detected_words) if detected_words else "No ASL words detected"
# Gradio Interface
# The Video input passes the uploaded clip to `fn` as a file path, which is
# what process_video() gives to cv2.VideoCapture. The component defines no
# `type` argument, so none is passed.
iface = gr.Interface(
    fn=process_video,
    inputs=gr.Video(),
    outputs=gr.Textbox(label="Detected ASL Words"),
    title="ASL Sign Language to Text",
    description="Upload a video of ASL signs, and the model will translate them into text.",
)

# Start the web UI only when this file is run as a script.
if __name__ == "__main__":
    iface.launch()