"""Gradio app that lists the ASL words recognised in an uploaded video.

Each frame of the video is checked for a hand with MediaPipe Hands; frames
that contain one are classified with a pretrained image-classification model
and the distinct predicted labels are returned as a comma-separated string.
"""

from typing import Optional

import gradio as gr
import cv2
import numpy as np
import torch
from transformers import ViTForImageClassification, ViTImageProcessor
import mediapipe as mp

# Load pretrained model (Sign Language Recognition).
# NOTE(review): the checkpoint name contains "xclip" but it is loaded through
# the ViT classes -- confirm the checkpoint is actually ViT-compatible.
model_name = "sokaina55/xclip-base-patch32-finetuned-ssl-sign-language-recognition"
model = ViTForImageClassification.from_pretrained(model_name)
processor = ViTImageProcessor.from_pretrained(model_name)

# MediaPipe Hands setup: every frame is treated as an independent still image
# and at most one hand is reported per frame.
mp_hands = mp.solutions.hands
mp_drawing = mp.solutions.drawing_utils
hands = mp_hands.Hands(
    static_image_mode=True,
    max_num_hands=1,
    min_detection_confidence=0.5,
)

# Get the model's label mappings: dictionary mapping class index to ASL word.
asl_words = model.config.id2label

# Sentinel result of classify_asl_word for frames without a detectable hand.
_NO_HAND_MESSAGE = "No hand detected"


def extract_hand_landmarks(image: np.ndarray) -> Optional[np.ndarray]:
    """Extract the landmarks of the first hand detected in a BGR image.

    Args:
        image: Frame in OpenCV's BGR channel order.

    Returns:
        A flat array holding the ``x, y, z`` values of every landmark of the
        first detected hand, or ``None`` when no hand is found.
    """
    image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    results = hands.process(image_rgb)
    if not results.multi_hand_landmarks:
        return None

    first_hand = results.multi_hand_landmarks[0]
    return np.array([[lm.x, lm.y, lm.z] for lm in first_hand.landmark]).flatten()


def classify_asl_word(image: np.ndarray) -> str:
    """Predict the ASL word shown by the hand gesture in a BGR frame.

    Args:
        image: Frame in OpenCV's BGR channel order.

    Returns:
        The predicted label, ``"Unknown sign"`` when the predicted index has
        no entry in ``asl_words``, or ``"No hand detected"`` when MediaPipe
        finds no hand in the frame.
    """
    # The landmarks only gate classification; the model sees the whole frame.
    if extract_hand_landmarks(image) is None:
        return _NO_HAND_MESSAGE

    # OpenCV frames are BGR; convert to RGB before handing the frame to the
    # image processor so the colour channels are not swapped.
    image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    inputs = processor(images=image_rgb, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)

    predicted_class_idx = outputs.logits.argmax(-1).item()
    return asl_words.get(predicted_class_idx, "Unknown sign")


def process_video(video_path):
    """Process the uploaded video and return the detected ASL words.

    Args:
        video_path: Path of the video file to analyse. A missing upload
            (``None`` or an empty value) is treated as a video with no signs.

    Returns:
        The distinct predicted words in order of first appearance, joined
        with ``", "``, or ``"No ASL words detected"`` when there are none.
    """
    if not video_path:
        return "No ASL words detected"

    # A dict is used as an insertion-ordered set of the words seen so far.
    detected_words = {}
    cap = cv2.VideoCapture(video_path)
    try:
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break
            word = classify_asl_word(frame)
            if word != _NO_HAND_MESSAGE:
                detected_words.setdefault(word, None)
    finally:
        # Release the capture even if classification raises part-way through.
        cap.release()

    return ", ".join(detected_words) if detected_words else "No ASL words detected"


# Gradio Interface. gr.Video passes the uploaded file's path to the callback;
# the former ``type="file"`` keyword is not a gr.Video parameter and was dropped.
iface = gr.Interface(
    fn=process_video,
    inputs=gr.Video(),
    outputs=gr.Textbox(label="Detected ASL Words"),
    title="ASL Sign Language to Text",
    description="Upload a video of ASL signs, and the model will translate them into text.",
)

if __name__ == "__main__":
    iface.launch()