AreebKhan committed
Commit 59d757a · verified · 1 Parent(s): ae4a851

Update app.py

Files changed (1)
  1. app.py +32 -54
app.py CHANGED
@@ -1,73 +1,51 @@
 import gradio as gr
+import torch
 import cv2
 import numpy as np
-import torch
-from transformers import ViTForImageClassification, ViTImageProcessor
-import mediapipe as mp
+from transformers import VideoMAEForVideoClassification, VideoMAEImageProcessor
 
-# Load pretrained model (Sign Language Recognition)
+# Load the pretrained model (VideoMAE)
 model_name = "sokaina55/xclip-base-patch32-finetuned-ssl-sign-language-recognition"
-model = ViTForImageClassification.from_pretrained(model_name)
-processor = ViTImageProcessor.from_pretrained(model_name)
+model = VideoMAEForVideoClassification.from_pretrained(model_name)
+processor = VideoMAEImageProcessor.from_pretrained(model_name)
 
-# MediaPipe Hands setup
-mp_hands = mp.solutions.hands
-mp_drawing = mp.solutions.drawing_utils
-hands = mp_hands.Hands(static_image_mode=True, max_num_hands=1, min_detection_confidence=0.5)
+# Function to process video frames and make predictions
+def predict(video_path):
+    cap = cv2.VideoCapture(video_path)
+    frames = []
+
+    while cap.isOpened():
+        ret, frame = cap.read()
+        if not ret:
+            break
+        frame = cv2.resize(frame, (224, 224))  # Resize for model compatibility
+        frames.append(frame)
 
-# Get the model's label mappings
-asl_words = model.config.id2label  # Dictionary mapping index to ASL words
+    cap.release()
 
-def extract_hand_landmarks(image):
-    """Extracts hand landmarks from an image."""
-    image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
-    results = hands.process(image_rgb)
-
-    if results.multi_hand_landmarks:
-        for hand_landmarks in results.multi_hand_landmarks:
-            landmarks = np.array([[lm.x, lm.y, lm.z] for lm in hand_landmarks.landmark]).flatten()
-            return landmarks
-    return None
+    if len(frames) == 0:
+        return "No frames detected in video!"
 
-def classify_asl_word(image):
-    """Predicts the ASL word from a hand gesture."""
-    landmarks = extract_hand_landmarks(image)
-    if landmarks is None:
-        return "No hand detected"
-
-    # Convert image into a format suitable for the model
-    inputs = processor(images=image, return_tensors="pt")
+    # Convert frames to tensor
+    inputs = processor(images=frames, return_tensors="pt")
 
     with torch.no_grad():
         outputs = model(**inputs)
-    predicted_class_idx = outputs.logits.argmax(-1).item()
-
-    return asl_words.get(predicted_class_idx, "Unknown sign")
 
-def process_video(video_path):
-    """Processes the uploaded video and returns detected ASL words."""
-    cap = cv2.VideoCapture(video_path)
-    detected_words = []
+    logits = outputs.logits
+    predicted_class_idx = logits.argmax(-1).item()
+    predicted_label = model.config.id2label[predicted_class_idx]  # Convert index to label
 
-    while cap.isOpened():
-        ret, frame = cap.read()
-        if not ret:
-            break
-        word = classify_asl_word(frame)
-        if word not in detected_words and word != "No hand detected":
-            detected_words.append(word)
-
-    cap.release()
-    return ", ".join(detected_words) if detected_words else "No ASL words detected"
+    return f"Predicted Sign: {predicted_label}"
 
-# Gradio Interface
+# Gradio UI
 iface = gr.Interface(
-    fn=process_video,
-    inputs=gr.Video(type="file"),
-    outputs=gr.Textbox(label="Detected ASL Words"),
-    title="ASL Sign Language to Text",
-    description="Upload a video of ASL signs, and the model will translate them into text."
+    fn=predict,
+    inputs=gr.Video(),
+    outputs=gr.Textbox(label="Recognized Sign"),
+    title="Sign Language Translator",
+    description="Upload a video of a hand gesture, and the model will predict the corresponding sign."
 )
 
 if __name__ == "__main__":
-    iface.launch()
+    iface.launch(debug=True)
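
A note on the new predict function: OpenCV's VideoCapture.read() returns frames in BGR order, while Hugging Face image processors expect RGB, and VideoMAE-style video classifiers are trained on fixed-length clips (the expected length is exposed as model.config.num_frames, typically 16). Below is a minimal sketch of how the frame list could be prepared before the processor(...) call, assuming the checkpoint really does load with the VideoMAE classes used in this commit; sample_clip is a hypothetical helper, not part of app.py.

import cv2
import numpy as np

def sample_clip(frames, num_frames=16):
    # Hypothetical helper (not in app.py): uniformly sample a fixed-length
    # clip from the captured frames and convert each frame from BGR to RGB.
    indices = np.linspace(0, len(frames) - 1, num=num_frames).astype(int)
    return [cv2.cvtColor(frames[i], cv2.COLOR_BGR2RGB) for i in indices]

# Inside predict(), one possible replacement for the processor call:
#   clip = sample_clip(frames, num_frames=model.config.num_frames)
#   inputs = processor(clip, return_tensors="pt")

Separately, the checkpoint name ("xclip-base-patch32-...") suggests an X-CLIP fine-tune rather than a VideoMAE one, so VideoMAEForVideoClassification.from_pretrained(model_name) may warn about or reject mismatched weights; checking AutoConfig.from_pretrained(model_name).model_type first is a cheap way to confirm which model class the checkpoint actually needs.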