AreebKhan committed
Commit c8bf851 · verified · 1 Parent(s): 8f985e9

Update app.py

Files changed (1): app.py (+35 -38)
app.py CHANGED
@@ -1,62 +1,59 @@
 import gradio as gr
 import torch
+from transformers import VideoMAEForVideoClassification, VideoMAEFeatureExtractor
 import cv2
-import os
 import numpy as np
-from transformers import VideoMAEForVideoClassification, VideoMAEImageProcessor
+import tempfile
+import os
+
+# Load the pre-trained model
+model_name = "Sokaina55/xclip-base-patch32-finetuned-ssl-sign-language-recognition"
+device = "cuda" if torch.cuda.is_available() else "cpu"
+
+feature_extractor = VideoMAEFeatureExtractor.from_pretrained(model_name)
+model = VideoMAEForVideoClassification.from_pretrained(model_name).to(device)
 
-# Load a lighter pretrained model
-model_name = "facebook/videomae-base"
-model = VideoMAEForVideoClassification.from_pretrained(model_name)
-processor = VideoMAEImageProcessor.from_pretrained(model_name)
+def process_video(video_path):
+    """Processes video and predicts sign language word."""
+    if not os.path.exists(video_path):
+        return "Error: Video file not found"
 
-# Reduce frames for faster processing
-def preprocess_video(video_path):
+    # Read video
     cap = cv2.VideoCapture(video_path)
     frames = []
-    frame_skip = 5  # Skip every 5 frames to speed up processing
 
-    count = 0
     while cap.isOpened():
         ret, frame = cap.read()
         if not ret:
             break
-        if count % frame_skip == 0:
-            frame = cv2.resize(frame, (224, 224))  # Resize to match model input
-            frames.append(frame)
-        count += 1
-
+        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+        frames.append(frame)
+
     cap.release()
-    return frames
-
-# Function to predict sign language words
-def predict(video_path):
-    frames = preprocess_video(video_path)
 
     if len(frames) == 0:
-        return "No frames detected, try a different video."
+        return "Error: No frames extracted from the video"
 
-    inputs = processor(images=frames, return_tensors="pt")
+    # Preprocess frames
+    inputs = feature_extractor(frames, return_tensors="pt")
+    inputs = {k: v.to(device) for k, v in inputs.items()}
 
+    # Get predictions
     with torch.no_grad():
         outputs = model(**inputs)
+
+    predicted_class = outputs.logits.argmax(-1).item()
+    class_labels = model.config.id2label  # Map predictions to words
 
-    logits = outputs.logits
-    predicted_class_idx = logits.argmax(-1).item()
-
-    # Mapping to common words (example, update with real labels)
-    labels = ["Hello", "Thanks", "Yes", "No", "Goodbye", "Please", "Sorry"]
-    predicted_label = labels[predicted_class_idx % len(labels)]  # Placeholder mapping
+    return f"Predicted word: {class_labels.get(predicted_class, 'Unknown')}"
 
-    return predicted_label
+# Gradio UI
+with gr.Blocks() as demo:
+    gr.Markdown("## Sign Language to Text Recognition")
+    video_input = gr.Video(label="Upload a sign language video")
+    output_text = gr.Textbox(label="Predicted Word")
+    btn = gr.Button("Predict")
 
-iface = gr.Interface(
-    fn=predict,
-    inputs=gr.Video(),
-    outputs=gr.Textbox(label="Predicted Sign"),
-    title="Sign Language to Text Converter",
-    description="Upload a video of a hand gesture and get the predicted word."
-)
+    btn.click(fn=process_video, inputs=video_input, outputs=output_text)
 
-if __name__ == "__main__":
-    iface.launch()
+demo.launch()
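
Note on frame counts: neither the old nor the new version fixes the number of frames passed to the model, but VideoMAE classification checkpoints expect a fixed clip length (model.config.num_frames, 16 by default), so an arbitrary-length frame list will generally fail the model's shape checks. Below is a minimal sketch of uniform temporal sampling that could run after cap.release(); sample_frames is a hypothetical helper, not part of this commit:

import numpy as np

def sample_frames(frames, num_frames=16):
    # Hypothetical helper, not part of this commit: uniformly sample
    # num_frames frames (VideoMAE's default clip length) from the decoded video.
    # Assumes the caller has already checked that frames is non-empty.
    if len(frames) < num_frames:
        # Pad short videos by repeating the last frame.
        frames = frames + [frames[-1]] * (num_frames - len(frames))
    indices = np.linspace(0, len(frames) - 1, num=num_frames).astype(int)
    return [frames[i] for i in indices]

With such a helper in place, the preprocessing call would become inputs = feature_extractor(sample_frames(frames), return_tensors="pt").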
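The label mapping is also worth verifying: model.config.id2label only returns meaningful words if the checkpoint was saved with a fine-tuned label set (otherwise it falls back to generic LABEL_0-style names), and the repository name here (xclip-base-patch32-...) suggests an X-CLIP architecture rather than VideoMAE, so loading it through VideoMAEForVideoClassification and VideoMAEFeatureExtractor may warn about or discard mismatched weights. A quick way to check what the checkpoint actually declares, assuming the same model_name as in the commit:

from transformers import AutoConfig

# Inspect the architecture and label mapping the checkpoint declares
# before trusting its predictions.
config = AutoConfig.from_pretrained(
    "Sokaina55/xclip-base-patch32-finetuned-ssl-sign-language-recognition"
)
print(config.model_type)  # e.g. "xclip" or "videomae"
print(config.id2label)    # generic {0: "LABEL_0", ...} if no real labels were saved

Separately, recent transformers releases deprecate VideoMAEFeatureExtractor in favor of VideoMAEImageProcessor, the class the previous version of this file already used.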