import gradio as gr
import torch
import cv2
import numpy as np
from transformers import VideoMAEForVideoClassification, VideoMAEImageProcessor
# Load a lighter pretrained backbone. Note: "facebook/videomae-base" is the
# self-supervised checkpoint, so the classification head added here is
# randomly initialized; its outputs are placeholders until fine-tuned.
model_name = "facebook/videomae-base"
model = VideoMAEForVideoClassification.from_pretrained(model_name)
processor = VideoMAEImageProcessor.from_pretrained(model_name)
model.eval()
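
# Alternative sketch (assumption: Hub access at runtime; this checkpoint and
# its Kinetics-400 action labels are not part of the original Space): a
# VideoMAE variant with a trained head, so predictions mean something
# out of the box.
#
# model = VideoMAEForVideoClassification.from_pretrained(
#     "MCG-NJU/videomae-base-finetuned-kinetics"
# )
# processor = VideoMAEImageProcessor.from_pretrained(
#     "MCG-NJU/videomae-base-finetuned-kinetics"
# )
# label = model.config.id2label[predicted_class_idx]  # real class names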
# VideoMAE expects a fixed-length clip (16 frames for this checkpoint), so
# read the video and sample frames uniformly rather than keeping an
# arbitrary count, which would break the model's positional embeddings.
NUM_FRAMES = 16

def preprocess_video(video_path):
    cap = cv2.VideoCapture(video_path)
    frames = []
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        frames.append(frame)
    cap.release()
    if not frames:
        return []
    # Uniformly sample exactly NUM_FRAMES frames (repeating frames when the
    # clip is shorter), then convert and resize only those for speed.
    indices = np.linspace(0, len(frames) - 1, NUM_FRAMES).astype(int)
    sampled = []
    for i in indices:
        frame = cv2.cvtColor(frames[i], cv2.COLOR_BGR2RGB)  # OpenCV decodes BGR
        sampled.append(cv2.resize(frame, (224, 224)))  # Match model input size
    return sampled
# Predict a sign-language word from an uploaded video
def predict(video_path):
    frames = preprocess_video(video_path)
    if len(frames) == 0:
        return "No frames detected, try a different video."
    inputs = processor(frames, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
    logits = outputs.logits
    predicted_class_idx = logits.argmax(-1).item()
    # Placeholder mapping to common words; replace with the real label set
    # once the model is fine-tuned on sign-language data.
    labels = ["Hello", "Thanks", "Yes", "No", "Goodbye", "Please", "Sorry"]
    predicted_label = labels[predicted_class_idx % len(labels)]
    return predicted_label
iface = gr.Interface(
    fn=predict,
    inputs=gr.Video(),
    outputs=gr.Textbox(label="Predicted Sign"),
    title="Sign Language to Text Converter",
    description="Upload a video of a hand gesture and get the predicted word.",
)

if __name__ == "__main__":
    iface.launch()
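
# Optional local smoke test (assumption: "sample.mp4" is any short clip you
# have on disk; not part of the original Space). Bypasses the Gradio UI and
# calls the pipeline directly:
#
#     print(predict("sample.mp4"))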