import gradio as gr
import torch
import cv2
import os
import numpy as np
from transformers import VideoMAEForVideoClassification, VideoMAEImageProcessor

# Load a lighter pretrained model.
# Note: "facebook/videomae-base" is not fine-tuned for classification, so the
# classification head is randomly initialized; swap in a fine-tuned checkpoint for real predictions.
model_name = "facebook/videomae-base"
model = VideoMAEForVideoClassification.from_pretrained(model_name)
processor = VideoMAEImageProcessor.from_pretrained(model_name)
model.eval()

# Reduce frames for faster processing
def preprocess_video(video_path, num_frames=16, frame_skip=5):
    """Sample every `frame_skip`-th frame, convert to RGB, resize, and pad/truncate to `num_frames`."""
    cap = cv2.VideoCapture(video_path)
    frames = []
    count = 0
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        if count % frame_skip == 0:
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)  # OpenCV decodes BGR; the processor expects RGB
            frame = cv2.resize(frame, (224, 224))  # Resize to match model input
            frames.append(frame)
        count += 1
    cap.release()

    # VideoMAE expects a fixed-length clip (16 frames): pad by repeating the last frame, or truncate
    if frames:
        while len(frames) < num_frames:
            frames.append(frames[-1])
        frames = frames[:num_frames]
    return frames

# Function to predict sign language words
def predict(video_path):
    frames = preprocess_video(video_path)
    if len(frames) == 0:
        return "No frames detected, try a different video."

    inputs = processor(images=frames, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
    logits = outputs.logits
    predicted_class_idx = logits.argmax(-1).item()

    # Mapping to common words (placeholder; replace with the label set of a sign-language fine-tuned model)
    labels = ["Hello", "Thanks", "Yes", "No", "Goodbye", "Please", "Sorry"]
    predicted_label = labels[predicted_class_idx % len(labels)]  # Placeholder mapping
    return predicted_label

iface = gr.Interface(
    fn=predict,
    inputs=gr.Video(),
    outputs=gr.Textbox(label="Predicted Sign"),
    title="Sign Language to Text Converter",
    description="Upload a video of a hand gesture and get the predicted word.",
)

if __name__ == "__main__":
    iface.launch()