import gradio as gr
import torch
import cv2
import os
import numpy as np
from transformers import VideoMAEForVideoClassification, VideoMAEImageProcessor
# Load the base VideoMAE checkpoint. Note: "facebook/videomae-base" is a
# self-supervised backbone; its classification head is randomly initialized,
# so the label mapping below is a placeholder until the model is fine-tuned.
model_name = "facebook/videomae-base"
model = VideoMAEForVideoClassification.from_pretrained(model_name)
processor = VideoMAEImageProcessor.from_pretrained(model_name)
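# Sketch of an alternative, assuming meaningful class labels are wanted:
# "MCG-NJU/videomae-base-finetuned-kinetics" is a public action-recognition
# checkpoint (not a sign-language model) that ships a real id2label mapping:
# model = VideoMAEForVideoClassification.from_pretrained(
#     "MCG-NJU/videomae-base-finetuned-kinetics"
# )
# predicted_label = model.config.id2label[predicted_class_idx]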
# Read the video, keep one frame out of every `frame_skip` for speed, and
# sample exactly `num_frames` frames (VideoMAE's expected clip length)
def preprocess_video(video_path, num_frames=16):
    cap = cv2.VideoCapture(video_path)
    frames = []
    frame_skip = 5  # Keep every 5th frame to speed up processing
    count = 0
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        if count % frame_skip == 0:
            frame = cv2.resize(frame, (224, 224))  # Resize to match model input
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)  # OpenCV reads BGR; the processor expects RGB
            frames.append(frame)
        count += 1
    cap.release()
    if not frames:
        return []
    # VideoMAE expects a fixed clip length: pad short videos by repeating
    # the last frame, then sample indices uniformly across the clip
    if len(frames) < num_frames:
        frames += [frames[-1]] * (num_frames - len(frames))
    indices = np.linspace(0, len(frames) - 1, num_frames).astype(int)
    return [frames[i] for i in indices]
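# A returned clip is a list of `num_frames` RGB arrays of shape 224x224x3, e.g.:
# frames = preprocess_video("clip.mp4")  # hypothetical local file
# assert len(frames) == 16 and frames[0].shape == (224, 224, 3)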
# Predict a word from a sign-language video clip
def predict(video_path):
    frames = preprocess_video(video_path, num_frames=model.config.num_frames)
    if not frames:
        return "No frames detected, try a different video."
    inputs = processor(frames, return_tensors="pt")  # the frame list is treated as one video clip
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
    predicted_class_idx = logits.argmax(-1).item()
    # Mapping to common words (example only; replace with real labels from a
    # checkpoint fine-tuned on sign-language data)
    labels = ["Hello", "Thanks", "Yes", "No", "Goodbye", "Please", "Sorry"]
    predicted_label = labels[predicted_class_idx % len(labels)]  # Placeholder mapping
    return predicted_label
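# Quick sanity check without the UI (hypothetical local file path):
# print(predict("example_sign.mp4"))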
iface = gr.Interface(
    fn=predict,
    inputs=gr.Video(),
    outputs=gr.Textbox(label="Predicted Sign"),
    title="Sign Language to Text Converter",
    description="Upload a video of a hand gesture and get the predicted word.",
)

if __name__ == "__main__":
    iface.launch()