AreebKhan committed
Commit c8bf851 · verified · 1 Parent(s): 8f985e9

Update app.py

Files changed (1): app.py (+35 -38)
app.py CHANGED
@@ -1,62 +1,59 @@
 import gradio as gr
 import torch
+from transformers import VideoMAEForVideoClassification, VideoMAEFeatureExtractor
 import cv2
-import os
 import numpy as np
-from transformers import VideoMAEForVideoClassification, VideoMAEImageProcessor
+import tempfile
+import os
+
+# Load the pre-trained model
+model_name = "Sokaina55/xclip-base-patch32-finetuned-ssl-sign-language-recognition"
+device = "cuda" if torch.cuda.is_available() else "cpu"
+
+feature_extractor = VideoMAEFeatureExtractor.from_pretrained(model_name)
+model = VideoMAEForVideoClassification.from_pretrained(model_name).to(device)
 
-# Load a lighter pretrained model
-model_name = "facebook/videomae-base"
-model = VideoMAEForVideoClassification.from_pretrained(model_name)
-processor = VideoMAEImageProcessor.from_pretrained(model_name)
+def process_video(video_path):
+    """Processes video and predicts sign language word."""
+    if not os.path.exists(video_path):
+        return "Error: Video file not found"
 
-# Reduce frames for faster processing
-def preprocess_video(video_path):
+    # Read video
     cap = cv2.VideoCapture(video_path)
     frames = []
-    frame_skip = 5  # Skip every 5 frames to speed up processing
 
-    count = 0
     while cap.isOpened():
         ret, frame = cap.read()
         if not ret:
             break
-        if count % frame_skip == 0:
-            frame = cv2.resize(frame, (224, 224))  # Resize to match model input
-            frames.append(frame)
-        count += 1
-
+        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+        frames.append(frame)
+
     cap.release()
-    return frames
-
-# Function to predict sign language words
-def predict(video_path):
-    frames = preprocess_video(video_path)
 
     if len(frames) == 0:
-        return "No frames detected, try a different video."
+        return "Error: No frames extracted from the video"
 
-    inputs = processor(images=frames, return_tensors="pt")
+    # Preprocess frames
+    inputs = feature_extractor(frames, return_tensors="pt")
+    inputs = {k: v.to(device) for k, v in inputs.items()}
 
+    # Get predictions
     with torch.no_grad():
         outputs = model(**inputs)
+
+    predicted_class = outputs.logits.argmax(-1).item()
+    class_labels = model.config.id2label  # Map predictions to words
 
-    logits = outputs.logits
-    predicted_class_idx = logits.argmax(-1).item()
-
-    # Mapping to common words (example, update with real labels)
-    labels = ["Hello", "Thanks", "Yes", "No", "Goodbye", "Please", "Sorry"]
-    predicted_label = labels[predicted_class_idx % len(labels)]  # Placeholder mapping
+    return f"Predicted word: {class_labels.get(predicted_class, 'Unknown')}"
 
-    return predicted_label
+# Gradio UI
+with gr.Blocks() as demo:
+    gr.Markdown("## Sign Language to Text Recognition")
+    video_input = gr.Video(label="Upload a sign language video")
+    output_text = gr.Textbox(label="Predicted Word")
+    btn = gr.Button("Predict")
 
-iface = gr.Interface(
-    fn=predict,
-    inputs=gr.Video(),
-    outputs=gr.Textbox(label="Predicted Sign"),
-    title="Sign Language to Text Converter",
-    description="Upload a video of a hand gesture and get the predicted word."
-)
+    btn.click(fn=process_video, inputs=video_input, outputs=output_text)
 
-if __name__ == "__main__":
-    iface.launch()
+demo.launch()
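
Note on frame counts: neither the old nor the new version fixes the number of frames passed to the model, but VideoMAE classification checkpoints expect a fixed clip length (model.config.num_frames, 16 by default), so an arbitrary-length frame list will generally fail the model's shape checks. Below is a minimal sketch of uniform temporal sampling that could run after cap.release(); sample_frames is a hypothetical helper, not part of this commit:

import numpy as np

def sample_frames(frames, num_frames=16):
    # Hypothetical helper, not part of this commit: uniformly sample
    # num_frames frames (VideoMAE's default clip length) from the decoded video.
    # Assumes the caller has already checked that frames is non-empty.
    if len(frames) < num_frames:
        # Pad short videos by repeating the last frame.
        frames = frames + [frames[-1]] * (num_frames - len(frames))
    indices = np.linspace(0, len(frames) - 1, num=num_frames).astype(int)
    return [frames[i] for i in indices]

With such a helper in place, the preprocessing call would become inputs = feature_extractor(sample_frames(frames), return_tensors="pt").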
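The label mapping is also worth verifying: model.config.id2label only returns meaningful words if the checkpoint was saved with a fine-tuned label set (otherwise it falls back to generic LABEL_0-style names), and the repository name here (xclip-base-patch32-...) suggests an X-CLIP architecture rather than VideoMAE, so loading it through VideoMAEForVideoClassification and VideoMAEFeatureExtractor may warn about or discard mismatched weights. A quick way to check what the checkpoint actually declares, assuming the same model_name as in the commit:

from transformers import AutoConfig

# Inspect the architecture and label mapping the checkpoint declares
# before trusting its predictions.
config = AutoConfig.from_pretrained(
    "Sokaina55/xclip-base-patch32-finetuned-ssl-sign-language-recognition"
)
print(config.model_type)  # e.g. "xclip" or "videomae"
print(config.id2label)    # generic {0: "LABEL_0", ...} if no real labels were saved

Separately, recent transformers releases deprecate VideoMAEFeatureExtractor in favor of VideoMAEImageProcessor, the class the previous version of this file already used.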