AreebKhan committed
Commit 59d757a · verified · 1 Parent(s): ae4a851

Update app.py

Files changed (1)
  1. app.py +32 -54
app.py CHANGED
@@ -1,73 +1,51 @@
 import gradio as gr
+import torch
 import cv2
 import numpy as np
-import torch
-from transformers import ViTForImageClassification, ViTImageProcessor
-import mediapipe as mp
+from transformers import VideoMAEForVideoClassification, VideoMAEImageProcessor
 
-# Load pretrained model (Sign Language Recognition)
+# Load the pretrained model (VideoMAE)
 model_name = "sokaina55/xclip-base-patch32-finetuned-ssl-sign-language-recognition"
-model = ViTForImageClassification.from_pretrained(model_name)
-processor = ViTImageProcessor.from_pretrained(model_name)
+model = VideoMAEForVideoClassification.from_pretrained(model_name)
+processor = VideoMAEImageProcessor.from_pretrained(model_name)
 
-# MediaPipe Hands setup
-mp_hands = mp.solutions.hands
-mp_drawing = mp.solutions.drawing_utils
-hands = mp_hands.Hands(static_image_mode=True, max_num_hands=1, min_detection_confidence=0.5)
+# Function to process video frames and make predictions
+def predict(video_path):
+    cap = cv2.VideoCapture(video_path)
+    frames = []
+
+    while cap.isOpened():
+        ret, frame = cap.read()
+        if not ret:
+            break
+        frame = cv2.resize(frame, (224, 224))  # Resize for model compatibility
+        frames.append(frame)
 
-# Get the model's label mappings
-asl_words = model.config.id2label  # Dictionary mapping index to ASL words
+    cap.release()
 
-def extract_hand_landmarks(image):
-    """Extracts hand landmarks from an image."""
-    image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
-    results = hands.process(image_rgb)
-
-    if results.multi_hand_landmarks:
-        for hand_landmarks in results.multi_hand_landmarks:
-            landmarks = np.array([[lm.x, lm.y, lm.z] for lm in hand_landmarks.landmark]).flatten()
-            return landmarks
-    return None
+    if len(frames) == 0:
+        return "No frames detected in video!"
 
-def classify_asl_word(image):
-    """Predicts the ASL word from a hand gesture."""
-    landmarks = extract_hand_landmarks(image)
-    if landmarks is None:
-        return "No hand detected"
-
-    # Convert image into a format suitable for the model
-    inputs = processor(images=image, return_tensors="pt")
+    # Convert frames to tensor
+    inputs = processor(images=frames, return_tensors="pt")
 
     with torch.no_grad():
         outputs = model(**inputs)
-    predicted_class_idx = outputs.logits.argmax(-1).item()
-
-    return asl_words.get(predicted_class_idx, "Unknown sign")
 
-def process_video(video_path):
-    """Processes the uploaded video and returns detected ASL words."""
-    cap = cv2.VideoCapture(video_path)
-    detected_words = []
+    logits = outputs.logits
+    predicted_class_idx = logits.argmax(-1).item()
+    predicted_label = model.config.id2label[predicted_class_idx]  # Convert index to label
 
-    while cap.isOpened():
-        ret, frame = cap.read()
-        if not ret:
-            break
-        word = classify_asl_word(frame)
-        if word not in detected_words and word != "No hand detected":
-            detected_words.append(word)
-
-    cap.release()
-    return ", ".join(detected_words) if detected_words else "No ASL words detected"
+    return f"Predicted Sign: {predicted_label}"
 
-# Gradio Interface
+# Gradio UI
 iface = gr.Interface(
-    fn=process_video,
-    inputs=gr.Video(type="file"),
-    outputs=gr.Textbox(label="Detected ASL Words"),
-    title="ASL Sign Language to Text",
-    description="Upload a video of ASL signs, and the model will translate them into text."
+    fn=predict,
+    inputs=gr.Video(),
+    outputs=gr.Textbox(label="Recognized Sign"),
+    title="Sign Language Translator",
+    description="Upload a video of a hand gesture, and the model will predict the corresponding sign."
 )
 
 if __name__ == "__main__":
-    iface.launch()
+    iface.launch(debug=True)
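
A note on the new predict function: OpenCV's VideoCapture.read() returns frames in BGR order, while Hugging Face image processors expect RGB, and VideoMAE-style video classifiers are trained on fixed-length clips (the expected length is exposed as model.config.num_frames, typically 16). Below is a minimal sketch of how the frame list could be prepared before the processor(...) call, assuming the checkpoint really does load with the VideoMAE classes used in this commit; sample_clip is a hypothetical helper, not part of app.py.

import cv2
import numpy as np

def sample_clip(frames, num_frames=16):
    # Hypothetical helper (not in app.py): uniformly sample a fixed-length
    # clip from the captured frames and convert each frame from BGR to RGB.
    indices = np.linspace(0, len(frames) - 1, num=num_frames).astype(int)
    return [cv2.cvtColor(frames[i], cv2.COLOR_BGR2RGB) for i in indices]

# Inside predict(), one possible replacement for the processor call:
#   clip = sample_clip(frames, num_frames=model.config.num_frames)
#   inputs = processor(clip, return_tensors="pt")

Separately, the checkpoint name ("xclip-base-patch32-...") suggests an X-CLIP fine-tune rather than a VideoMAE one, so VideoMAEForVideoClassification.from_pretrained(model_name) may warn about or reject mismatched weights; checking AutoConfig.from_pretrained(model_name).model_type first is a cheap way to confirm which model class the checkpoint actually needs.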