Kousik Kumar Siddavaram committed on
Commit bf73c48 · 1 Parent(s): a35f4a3

Using trpakov/vit-face-expression model for expression recognition

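For context, a minimal standalone sketch of the checkpoint this commit switches to, using the same transformers classes the new code imports (the image path below is a placeholder, not part of the repo):

import torch
from PIL import Image
from transformers import AutoImageProcessor, AutoModelForImageClassification

processor = AutoImageProcessor.from_pretrained("trpakov/vit-face-expression")
model = AutoModelForImageClassification.from_pretrained("trpakov/vit-face-expression")
model.eval()

image = Image.open("some_face.jpg").convert("RGB")  # placeholder path
inputs = processor(images=image, return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits
pred_idx = logits.argmax(dim=-1).item()
print(model.config.id2label[pred_idx])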
app/Hackathon_setup/exp_recognition.py CHANGED
@@ -1,146 +1,86 @@
- import torch.nn as nn
- import torchvision.transforms as transforms
- from PIL import Image
  import numpy as np
  import cv2
- from matplotlib import pyplot as plt
  import torch
- # In the below line,remove '.' while working on your local system.However Make sure that '.' is present before face_recognition_model while uploading to the server, Do not remove it.
- from .exp_recognition_model import *
  from PIL import Image
  import base64
  import io
  import os

- ####################################################################################################################
- # Expression Recognition Model Definition and Loader
- ####################################################################################################################
-
- # Same class order as used in training
- classes = ['ANGER', 'DISGUST', 'FEAR', 'HAPPINESS', 'NEUTRAL', 'SADNESS', 'SURPRISE']
-
- # Define CNN architecture (must match training)
- class ExpressionCNN(nn.Module):
-     def __init__(self):
-         super(ExpressionCNN, self).__init__()
-         self.features = nn.Sequential(
-             nn.Conv2d(1, 32, 3, padding=1),
-             nn.BatchNorm2d(32),
-             nn.ReLU(),
-             nn.MaxPool2d(2, 2),
-             nn.Dropout(0.25),
-
-             nn.Conv2d(32, 64, 3, padding=1),
-             nn.BatchNorm2d(64),
-             nn.ReLU(),
-             nn.MaxPool2d(2, 2),
-             nn.Dropout(0.25),
-
-             nn.Conv2d(64, 128, 3, padding=1),
-             nn.BatchNorm2d(128),
-             nn.ReLU(),
-             nn.MaxPool2d(2, 2)
-         )
-
-         self.classifier = nn.Sequential(
-             nn.Linear(128 * 16 * 16, 256),
-             nn.BatchNorm1d(256),
-             nn.ReLU(),
-             nn.Dropout(0.5),
-             nn.Linear(256, len(classes))
-         )
-
-     def forward(self, x):
-         x = self.features(x)
-         x = x.view(x.size(0), -1)
-         return self.classifier(x)
-
- # Create model instance
- expression_model = ExpressionCNN()
-
- # Resolve absolute path to .t7 file
  current_path = os.path.dirname(os.path.abspath(__file__))
- checkpoint_path = os.path.join(current_path, 'expression_model.t7')
-
- # Load checkpoint safely
- try:
-     if os.path.exists(checkpoint_path):
-         checkpoint = torch.load(checkpoint_path, map_location=torch.device('cpu'))
-
-         if isinstance(checkpoint, dict) and 'model_state_dict' in checkpoint:
-             expression_model.load_state_dict(checkpoint['model_state_dict'])
-             print("Model loaded successfully (wrapped in dict).")
-         elif isinstance(checkpoint, dict):
-             expression_model.load_state_dict(checkpoint)
-             print("Model loaded successfully (state_dict).")
-         else:
-             expression_model = checkpoint
-             print("Entire model object loaded directly.")
-         expression_model.eval()
-     else:
-         print(f"Model file not found at: {checkpoint_path}")
- except Exception as e:
-     print(f"Model Load Error: {e}")
-     expression_model = None
-
- ####################################################################################################################
- # Face Detection Helper
- ####################################################################################################################
  def detected_face(image):
      """
-     Detect faces in the image and return the largest detected face.
-     If no face is detected, return 0.
      """
-     face_haar = os.path.join(current_path, 'haarcascade_frontalface_default.xml')
-     eye_haar = os.path.join(current_path, 'haarcascade_eye.xml')
      face_cascade = cv2.CascadeClassifier(face_haar)
      eye_cascade = cv2.CascadeClassifier(eye_haar)

      gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
-     faces = face_cascade.detectMultiScale(gray, scaleFactor=1.3, minNeighbors=5)
-
      if len(faces) == 0:
-         return 0  # No face detected
-
-     # Select the face with largest area
-     face_areas = [w*h for (x,y,w,h) in faces]
-     max_idx = np.argmax(face_areas)
-     x, y, w, h = faces[max_idx]
-     face_cropped = gray[y:y+h, x:x+w]
-     return Image.fromarray(face_cropped)
-
- ####################################################################################################################
- # Main Expression Recognition Function
- ####################################################################################################################
  def get_expression(img):
      """
-     Input: img -> OpenCV BGR image
-     Output: Predicted expression as string
      """
-     if expression_model is None:
-         return "Model not loaded"

-     device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
-     expression_model.to(device)
-     expression_model.eval()

      # Detect face
      face = detected_face(img)
      if face == 0:
-         face = Image.fromarray(cv2.cvtColor(img, cv2.COLOR_BGR2GRAY))
-
-     # Transform pipeline must match training
-     transform = transforms.Compose([
-         transforms.Resize((128, 128)),
-         transforms.Grayscale(num_output_channels=1),
-         transforms.ToTensor(),
-         transforms.Normalize(mean=[0.5], std=[0.5])
-     ])
-     input_tensor = transform(face).unsqueeze(0).to(device)
-
-     # Predict expression
      with torch.no_grad():
-         outputs = expression_model(input_tensor)
-         _, pred = torch.max(outputs, 1)

-     return classes[pred.item()]
+
  import numpy as np
  import cv2
  import torch
+ from matplotlib import pyplot as plt
  from PIL import Image
  import base64
  import io
  import os

+ # Remove '.' if running locally; keep it for the server (Spaces)
+ from .exp_recognition_model import facExpRec, processor, device
+
+ #############################################################################################################################
+ # Caution: Don't change any of the filenames, function names and definitions                                                #
+ # Always use current_path + file_name for referring to any files; without it we cannot access files on the server          #
+ #############################################################################################################################
+
+ # current_path stores the absolute path of the directory this file runs from.
  current_path = os.path.dirname(os.path.abspath(__file__))
+
+ # =====================================================
+ # FACE DETECTION FUNCTION
+ # =====================================================
  def detected_face(image):
      """
+     Detects faces using Haar cascades and returns the face with the largest area.
+     Returns 0 if no face is detected.
      """
+     eye_haar = current_path + '/haarcascade_eye.xml'
+     face_haar = current_path + '/haarcascade_frontalface_default.xml'
      face_cascade = cv2.CascadeClassifier(face_haar)
      eye_cascade = cv2.CascadeClassifier(eye_haar)

      gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
+     faces = face_cascade.detectMultiScale(gray, 1.3, 5)
+
      if len(faces) == 0:
+         return 0
+
+     face_areas = []
+     images = []
+     for (x, y, w, h) in faces:
+         face_cropped = gray[y:y + h, x:x + w]
+         face_areas.append(w * h)
+         images.append(face_cropped)
+
+     required_image = images[np.argmax(face_areas)]
+     required_image = Image.fromarray(required_image).convert("RGB")  # Ensure 3 channels for the ViT processor
+     return required_image
+
+
+ # =====================================================
+ # EXPRESSION PREDICTION FUNCTION
+ # =====================================================
  def get_expression(img):
      """
+     Takes an OpenCV BGR image as input, detects the face, and returns the
+     predicted facial expression as a string.
      """

+     # Initialize the model (ViT Face Expression)
+     exp_model = facExpRec().to(device)
+     exp_model.eval()

      # Detect face
      face = detected_face(img)
      if face == 0:
+         # No face detected; fall back to the entire image
+         face = Image.fromarray(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
+
+     # Preprocess image
+     inputs = processor(images=face, return_tensors="pt").to(device)
+
+     # Inference
      with torch.no_grad():
+         outputs = exp_model.model(**inputs)
+         probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
+         pred_idx = torch.argmax(probs, dim=-1).item()
+         confidence = probs[0][pred_idx].item()
+
+     # Map the predicted index to a label (the label map lives on the model config, not the processor)
+     expression_label = exp_model.model.config.id2label.get(pred_idx, "Unknown")

+     # Return the predicted label as a string (the API expects a string)
+     return expression_label
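A short usage sketch for the rewritten module (assuming app/Hackathon_setup is importable as a package and a test image exists at the placeholder path; get_expression expects an OpenCV BGR array, as the docstring above states):

import cv2
from app.Hackathon_setup.exp_recognition import get_expression

img = cv2.imread("test_face.jpg")  # BGR array; placeholder path
print(get_expression(img))         # prints the predicted expression label as a string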
app/Hackathon_setup/exp_recognition_model.py CHANGED
@@ -1,100 +1,92 @@
- import torch.nn as nn
- import torchvision.transforms as transforms
- import numpy as np
- import cv2
- from matplotlib import pyplot as plt
  import torch
- # In the below line,remove '.' while working on your local system.However Make sure that '.' is present before face_recognition_model while uploading to the server, Do not remove it.
- from .exp_recognition_model import *
  from PIL import Image
- import base64
- import io
- import os
-
- ####################################################################################################################
- # Facial Expression Recognition Model Definition                                                                   #
- # This model is imported and used inside exp_recognition.py                                                        #
- ####################################################################################################################
-
- # Classes in the same order as your training data
- classes = ['ANGER', 'DISGUST', 'FEAR', 'HAPPINESS', 'NEUTRAL', 'SADNESS', 'SURPRISE']

- # CNN Architecture — must exactly match the model used during training
- class ExpressionCNN(nn.Module):
      def __init__(self):
-         super(ExpressionCNN, self).__init__()
-         self.features = nn.Sequential(
-             nn.Conv2d(1, 32, 3, padding=1),
-             nn.BatchNorm2d(32),
-             nn.ReLU(),
-             nn.MaxPool2d(2, 2),
-             nn.Dropout(0.25),
-
-             nn.Conv2d(32, 64, 3, padding=1),
-             nn.BatchNorm2d(64),
-             nn.ReLU(),
-             nn.MaxPool2d(2, 2),
-             nn.Dropout(0.25),
-
-             nn.Conv2d(64, 128, 3, padding=1),
-             nn.BatchNorm2d(128),
-             nn.ReLU(),
-             nn.MaxPool2d(2, 2)
-         )
-
-         self.classifier = nn.Sequential(
-             nn.Linear(128 * 16 * 16, 256),
-             nn.BatchNorm1d(256),
-             nn.ReLU(),
-             nn.Dropout(0.5),
-             nn.Linear(256, len(classes))
-         )

      def forward(self, x):
-         x = self.features(x)
-         x = x.view(x.size(0), -1)
-         return self.classifier(x)
-
-
- # Initialize model
- expression_model = ExpressionCNN()
-
- # Load trained weights from .t7 file
- current_path = os.path.dirname(os.path.abspath(__file__))
- checkpoint_path = os.path.join(current_path, 'expression_model.t7')
-
- device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
- expression_model.to(device)
-
- if os.path.exists(checkpoint_path):
-     try:
-         checkpoint = torch.load(checkpoint_path, map_location=device)
-
-         # Check if file contains 'model_state_dict' (common for checkpointed models)
-         if isinstance(checkpoint, dict) and 'model_state_dict' in checkpoint:
-             expression_model.load_state_dict(checkpoint['model_state_dict'])
          else:
-             expression_model.load_state_dict(checkpoint)
-
-         expression_model.eval()
-         print(f"Expression model loaded successfully from: {checkpoint_path}")
-     except Exception as e:
-         print(f"Error loading model weights: {e}")
- else:
-     print(f"Warning: {checkpoint_path} not found. Expression model not loaded.")
-
-
- # Transformation helper (for consistent preprocessing)
- def transform_face(image: Image.Image):
-     """
-     Resize, grayscale, and normalize the input image
-     to match the training preprocessing.
-     """
-     transform_pipeline = transforms.Compose([
-         transforms.Resize((128, 128)),
-         transforms.Grayscale(num_output_channels=1),
-         transforms.ToTensor(),
-         transforms.Normalize(mean=[0.5], std=[0.5])
-     ])
-     return transform_pipeline(image)
+ """
+ exp_recognition_model.py
+ ------------------------
+ Facial Expression Recognition using ViT (trpakov/vit-face-expression).
+ This file loads the pretrained model and processor for inference or evaluation.
+ """
+
  import torch
+ import torch.nn as nn
+ from torchvision import transforms
+ from transformers import AutoImageProcessor, AutoModelForImageClassification
  from PIL import Image

+ # =====================================================
+ # DEVICE CONFIGURATION
+ # =====================================================
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+ # =====================================================
+ # CLASS DEFINITIONS
+ # =====================================================
+ classes = {
+     0: 'ANGER',
+     1: 'DISGUST',
+     2: 'FEAR',
+     3: 'HAPPINESS',
+     4: 'NEUTRAL',
+     5: 'SADNESS',
+     6: 'SURPRISE'
+ }
+
+ # =====================================================
+ # LOAD PRETRAINED VIT FACE EXPRESSION MODEL
+ # =====================================================
+ MODEL_NAME = "trpakov/vit-face-expression"
+
+ # The processor handles resizing, normalization, etc.
+ processor = AutoImageProcessor.from_pretrained(MODEL_NAME)
+ model = AutoModelForImageClassification.from_pretrained(MODEL_NAME).to(device)
+ model.eval()
+
+ # =====================================================
+ # TRANSFORM / PREPROCESS FUNCTION
+ # =====================================================
+ def preprocess_image(img_pil: Image.Image):
+     """
+     Converts a PIL image into ViT-compatible tensors.
+     """
+     inputs = processor(images=img_pil, return_tensors="pt").to(device)
+     return inputs

+ # =====================================================
+ # MAIN MODEL WRAPPER CLASS
+ # =====================================================
+ class facExpRec(nn.Module):
+     """
+     Expression recognition wrapper around the pretrained ViT Face Expression model.
+     Provides a convenient interface for inference and integration into the app.
+     """
      def __init__(self):
+         super(facExpRec, self).__init__()
+         self.model = model
+         self.processor = processor

      def forward(self, x):
+         """
+         Forward expects a PIL image or an already preprocessed tensor dict.
+         """
+         if isinstance(x, Image.Image):
+             inputs = self.processor(images=x, return_tensors="pt").to(device)
          else:
+             inputs = x
+
+         with torch.no_grad():
+             outputs = self.model(**inputs)
+             probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
+             pred_idx = torch.argmax(probs, dim=-1).item()
+             confidence = probs[0][pred_idx].item()
+
+         return {
+             "expression": classes[pred_idx],
+             "confidence": round(confidence, 3)
+         }
+
+ # =====================================================
+ # TRANSFORMATION FUNCTION (COMPATIBILITY)
+ # =====================================================
+ # A torchvision-style transform, in case one is needed (for ImageFolder, etc.)
+ trnscm = transforms.Compose([
+     transforms.Resize((224, 224)),  # ViT models typically expect 224x224 inputs
+     transforms.ToTensor()
+ ])
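Note that two label sources now coexist: facExpRec.forward uses the hand-written classes dict above, while get_expression reads the checkpoint's own id2label map. A quick sanity check (a sketch, assuming the package is importable) to confirm the two index orderings agree:

from app.Hackathon_setup.exp_recognition_model import model, classes

print(model.config.id2label)  # label map shipped with trpakov/vit-face-expression
print(classes)                # local mapping used by facExpRec.forward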
 
 
app/Hackathon_setup/expression_model.t7 DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:c255a0a7d8adf5eaf36f179ecd3408a4ccf89924c7f6d33fe2377a488b451a11
- size 33951885