Spaces:

Kousik831
/

face-similarity-demo

Sleeping

App Files Files Community

Kousik Kumar Siddavaram committed on Oct 28

Commit

bf73c48

1 Parent(s): a35f4a3

Using trpakov/vit-face-expression model for expression recognition

Browse files

Files changed (3) hide show

app/Hackathon_setup/exp_recognition.py +59 -119
app/Hackathon_setup/exp_recognition_model.py +84 -92
app/Hackathon_setup/expression_model.t7 +0 -3

app/Hackathon_setup/exp_recognition.py CHANGED Viewed

@@ -1,146 +1,86 @@
-import torch.nn as nn
-import torchvision.transforms as transforms
-from PIL import Image
 import numpy as np
 import cv2
-from matplotlib import pyplot as plt
 import torch
-# In the below line,remove '.' while working on your local system.However Make sure that '.' is present before face_recognition_model while uploading to the server, Do not remove it.
-from .exp_recognition_model import *
 from PIL import Image
 import base64
 import io
 import os
-####################################################################################################################
-# Expression Recognition Model Definition and Loader
-####################################################################################################################
-# Same class order as used in training
-classes = ['ANGER', 'DISGUST', 'FEAR', 'HAPPINESS', 'NEUTRAL', 'SADNESS', 'SURPRISE']
-# Define CNN architecture (must match training)
-class ExpressionCNN(nn.Module):
-    def __init__(self):
-        super(ExpressionCNN, self).__init__()
-        self.features = nn.Sequential(
-            nn.Conv2d(1, 32, 3, padding=1),
-            nn.BatchNorm2d(32),
-            nn.ReLU(),
-            nn.MaxPool2d(2, 2),
-            nn.Dropout(0.25),
-            nn.Conv2d(32, 64, 3, padding=1),
-            nn.BatchNorm2d(64),
-            nn.ReLU(),
-            nn.MaxPool2d(2, 2),
-            nn.Dropout(0.25),
-            nn.Conv2d(64, 128, 3, padding=1),
-            nn.BatchNorm2d(128),
-            nn.ReLU(),
-            nn.MaxPool2d(2, 2)
-        )
-        self.classifier = nn.Sequential(
-            nn.Linear(128 * 16 * 16, 256),
-            nn.BatchNorm1d(256),
-            nn.ReLU(),
-            nn.Dropout(0.5),
-            nn.Linear(256, len(classes))
-        )
-    def forward(self, x):
-        x = self.features(x)
-        x = x.view(x.size(0), -1)
-        return self.classifier(x)
-# Create model instance
-expression_model = ExpressionCNN()
-# Resolve absolute path to .t7 file
 current_path = os.path.dirname(os.path.abspath(__file__))
-checkpoint_path = os.path.join(current_path, 'expression_model.t7')
-# Load checkpoint safely
-try:
-    if os.path.exists(checkpoint_path):
-        checkpoint = torch.load(checkpoint_path, map_location=torch.device('cpu'))
-        if isinstance(checkpoint, dict) and 'model_state_dict' in checkpoint:
-            expression_model.load_state_dict(checkpoint['model_state_dict'])
-            print("Model loaded successfully (wrapped in dict).")
-        elif isinstance(checkpoint, dict):
-            expression_model.load_state_dict(checkpoint)
-            print("Model loaded successfully (state_dict).")
-        else:
-            expression_model = checkpoint
-            print("Entire model object loaded directly.")
-        expression_model.eval()
-    else:
-        print(f"Model file not found at: {checkpoint_path}")
-except Exception as e:
-    print(f"Model Load Error: {e}")
-    expression_model = None
-####################################################################################################################
-# Face Detection Helper
-####################################################################################################################
 def detected_face(image):
     """
-    Detect faces in the image and return the largest detected face.
-    If no face is detected, return 0.
     """
-    face_haar = os.path.join(current_path, 'haarcascade_frontalface_default.xml')
-    eye_haar = os.path.join(current_path, 'haarcascade_eye.xml')
     face_cascade = cv2.CascadeClassifier(face_haar)
     eye_cascade = cv2.CascadeClassifier(eye_haar)
     gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
-    faces = face_cascade.detectMultiScale(gray, scaleFactor=1.3, minNeighbors=5)
     if len(faces) == 0:
-        return 0  # No face detected
-    # Select the face with largest area
-    face_areas = [w*h for (x,y,w,h) in faces]
-    max_idx = np.argmax(face_areas)
-    x, y, w, h = faces[max_idx]
-    face_cropped = gray[y:y+h, x:x+w]
-    return Image.fromarray(face_cropped)
-####################################################################################################################
-# Main Expression Recognition Function
-####################################################################################################################
 def get_expression(img):
     """
-    Input: img -> OpenCV BGR image
-    Output: Predicted expression as string
     """
-    if expression_model is None:
-        return "Model not loaded"
-    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
-    expression_model.to(device)
-    expression_model.eval()
     # Detect face
     face = detected_face(img)
     if face == 0:
-        face = Image.fromarray(cv2.cvtColor(img, cv2.COLOR_BGR2GRAY))
-    # Transform pipeline must match training
-    transform = transforms.Compose([
-        transforms.Resize((128, 128)),
-        transforms.Grayscale(num_output_channels=1),
-        transforms.ToTensor(),
-        transforms.Normalize(mean=[0.5], std=[0.5])
-    ])
-    input_tensor = transform(face).unsqueeze(0).to(device)
-    # Predict expression
     with torch.no_grad():
-        outputs = expression_model(input_tensor)
-        _, pred = torch.max(outputs, 1)
-    return classes[pred.item()]

 import numpy as np
 import cv2
 import torch
+from matplotlib import pyplot as plt
 from PIL import Image
 import base64
 import io
 import os
+# Remove '.' if running locally; keep it for server (Spaces)
+from .exp_recognition_model import facExpRec, processor, device
+#############################################################################################################################
+#   Caution: Don't change any of the filenames, function names and definitions                                              #
+#   Always use the current_path + file_name for referring any files, without it we cannot access files on the server         #
+#############################################################################################################################
+# Current path stores absolute path of the file from where it runs.
 current_path = os.path.dirname(os.path.abspath(__file__))
+# =====================================================
+# FACE DETECTION FUNCTION
+# =====================================================
 def detected_face(image):
     """
+    Detects faces using Haar cascades and returns the face with the largest area.
+    Returns 0 if no face detected.
     """
+    eye_haar = current_path + '/haarcascade_eye.xml'
+    face_haar = current_path + '/haarcascade_frontalface_default.xml'
     face_cascade = cv2.CascadeClassifier(face_haar)
     eye_cascade = cv2.CascadeClassifier(eye_haar)
     gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
+    faces = face_cascade.detectMultiScale(gray, 1.3, 5)
     if len(faces) == 0:
+        return 0
+    face_areas = []
+    images = []
+    for (x, y, w, h) in faces:
+        face_cropped = gray[y:y + h, x:x + w]
+        face_areas.append(w * h)
+        images.append(face_cropped)
+    required_image = images[np.argmax(face_areas)]
+    required_image = Image.fromarray(required_image).convert("RGB")  # Ensure 3 channels
+    return required_image
+# =====================================================
+# EXPRESSION PREDICTION FUNCTION
+# =====================================================
 def get_expression(img):
     """
+    Takes an OpenCV BGR image as input, detects the face, and returns the
+    predicted facial expression as a string.
     """
+    # Initialize the model (ViT Face Expression)
+    exp_model = facExpRec().to(device)
+    exp_model.eval()
     # Detect face
     face = detected_face(img)
     if face == 0:
+        # No face detected — fallback to entire image
+        face = Image.fromarray(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
+    # Preprocess image
+    inputs = processor(images=face, return_tensors="pt").to(device)
+    # Inference
     with torch.no_grad():
+        outputs = exp_model.model(**inputs)
+        probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
+        pred_idx = torch.argmax(probs, dim=-1).item()
+        confidence = probs[0][pred_idx].item()
+    # Map to label
+    expression_label = exp_model.processor.config.id2label.get(pred_idx, "Unknown")
+    # Return formatted string or dict (string required by API)
+    return expression_label

app/Hackathon_setup/exp_recognition_model.py CHANGED Viewed

@@ -1,100 +1,92 @@
-import torch.nn as nn
-import torchvision.transforms as transforms
-import numpy as np
-import cv2
-from matplotlib import pyplot as plt
 import torch
-# In the below line,remove '.' while working on your local system.However Make sure that '.' is present before face_recognition_model while uploading to the server, Do not remove it.
-from .exp_recognition_model import *
 from PIL import Image
-import base64
-import io
-import os
-####################################################################################################################
-#  Facial Expression Recognition Model Definition                                                                  #
-#  This model is imported and used inside exp_recognition.py                                                       #
-####################################################################################################################
-# Classes in the same order as your training data
-classes = ['ANGER', 'DISGUST', 'FEAR', 'HAPPINESS', 'NEUTRAL', 'SADNESS', 'SURPRISE']
-# CNN Architecture — must exactly match the model used during training
-class ExpressionCNN(nn.Module):
     def __init__(self):
-        super(ExpressionCNN, self).__init__()
-        self.features = nn.Sequential(
-            nn.Conv2d(1, 32, 3, padding=1),
-            nn.BatchNorm2d(32),
-            nn.ReLU(),
-            nn.MaxPool2d(2, 2),
-            nn.Dropout(0.25),
-            nn.Conv2d(32, 64, 3, padding=1),
-            nn.BatchNorm2d(64),
-            nn.ReLU(),
-            nn.MaxPool2d(2, 2),
-            nn.Dropout(0.25),
-            nn.Conv2d(64, 128, 3, padding=1),
-            nn.BatchNorm2d(128),
-            nn.ReLU(),
-            nn.MaxPool2d(2, 2)
-        )
-        self.classifier = nn.Sequential(
-            nn.Linear(128 * 16 * 16, 256),
-            nn.BatchNorm1d(256),
-            nn.ReLU(),
-            nn.Dropout(0.5),
-            nn.Linear(256, len(classes))
-        )
     def forward(self, x):
-        x = self.features(x)
-        x = x.view(x.size(0), -1)
-        return self.classifier(x)
-# Initialize model
-expression_model = ExpressionCNN()
-# Load trained weights from .t7 file
-current_path = os.path.dirname(os.path.abspath(__file__))
-checkpoint_path = os.path.join(current_path, 'expression_model.t7')
-device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
-expression_model.to(device)
-if os.path.exists(checkpoint_path):
-    try:
-        checkpoint = torch.load(checkpoint_path, map_location=device)
-        # Check if file contains 'model_state_dict' (common for checkpointed models)
-        if isinstance(checkpoint, dict) and 'model_state_dict' in checkpoint:
-            expression_model.load_state_dict(checkpoint['model_state_dict'])
         else:
-            expression_model.load_state_dict(checkpoint)
-        expression_model.eval()
-        print(f"Expression model loaded successfully from: {checkpoint_path}")
-    except Exception as e:
-        print(f"Error loading model weights: {e}")
-else:
-    print(f"Warning: {checkpoint_path} not found. Expression model not loaded.")
-# Transformation helper (for consistent preprocessing)
-def transform_face(image: Image.Image):
-    """
-    Resize, grayscale, and normalize the input image
-    to match the training preprocessing.
-    """
-    transform_pipeline = transforms.Compose([
-        transforms.Resize((128, 128)),
-        transforms.Grayscale(num_output_channels=1),
-        transforms.ToTensor(),
-        transforms.Normalize(mean=[0.5], std=[0.5])
-    ])
-    return transform_pipeline(image)

+"""
+exp_recognition_model.py
+------------------------
+Facial Expression Recognition using ViT (trpakov/vit-face-expression).
+This file loads the pretrained model and processor for inference or evaluation.
+"""
 import torch
+import torch.nn as nn
+from torchvision import transforms
+from transformers import AutoImageProcessor, AutoModelForImageClassification
 from PIL import Image
+# =====================================================
+# DEVICE CONFIGURATION
+# =====================================================
+# Use the GPU when CUDA is available, otherwise fall back to the CPU.
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+# =====================================================
+# CLASS DEFINITIONS
+# =====================================================
+# Maps a predicted class index to the upper-case label returned by
+# facExpRec.forward.
+# NOTE(review): this order is assumed to match the checkpoint's own
+# index-to-label mapping — confirm against the model config.
+classes = {
+    0: 'ANGER',
+    1: 'DISGUST',
+    2: 'FEAR',
+    3: 'HAPPINESS',
+    4: 'NEUTRAL',
+    5: 'SADNESS',
+    6: 'SURPRISE'
+}
+# =====================================================
+# LOAD PRETRAINED VIT FACE EXPRESSION MODEL
+# =====================================================
+MODEL_NAME = "trpakov/vit-face-expression"
+# The processor handles resize, normalization, etc.
+processor = AutoImageProcessor.from_pretrained(MODEL_NAME)
+# Loaded once when this module is imported, moved to `device`, and switched
+# to evaluation mode.
+model = AutoModelForImageClassification.from_pretrained(MODEL_NAME).to(device)
+model.eval()
+# =====================================================
+# TRANSFORM / PREPROCESS FUNCTION
+# =====================================================
+def preprocess_image(img_pil: Image.Image):
+    """
+    Converts a PIL image into ViT-compatible tensors.
+    """
+    inputs = processor(images=img_pil, return_tensors="pt").to(device)
+    return inputs
+# =====================================================
+# MAIN MODEL WRAPPER CLASS
+# =====================================================
+class facExpRec(nn.Module):
+    """
+    Expression recognition model wrapper around pretrained ViT Face Expression.
+    Provides convenience for inference and integration into app.
+    """
     def __init__(self):
+        super(facExpRec, self).__init__()
+        self.model = model
+        self.processor = processor
     def forward(self, x):
+        """
+        Forward expects a PIL image or preprocessed tensor.
+        """
+        if isinstance(x, Image.Image):
+            inputs = self.processor(images=x, return_tensors="pt").to(device)
         else:
+            inputs = x
+        with torch.no_grad():
+            outputs = self.model(**inputs)
+            probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
+            pred_idx = torch.argmax(probs, dim=-1).item()
+            confidence = probs[0][pred_idx].item()
+        return {
+            "expression": classes[pred_idx],
+            "confidence": round(confidence, 3)
+        }
+# =====================================================
+# TRANSFORMATION FUNCTION (COMPATIBILITY)
+# =====================================================
+# If you need a torchvision-like transform (for ImageFolder, etc.)
+# NOTE(review): unlike `processor`, this pipeline only resizes and converts
+# to a tensor; whether inputs built this way also need normalization should
+# be confirmed against the processor's settings.
+trnscm = transforms.Compose([
+    transforms.Resize((224, 224)),  # ViT models typically expect 224x224
+    transforms.ToTensor()
+])

app/Hackathon_setup/expression_model.t7 DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:c255a0a7d8adf5eaf36f179ecd3408a4ccf89924c7f6d33fe2377a488b451a11
-size 33951885