Upload 11 files
- app.py +1 -1
- modules/aesthetic_metrics.py +19 -19
app.py
CHANGED
@@ -309,7 +309,7 @@ def create_interface():
             upload_input = gr.File(
                 label="Upload Images (PNG format)",
                 file_count="multiple",
-                type="file"
+                type="filepath"  # Changed from 'file' to 'filepath'
             )
             upload_button = gr.Button("Process Uploaded Images")
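With type="filepath", the upload handler receives plain file-path strings rather than wrapped temp-file objects. A minimal sketch of how the updated component could be wired up, assuming Gradio 4.x (the process_uploads handler and the gr.JSON output are illustrative, not part of this commit):

import gradio as gr
from PIL import Image

def process_uploads(paths):
    # With type="filepath" and file_count="multiple", Gradio passes a list of
    # local file-path strings, so each image can be opened directly with PIL.
    results = []
    for path in paths or []:
        with Image.open(path) as img:
            results.append({"file": path, "size": img.size, "mode": img.mode})
    return results

with gr.Blocks() as demo:
    upload_input = gr.File(label="Upload Images (PNG format)",
                           file_count="multiple", type="filepath")
    upload_button = gr.Button("Process Uploaded Images")
    upload_output = gr.JSON(label="Upload summary")
    upload_button.click(process_uploads, inputs=upload_input, outputs=upload_output)

# demo.launch()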
modules/aesthetic_metrics.py
CHANGED
@@ -6,9 +6,8 @@ These metrics evaluate subjective aspects of images like aesthetic appeal, compo
 import torch
 import numpy as np
 from PIL import Image
-from transformers import AutoFeatureExtractor, AutoModelForImageClassification
-import clip
-from torchvision import transforms
+from transformers import AutoFeatureExtractor, AutoModelForImageClassification, CLIPProcessor, CLIPModel
+import torchvision.transforms as transforms
 
 
 class AestheticMetrics:
@@ -21,9 +20,12 @@ class AestheticMetrics:
 
     def _initialize_models(self):
         """Initialize all required models."""
-        # Initialize CLIP model for text-image similarity
+        # Initialize CLIP model for text-image similarity using transformers
         try:
-            self.clip_model, self.clip_preprocess = clip.load(...)
+            self.clip_model_name = "openai/clip-vit-base-patch32"
+            self.clip_processor = CLIPProcessor.from_pretrained(self.clip_model_name)
+            self.clip_model = CLIPModel.from_pretrained(self.clip_model_name)
+            self.clip_model.to(self.device)
             self.clip_loaded = True
         except Exception as e:
             print(f"Warning: Could not load CLIP model: {e}")
@@ -203,26 +205,24 @@
             return 5.0  # Default middle score if model not loaded or no prompt
 
         try:
-            # Load
+            # Load image
            image = Image.open(image_path).convert('RGB')
-            image_input = self.clip_preprocess(image).unsqueeze(0).to(self.device)
 
-            # Process
-            text_input = clip.tokenize([prompt]).to(self.device)
+            # Process inputs with CLIP processor
+            inputs = self.clip_processor(
+                text=[prompt],
+                images=image,
+                return_tensors="pt",
+                padding=True
+            ).to(self.device)
 
             # Calculate similarity
             with torch.no_grad():
-                image_features = self.clip_model.encode_image(image_input)
-                text_features = self.clip_model.encode_text(text_input)
-
-                # Normalize features
-                image_features = image_features / image_features.norm(dim=-1, keepdim=True)
-                text_features = text_features / text_features.norm(dim=-1, keepdim=True)
-
-                # Calculate similarity
-                similarity = (100.0 * image_features @ text_features.T).item()
+                outputs = self.clip_model(**inputs)
+                logits_per_image = outputs.logits_per_image
+                similarity = logits_per_image.item()
 
-            # Convert to 0-10 scale
+            # Convert to 0-10 scale (CLIP similarity is typically in 0-100 range)
             return min(10, max(0, similarity / 10))
         except Exception as e:
             print(f"Error calculating prompt similarity: {e}")
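The same transformers-based similarity computation can be exercised outside the class. A minimal, self-contained sketch under the same assumptions as the new code (the clip_prompt_similarity helper and the example file name are illustrative, not part of the repository):

import torch
from PIL import Image
from transformers import CLIPModel, CLIPProcessor

def clip_prompt_similarity(image_path, prompt, device="cpu"):
    """Score how well an image matches a text prompt on a 0-10 scale."""
    model_name = "openai/clip-vit-base-patch32"
    processor = CLIPProcessor.from_pretrained(model_name)
    model = CLIPModel.from_pretrained(model_name).to(device).eval()

    image = Image.open(image_path).convert("RGB")
    inputs = processor(text=[prompt], images=image,
                       return_tensors="pt", padding=True).to(device)

    with torch.no_grad():
        # logits_per_image is the image-text cosine similarity scaled by the
        # model's learned logit scale (~100), so dividing by 10 maps typical
        # values roughly onto a 0-10 range, matching the method above.
        similarity = model(**inputs).logits_per_image.item()

    return min(10, max(0, similarity / 10))

# Example: clip_prompt_similarity("sample.png", "a watercolor painting of a cat")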