VOIDER committed (verified)
Commit e196a20 · Parent: f89e218

Upload 11 files

Files changed (2):
  1. app.py +1 -1
  2. modules/aesthetic_metrics.py +19 -19
app.py CHANGED
@@ -309,7 +309,7 @@ def create_interface():
         upload_input = gr.File(
             label="Upload Images (PNG format)",
             file_count="multiple",
-            type="file"
+            type="filepath"  # Changed from 'file' to 'filepath'
         )
         upload_button = gr.Button("Process Uploaded Images")
 
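Note on the app.py change: in recent Gradio releases the gr.File type argument takes "filepath" or "binary", and the older "file" value is rejected. The following minimal sketch is not taken from this repo (the handler name, textbox, and layout are placeholders); it only illustrates what a handler receives once type="filepath" and file_count="multiple" are set, namely a list of temporary file path strings rather than file objects.

import gradio as gr
from PIL import Image

def process_uploaded_images(paths):
    # With type="filepath" and file_count="multiple", `paths` is a list of
    # temp-file path strings that can be opened directly.
    sizes = [Image.open(p).size for p in paths]
    return f"Received {len(paths)} image(s): {sizes}"

with gr.Blocks() as demo:
    upload_input = gr.File(
        label="Upload Images (PNG format)",
        file_count="multiple",
        type="filepath",  # older Gradio accepted "file"; current releases do not
    )
    upload_button = gr.Button("Process Uploaded Images")
    result = gr.Textbox(label="Result")
    upload_button.click(process_uploaded_images, inputs=upload_input, outputs=result)

demo.launch()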
modules/aesthetic_metrics.py CHANGED
@@ -6,9 +6,8 @@ These metrics evaluate subjective aspects of images like aesthetic appeal, compo
 import torch
 import numpy as np
 from PIL import Image
-from transformers import AutoFeatureExtractor, AutoModelForImageClassification
-import clip
-from torchvision import transforms
+from transformers import AutoFeatureExtractor, AutoModelForImageClassification, CLIPProcessor, CLIPModel
+import torchvision.transforms as transforms
 
 
 class AestheticMetrics:
@@ -21,9 +20,12 @@ class AestheticMetrics:
 
     def _initialize_models(self):
         """Initialize all required models."""
-        # Initialize CLIP model for text-image similarity
+        # Initialize CLIP model for text-image similarity using transformers
         try:
-            self.clip_model, self.clip_preprocess = clip.load("ViT-B/32", device=self.device)
+            self.clip_model_name = "openai/clip-vit-base-patch32"
+            self.clip_processor = CLIPProcessor.from_pretrained(self.clip_model_name)
+            self.clip_model = CLIPModel.from_pretrained(self.clip_model_name)
+            self.clip_model.to(self.device)
             self.clip_loaded = True
         except Exception as e:
             print(f"Warning: Could not load CLIP model: {e}")
@@ -203,26 +205,24 @@ class AestheticMetrics:
             return 5.0  # Default middle score if model not loaded or no prompt
 
         try:
-            # Load and preprocess image
+            # Load image
             image = Image.open(image_path).convert('RGB')
-            image_input = self.clip_preprocess(image).unsqueeze(0).to(self.device)
 
-            # Process text
-            text_input = clip.tokenize([prompt]).to(self.device)
+            # Process inputs with CLIP processor
+            inputs = self.clip_processor(
+                text=[prompt],
+                images=image,
+                return_tensors="pt",
+                padding=True
+            ).to(self.device)
 
             # Calculate similarity
             with torch.no_grad():
-                image_features = self.clip_model.encode_image(image_input)
-                text_features = self.clip_model.encode_text(text_input)
-
-                # Normalize features
-                image_features = image_features / image_features.norm(dim=-1, keepdim=True)
-                text_features = text_features / text_features.norm(dim=-1, keepdim=True)
-
-                # Calculate similarity
-                similarity = (100.0 * image_features @ text_features.T).item()
+                outputs = self.clip_model(**inputs)
+                logits_per_image = outputs.logits_per_image
+                similarity = logits_per_image.item()
 
-            # Convert to 0-10 scale
+            # Convert to 0-10 scale (CLIP similarity is typically in 0-100 range)
             return min(10, max(0, similarity / 10))
         except Exception as e:
             print(f"Error calculating prompt similarity: {e}")