Spaces:
Running
Running
File size: 5,535 Bytes
4d4fccb d667f1f 4d4fccb d667f1f 4d4fccb ab19ad9 4d4fccb |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 |
from smolagents import CodeAgent, OpenAIServerModel
from pathlib import Path
from PIL import Image
from dotenv import load_dotenv
import os
import json
import re
load_dotenv()
class ImageAnalyzerAgent:
    """Agent for describing images using Gemini with smolagents.

    The agent talks to Gemini through its OpenAI-compatible endpoint and is
    wrapped in a tool-less smolagents ``CodeAgent``. Two operations are
    exposed: a free-text description of an image and a structured metadata
    extraction (tags, description, caption).
    """

    # Prompt sent by ``retrieve_similar_images``.
    _DESCRIBE_PROMPT = (
        "\n"
        "Describe this image in a way that could be used as a prompt for generating a new image inspired by it.\n"
        "Focus on the main subjects, composition, style, mood, and colors.\n"
        "Avoid mentioning specific names or exact details — instead, describe the overall aesthetic and atmosphere so the result feels similar but not identical.\n"
    )

    # Prompt sent by ``analyze_image_metadata``; the reply is parsed as JSON.
    _METADATA_PROMPT = (
        "\n"
        "Describe this image in the following exact format:\n"
        "result: {\n"
        '"tags": [list of tags related to the image],\n'
        '"description": "a 5-line descriptive description for the image",\n'
        '"caption": "a short description for the image"\n'
        "}\n"
    )

    def __init__(self):
        """Initialize the agent with Gemini configuration.

        Raises:
            ValueError: If ``GEMINI_API_KEY`` is not set in the environment.
        """
        api_key = os.getenv("GEMINI_API_KEY")
        if not api_key:
            raise ValueError("GEMINI_API_KEY not found in environment variables. Get one at https://aistudio.google.com/apikey")

        # Use Gemini with smolagents via the OpenAI-compatible API.
        self.model = OpenAIServerModel(
            model_id="gemini-2.0-flash",
            api_base="https://generativelanguage.googleapis.com/v1beta/openai/",
            api_key=api_key,
        )

        # The agent has no tools; it only relays the prompt and image.
        self.agent = CodeAgent(
            tools=[],
            model=self.model,
            max_steps=5,
            verbosity_level=1,
        )

    @staticmethod
    def _resolve_image_path(image_path):
        """Normalize ``image_path`` and verify that it exists.

        Strings and ``os.PathLike`` objects are converted to ``Path``; any
        other object is used as given.

        Raises:
            FileNotFoundError: If nothing exists at the resolved path.
        """
        if isinstance(image_path, (str, os.PathLike)):
            image_path = Path(image_path)
        if not image_path.exists():
            raise FileNotFoundError(f"Image not found at {image_path}")
        return image_path

    def _run_with_image(self, prompt, image_path):
        """Open the image, run the agent on ``prompt`` with it, return the reply."""
        image_path = self._resolve_image_path(image_path)
        # The context manager releases the file handle once the agent is done,
        # instead of leaving it open until garbage collection.
        with Image.open(image_path) as image:
            print(f"Loaded image: {image_path.name}\n")
            return self.agent.run(prompt, images=[image])

    @staticmethod
    def _parse_metadata_response(response):
        """Turn the agent's reply into a metadata dictionary.

        A ``dict`` reply is returned unchanged. Anything else is converted to
        text, the span from the first ``{`` to the last ``}`` is extracted and
        decoded as JSON. If no closing brace follows the opening one, a single
        ``}`` is appended before decoding.

        Raises:
            ValueError: If the reply is empty, has no opening brace, or the
                extracted span is not valid JSON.
        """
        if isinstance(response, dict):
            return response

        text_content = "" if response is None else str(response).strip()
        if not text_content:
            raise ValueError("Model returned empty response")

        start = text_content.find("{")
        if start == -1:
            raise ValueError(
                f"Failed to parse model output: {text_content}\n"
                "Error: Response does not contain valid JSON structure (missing opening brace)"
            )

        end = text_content.rfind("}")
        if end < start:
            # Best-effort repair of a truncated reply.
            print("[Warning] No closing brace found in response, attempting to add closing brace...")
            json_str = text_content[start:] + "}"
        else:
            json_str = text_content[start:end + 1]

        # JSONDecodeError is a subclass of ValueError, so it must be handled
        # on its own to produce the specific "Invalid JSON" message.
        try:
            return json.loads(json_str)
        except json.JSONDecodeError as je:
            raise ValueError(f"Invalid JSON in model output: {text_content}\nError: {je}") from je
        except RecursionError as e:
            # Pathologically nested input; keep the documented ValueError contract.
            raise ValueError(f"Failed to parse model output: {text_content}\nError: {e}") from e

    def retrieve_similar_images(self, image_path):
        """Describe a given image.

        Despite its name, this method performs no retrieval: it asks the
        model for a description suitable as a prompt for generating a
        similar image.

        Args:
            image_path: Path object or string pointing to an image file

        Returns:
            Description text of the image, as returned by the agent

        Raises:
            FileNotFoundError: If image file doesn't exist
        """
        return self._run_with_image(self._DESCRIBE_PROMPT, image_path)

    def analyze_image_metadata(self, image_path):
        """Analyze an image and extract structured metadata (tags, description, caption).

        Args:
            image_path: Path object or string pointing to an image file

        Returns:
            Dictionary parsed from the model reply; the prompt requests the
            keys tags (list), description (str), caption (str)

        Raises:
            FileNotFoundError: If image file doesn't exist
            ValueError: If response cannot be parsed into valid JSON
        """
        response = self._run_with_image(self._METADATA_PROMPT, image_path)
        return self._parse_metadata_response(response)
# Manual smoke test: describe the first of the sample uploads.
if __name__ == "__main__":
    # The uploads folder sits three directory levels above this file.
    project_root = Path(__file__).parent.parent.parent
    uploads_dir = project_root / "uploads"
    sample_image_paths = [
        uploads_dir / file_name
        for file_name in (
            "img_1_20251024_180707_942.jpg",
            "img_2_20251024_180749_372.jpeg",
            "img_3_20251024_180756_356.jpeg",
        )
    ]

    analyzer = ImageAnalyzerAgent()

    # Only the first sample is exercised.
    description = analyzer.retrieve_similar_images(sample_image_paths[0])
    print("\n=== Results ===")
    print(f"Description: {description}")