cloudzy_ai_challenge / cloudzy /agents /image_analyzer_2.py
matinsn2000's picture
Used better model for text embedding
d667f1f
raw
history blame
5.54 kB
from smolagents import CodeAgent, OpenAIServerModel
from pathlib import Path
from PIL import Image
from dotenv import load_dotenv
import os
import json
import re
load_dotenv()
class ImageAnalyzerAgent:
"""Agent for describing images using Gemini with smolagents"""
def __init__(self):
"""Initialize the agent with Gemini configuration"""
# Configure Gemini with smolagents using OpenAI-compatible endpoint
api_key = os.getenv("GEMINI_API_KEY")
if not api_key:
raise ValueError("GEMINI_API_KEY not found in environment variables. Get one at https://aistudio.google.com/apikey")
# Use Gemini with smolagents via OpenAI-compatible API
self.model = OpenAIServerModel(
model_id="gemini-2.0-flash",
api_base="https://generativelanguage.googleapis.com/v1beta/openai/",
api_key=api_key
)
# Instantiate the agent
self.agent = CodeAgent(
tools=[],
model=self.model,
max_steps=5,
verbosity_level=1
)
def retrieve_similar_images(self, image_path):
"""
Describe a given image.
Args:
image_path: Path object or string pointing to an image file
Returns:
Description text of the image
"""
image_path = Path(image_path) if isinstance(image_path, str) else image_path
if not image_path.exists():
raise FileNotFoundError(f"Image not found at {image_path}")
image = Image.open(image_path)
print(f"Loaded image: {image_path.name}\n")
response = self.agent.run(
"""
Describe this image in a way that could be used as a prompt for generating a new image inspired by it.
Focus on the main subjects, composition, style, mood, and colors.
Avoid mentioning specific names or exact details — instead, describe the overall aesthetic and atmosphere so the result feels similar but not identical.
""",
images=[image]
)
return response
def analyze_image_metadata(self, image_path):
"""
Analyze an image and extract structured metadata (tags, description, caption).
Args:
image_path: Path object or string pointing to an image file
Returns:
Dictionary with keys: tags (list), description (str), caption (str)
Raises:
FileNotFoundError: If image file doesn't exist
ValueError: If response cannot be parsed into valid JSON
"""
image_path = Path(image_path) if isinstance(image_path, str) else image_path
if not image_path.exists():
raise FileNotFoundError(f"Image not found at {image_path}")
image = Image.open(image_path)
print(f"Loaded image: {image_path.name}\n")
prompt = """
Describe this image in the following exact format:
result: {
"tags": [list of tags related to the image],
"description": "a 5-line descriptive description for the image",
"caption": "a short description for the image"
}
"""
response = self.agent.run(prompt, images=[image])
# If response is already a dict, return it directly
if isinstance(response, dict):
return response
# Safely convert to string, handling non-string types
if response is None:
text_content = ""
else:
text_content = str(response).strip()
if not text_content:
raise ValueError("Model returned empty response")
# Try to extract JSON-like dict from model output
try:
if "{" not in text_content:
raise ValueError("Response does not contain valid JSON structure (missing opening brace)")
start = text_content.index("{")
# Try to find closing brace
if "}" not in text_content[start:]:
# No closing brace found, try adding one
print(f"[Warning] No closing brace found in response, attempting to add closing brace...")
json_str = text_content[start:] + "}"
else:
end = text_content.rindex("}") + 1
json_str = text_content[start:end]
result = json.loads(json_str)
return result
except ValueError as ve:
raise ValueError(f"Failed to parse model output: {text_content}\nError: {ve}")
except json.JSONDecodeError as je:
raise ValueError(f"Invalid JSON in model output: {text_content}\nError: {je}")
except Exception as e:
raise ValueError(f"Failed to parse model output: {text_content}\nError: {e}")
# Test with sample images
if __name__ == "__main__":
uploads_dir = Path(__file__).parent.parent.parent / "uploads"
sample_image_paths = [
uploads_dir / "img_1_20251024_180707_942.jpg",
uploads_dir / "img_2_20251024_180749_372.jpeg",
uploads_dir / "img_3_20251024_180756_356.jpeg",
]
agent = ImageAnalyzerAgent()
# Test with first sample image
result = agent.retrieve_similar_images(sample_image_paths[0])
print(f"\n=== Results ===")
print(f"Description: {result}")
# print(f"Similar images found: {len(result['similar_images'])}")