Spaces:

userx2000
/

cloudzy_ai_challenge

Running

App Files Files Community

cloudzy_ai_challenge / cloudzy /agents /image_analyzer_2.py

matinsn2000

Used better model for text embedding

d667f1f 8 days ago

raw

history blame

5.54 kB

	from smolagents import CodeAgent, OpenAIServerModel
	from pathlib import Path
	from PIL import Image
	from dotenv import load_dotenv
	import os
	import json
	import re

	load_dotenv()


	class ImageAnalyzerAgent:
	    """Describe images with a Gemini model driven through smolagents.

	    The model is reached through Google's OpenAI-compatible endpoint and is
	    wrapped in a tool-less ``CodeAgent``. Two operations are offered: a
	    free-text description of an image and structured metadata extraction
	    (tags, description, caption).
	    """

	    # Defaults used when the constructor is called without arguments.
	    DEFAULT_MODEL_ID = "gemini-2.0-flash"
	    API_BASE = "https://generativelanguage.googleapis.com/v1beta/openai/"

	    # Prompt sent by retrieve_similar_images().
	    DESCRIBE_PROMPT = """
	    Describe this image in a way that could be used as a prompt for generating a new image inspired by it.
	    Focus on the main subjects, composition, style, mood, and colors.
	    Avoid mentioning specific names or exact details — instead, describe the overall aesthetic and atmosphere so the result feels similar but not identical.
	    """

	    # Prompt sent by analyze_image_metadata(); the reply is expected to
	    # contain a JSON object, possibly preceded by the "result:" label.
	    METADATA_PROMPT = """
	    Describe this image in the following exact format:

	    result: {
	    "tags": [list of tags related to the image],
	    "description": "a 5-line descriptive description for the image",
	    "caption": "a short description for the image"
	    }
	    """

	    def __init__(self, *, api_key=None, model_id=DEFAULT_MODEL_ID, max_steps=5):
	        """Create the model client and the agent.

	        Args:
	            api_key: Key for the Gemini endpoint. When omitted (or empty),
	                the ``GEMINI_API_KEY`` environment variable is used.
	            model_id: Model identifier passed to ``OpenAIServerModel``.
	            max_steps: Step limit passed to ``CodeAgent``.

	        Raises:
	            ValueError: If no API key is given and none is in the environment.
	        """
	        api_key = api_key or os.getenv("GEMINI_API_KEY")
	        if not api_key:
	            raise ValueError("GEMINI_API_KEY not found in environment variables. Get one at https://aistudio.google.com/apikey")

	        # Gemini is used through its OpenAI-compatible API.
	        self.model = OpenAIServerModel(
	            model_id=model_id,
	            api_base=self.API_BASE,
	            api_key=api_key,
	        )
	        self.agent = CodeAgent(
	            tools=[],
	            model=self.model,
	            max_steps=max_steps,
	            verbosity_level=1,
	        )

	    def retrieve_similar_images(self, image_path):
	        """Describe an image so the text can seed generation of a similar one.

	        NOTE: despite its name this method performs no similarity search; it
	        only asks the model for a description. The name is kept so existing
	        callers continue to work.

	        Args:
	            image_path: Path object or string pointing to an image file.

	        Returns:
	            Whatever the agent returns for the description prompt.

	        Raises:
	            FileNotFoundError: If the image file doesn't exist.
	        """
	        return self._run_on_image(image_path, self.DESCRIBE_PROMPT)

	    def analyze_image_metadata(self, image_path):
	        """Analyze an image and extract structured metadata.

	        Args:
	            image_path: Path object or string pointing to an image file.

	        Returns:
	            The dictionary produced by the model; the prompt asks for the
	            keys ``tags`` (list), ``description`` (str) and ``caption`` (str),
	            but the keys are not validated here.

	        Raises:
	            FileNotFoundError: If the image file doesn't exist.
	            ValueError: If the response is empty or holds no parsable JSON.
	        """
	        response = self._run_on_image(image_path, self.METADATA_PROMPT)
	        return self._parse_metadata_response(response)

	    def _run_on_image(self, image_path, prompt):
	        """Open the image, run the agent on it with ``prompt``, close the file."""
	        path = Path(image_path)  # accepts str and any os.PathLike
	        if not path.exists():
	            raise FileNotFoundError(f"Image not found at {path}")

	        # Image.open() is lazy and keeps the file handle open; the context
	        # manager releases it once the agent has finished with the image.
	        with Image.open(path) as image:
	            print(f"Loaded image: {path.name}\n")
	            return self.agent.run(prompt, images=[image])

	    @staticmethod
	    def _parse_metadata_response(response):
	        """Turn the agent's reply into a dictionary.

	        A dict reply is returned unchanged. Otherwise the reply is converted
	        to text and the first JSON object found in it is decoded; anything
	        before the first ``{`` (such as the ``result:`` label) or after the
	        object's closing brace is ignored.

	        Raises:
	            ValueError: If the reply is empty, has no ``{``, or the JSON is
	                invalid (the decoder error is chained as ``__cause__``).
	        """
	        if isinstance(response, dict):
	            return response

	        text = "" if response is None else str(response).strip()
	        if not text:
	            raise ValueError("Model returned empty response")

	        start = text.find("{")
	        if start == -1:
	            raise ValueError(
	                f"Failed to parse model output: {text}\n"
	                "Error: Response does not contain valid JSON structure (missing opening brace)"
	            )

	        candidate = text[start:]
	        if "}" not in candidate:
	            # Best-effort repair of a reply that was cut off before the end.
	            print("[Warning] No closing brace found in response, attempting to add closing brace...")
	            candidate += "}"

	        try:
	            # raw_decode stops at the end of the first JSON value, so trailing
	            # commentary (even one containing braces) does not break parsing.
	            parsed, _ = json.JSONDecoder().raw_decode(candidate)
	        except json.JSONDecodeError as err:
	            raise ValueError(f"Invalid JSON in model output: {text}\nError: {err}") from err
	        return parsed


	# Manual smoke test: describe the first sample image from the uploads folder.
	if __name__ == "__main__":
	    sample_names = (
	        "img_1_20251024_180707_942.jpg",
	        "img_2_20251024_180749_372.jpeg",
	        "img_3_20251024_180756_356.jpeg",
	    )
	    # The uploads folder sits three directory levels above this file.
	    uploads_dir = Path(__file__).parent.parent.parent / "uploads"
	    sample_image_paths = [uploads_dir / name for name in sample_names]

	    analyzer = ImageAnalyzerAgent()

	    # Only the first sample is exercised; the others are kept for ad-hoc runs.
	    description = analyzer.retrieve_similar_images(sample_image_paths[0])
	    print("\n=== Results ===")
	    print(f"Description: {description}")