File size: 5,535 Bytes
4d4fccb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d667f1f
 
 
 
 
 
 
 
 
 
 
 
 
 
4d4fccb
d667f1f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4d4fccb
 
 
 
 
 
 
 
 
 
 
 
 
 
ab19ad9
4d4fccb
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
from smolagents import CodeAgent, OpenAIServerModel
from pathlib import Path
from PIL import Image
from dotenv import load_dotenv
import os
import json
import re

# Load variables from a local .env file (if one exists) into the process
# environment at import time, so GEMINI_API_KEY can be read via os.getenv.
load_dotenv()


class ImageAnalyzerAgent:
    """Agent for describing images using Gemini with smolagents.

    The model is reached through Gemini's OpenAI-compatible endpoint and is
    driven by a tool-less ``CodeAgent``. Two operations are offered: a
    free-text description usable as an image-generation prompt, and
    structured metadata parsed from a JSON-like reply.
    """

    DEFAULT_MODEL_ID = "gemini-2.0-flash"
    DEFAULT_API_BASE = "https://generativelanguage.googleapis.com/v1beta/openai/"

    def __init__(
        self,
        *,
        model_id=DEFAULT_MODEL_ID,
        api_base=DEFAULT_API_BASE,
        api_key=None,
        max_steps=5,
        verbosity_level=1,
    ):
        """Initialize the agent with Gemini configuration.

        All arguments are keyword-only and optional; calling
        ``ImageAnalyzerAgent()`` behaves as before.

        Args:
            model_id: Model identifier passed to ``OpenAIServerModel``.
            api_base: Base URL of the OpenAI-compatible endpoint.
            api_key: API key; when omitted, ``GEMINI_API_KEY`` is read from
                the environment.
            max_steps: Step limit passed to ``CodeAgent``.
            verbosity_level: Verbosity level passed to ``CodeAgent``.

        Raises:
            ValueError: If no API key is given and ``GEMINI_API_KEY`` is not
                set in the environment.
        """
        api_key = api_key or os.getenv("GEMINI_API_KEY")
        if not api_key:
            raise ValueError("GEMINI_API_KEY not found in environment variables. Get one at https://aistudio.google.com/apikey")

        # Use Gemini with smolagents via OpenAI-compatible API
        self.model = OpenAIServerModel(
            model_id=model_id,
            api_base=api_base,
            api_key=api_key,
        )

        # Instantiate the agent (no extra tools are registered)
        self.agent = CodeAgent(
            tools=[],
            model=self.model,
            max_steps=max_steps,
            verbosity_level=verbosity_level,
        )

    def _run_with_image(self, prompt, image_path):
        """Validate *image_path*, open the image and run the agent on it.

        The image is opened in a ``with`` block so its file handle is
        released once the agent run finishes, even if the run raises.

        Raises:
            FileNotFoundError: If the image file doesn't exist.
        """
        path = Path(image_path)

        if not path.exists():
            raise FileNotFoundError(f"Image not found at {path}")

        with Image.open(path) as image:
            print(f"Loaded image: {path.name}\n")
            return self.agent.run(prompt, images=[image])

    @staticmethod
    def _parse_metadata_response(response):
        """Turn the agent's reply into a dictionary.

        A ``dict`` reply is returned unchanged. Anything else is converted to
        text and the first JSON object found in it is decoded; text before
        the first ``{`` and after the end of that object is ignored. If the
        text has no ``}`` after the first ``{``, one is appended before
        decoding.

        Raises:
            ValueError: If the reply is empty, has no opening brace, or does
                not contain valid JSON.
        """
        if isinstance(response, dict):
            return response

        text_content = "" if response is None else str(response).strip()
        if not text_content:
            raise ValueError("Model returned empty response")

        start = text_content.find("{")
        if start == -1:
            raise ValueError(
                f"Failed to parse model output: {text_content}\n"
                "Error: Response does not contain valid JSON structure (missing opening brace)"
            )

        candidate = text_content[start:]
        if "}" not in candidate:
            # Reply was probably cut off; try the cheapest possible repair.
            print("[Warning] No closing brace found in response, attempting to add closing brace...")
            candidate += "}"

        try:
            # raw_decode stops at the end of the first complete JSON value,
            # so trailing prose or code fences do not break the parse.
            result, _ = json.JSONDecoder().raw_decode(candidate)
        except json.JSONDecodeError as je:
            raise ValueError(f"Invalid JSON in model output: {text_content}\nError: {je}") from je
        return result

    def retrieve_similar_images(self, image_path):
        """
        Describe a given image.

        Despite its name, this method does not look up other images: it asks
        the model for a description that can serve as a prompt for generating
        a similar image.

        Args:
            image_path: Path object or string pointing to an image file

        Returns:
            Whatever the agent run returns (the description of the image)

        Raises:
            FileNotFoundError: If image file doesn't exist
        """
        prompt = """
            Describe this image in a way that could be used as a prompt for generating a new image inspired by it.
Focus on the main subjects, composition, style, mood, and colors.
Avoid mentioning specific names or exact details — instead, describe the overall aesthetic and atmosphere so the result feels similar but not identical.
            """
        return self._run_with_image(prompt, image_path)

    def analyze_image_metadata(self, image_path):
        """
        Analyze an image and extract structured metadata (tags, description, caption).

        Args:
            image_path: Path object or string pointing to an image file

        Returns:
            Dictionary decoded from the model's reply. The prompt requests
            the keys tags (list), description (str) and caption (str), but
            their presence is not validated here.

        Raises:
            FileNotFoundError: If image file doesn't exist
            ValueError: If response cannot be parsed into valid JSON
        """
        prompt = """
Describe this image in the following exact format:

result: {
  "tags": [list of tags related to the image],
  "description": "a 5-line descriptive description for the image",
  "caption": "a short description for the image"
}
        """
        response = self._run_with_image(prompt, image_path)
        return self._parse_metadata_response(response)


# Manual smoke test: describe the first of a few sample uploads
if __name__ == "__main__":
    # The uploads folder lives two directories above the one holding this file.
    uploads_dir = Path(__file__).parent.parent.parent / "uploads"
    sample_image_paths = [
        uploads_dir / filename
        for filename in (
            "img_1_20251024_180707_942.jpg",
            "img_2_20251024_180749_372.jpeg",
            "img_3_20251024_180756_356.jpeg",
        )
    ]

    agent = ImageAnalyzerAgent()

    # Only the first sample is exercised; the others are listed for convenience.
    result = agent.retrieve_similar_images(sample_image_paths[0])
    print("\n=== Results ===", f"Description: {result}", sep="\n")