File size: 2,732 Bytes
d7291ef
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fe5d98f
d7291ef
fe5d98f
d7291ef
 
 
 
 
 
 
5778774
 
 
 
4c43a48
5778774
 
 
 
cb372e4
5778774
 
872dec2
 
 
5778774
872dec2
 
 
 
5778774
 
 
 
 
 
872dec2
 
 
5778774
 
cb372e4
5778774
4c43a48
5778774
 
 
 
 
 
872dec2
 
 
5778774
d7291ef
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
from .vlm_service import VLMService, ModelType
from typing import Dict, Any
import asyncio
import time
import re
import json

import google.generativeai as genai


class GeminiService(VLMService):
    """Google Gemini Vision service implementation.

    Wraps a ``google.generativeai.GenerativeModel`` and turns its text reply
    (expected to be a JSON object, optionally wrapped in a Markdown code
    fence) into the caption dictionary returned by ``generate_caption``.
    """

    # EPSG values passed through unchanged; any other non-empty value is
    # reported as "OTHER".
    _ALLOWED_EPSG = frozenset({"4326", "3857", "32617", "32633", "32634", "OTHER"})

    def __init__(self, api_key: str, model: str = "gemini-1.5-flash"):
        """Configure the Gemini SDK and create the generative model.

        Args:
            api_key: API key handed to ``genai.configure``.
            model: Gemini model identifier used for generation.
        """
        super().__init__("Gemini", ModelType.GEMINI_PRO_VISION)
        # NOTE(review): the service name is fixed regardless of ``model``;
        # presumably it is a registry key - confirm against the base class.
        self.model_name = "GEMINI15"
        genai.configure(api_key=api_key)
        self.model_id = model
        self.model = genai.GenerativeModel(self.model_id)

    @staticmethod
    def _detect_mime_type(image_bytes: bytes) -> str:
        """Return the MIME type implied by the image's magic bytes.

        Recognises PNG and WebP; everything else is reported as JPEG, which
        is what this service has always sent.
        """
        if image_bytes[:8] == b"\x89PNG\r\n\x1a\n":
            return "image/png"
        if image_bytes[:4] == b"RIFF" and image_bytes[8:12] == b"WEBP":
            return "image/webp"
        return "image/jpeg"

    @staticmethod
    def _strip_code_fence(content: str) -> str:
        """Remove a surrounding Markdown code fence (```` ```json ```` or ```` ``` ````)."""
        cleaned = content.strip()
        if cleaned.startswith("```"):
            cleaned = re.sub(r"^```(?:json)?\s*", "", cleaned, flags=re.IGNORECASE)
            cleaned = re.sub(r"\s*```$", "", cleaned)
        return cleaned

    @classmethod
    def _parse_content(cls, content: str) -> Dict[str, Any]:
        """Split the model's text reply into caption parts and metadata.

        When the reply is not a JSON object, the raw text is used as both the
        caption and the analysis, and the metadata is empty.
        """
        try:
            parsed = json.loads(cls._strip_code_fence(content))
        except json.JSONDecodeError:
            parsed = None

        # Valid JSON that is not an object (list, string, number) cannot carry
        # the expected fields, so it is treated like unparseable text.
        if not isinstance(parsed, dict):
            return {
                "caption": content,
                "metadata": {},
                "description": "",
                "analysis": content,
                "recommended_actions": "",
            }

        def field(key: str) -> Any:
            # JSON null would otherwise be rendered as "None" in the caption.
            value = parsed.get(key, "")
            return "" if value is None else value

        description = field("description")
        analysis = field("analysis")
        recommended_actions = field("recommended_actions")

        metadata = parsed.get("metadata")
        if not isinstance(metadata, dict):
            metadata = {}

        epsg_value = metadata.get("epsg")
        if epsg_value:
            # Models sometimes emit the code as a number; compare as text so
            # that 4326 is not mistaken for an unknown code.
            normalized = str(epsg_value).strip()
            metadata["epsg"] = normalized if normalized in cls._ALLOWED_EPSG else "OTHER"

        # Combine all three parts for backward compatibility
        caption_text = f"Description: {description}\n\nAnalysis: {analysis}\n\nRecommended Actions: {recommended_actions}"

        return {
            "caption": caption_text,
            "metadata": metadata,
            "description": description,
            "analysis": analysis,
            "recommended_actions": recommended_actions,
        }

    async def generate_caption(self, image_bytes: bytes, prompt: str, metadata_instructions: str = "") -> Dict[str, Any]:
        """Generate caption using Google Gemini Vision.

        Args:
            image_bytes: Encoded image data sent inline with the request.
            prompt: Main instruction text.
            metadata_instructions: Extra instructions appended after a blank
                line.

        Returns:
            A dict with ``caption``, ``metadata``, ``confidence`` (always
            ``None``), ``processing_time`` (seconds spent in the model call),
            ``raw_response``, ``description``, ``analysis`` and
            ``recommended_actions``. ``raw_response`` holds the model id and,
            when the reply text could not be read, an ``error`` message.
        """
        instruction = prompt + "\n\n" + metadata_instructions

        image_part = {
            "mime_type": self._detect_mime_type(image_bytes),
            "data": image_bytes,
        }

        # The SDK call is blocking, so run it in a worker thread; a monotonic
        # clock keeps the measurement immune to system clock adjustments.
        start = time.perf_counter()
        response = await asyncio.to_thread(self.model.generate_content, [instruction, image_part])
        elapsed = time.perf_counter() - start

        raw_response: Dict[str, Any] = {"model": self.model_id}

        try:
            content = getattr(response, "text", None) or ""
        except ValueError as exc:
            # ``.text`` raises ValueError when the response carries no usable
            # text part (for example a blocked prompt); degrade to an empty
            # caption and keep the reason for the caller.
            content = ""
            raw_response["error"] = str(exc)

        parts = self._parse_content(content)

        return {
            "caption": parts["caption"],
            "metadata": parts["metadata"],
            "confidence": None,
            "processing_time": elapsed,
            "raw_response": raw_response,
            "description": parts["description"],
            "analysis": parts["analysis"],
            "recommended_actions": parts["recommended_actions"],
        }