Spaces:
Sleeping
Sleeping
File size: 3,845 Bytes
d7291ef 65933cd d7291ef 1686de5 d7291ef fe5d98f d7291ef fe5d98f d7291ef 65933cd d7291ef 65933cd d7291ef 4c43a48 5778774 65933cd 4c43a48 5778774 65933cd 4c43a48 65933cd 4c43a48 5778774 d7291ef 872dec2 d7291ef 872dec2 d7291ef 4c43a48 65933cd 872dec2 d7291ef 5778774 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 |
from .vlm_service import VLMService, ModelType
from typing import Dict, Any
import openai
import base64
import asyncio
import json
class GPT4VService(VLMService):
    """GPT-4 Vision service implementation.

    Sends an image together with a text prompt to the OpenAI chat-completions
    endpoint and extracts the JSON object the model is expected to return.
    """

    def __init__(self, api_key: str):
        """Register the service and create the OpenAI client.

        Args:
            api_key: API key handed to ``openai.OpenAI``.
        """
        super().__init__("GPT4V", ModelType.GPT4V)
        self.client = openai.OpenAI(api_key=api_key)
        # NOTE(review): this label differs from the "gpt-4o" model id sent to
        # the API below; presumably it is display-only -- confirm in VLMService.
        self.model_name = "GPT-4O"

    @staticmethod
    def _parse_metadata(content) -> Dict[str, Any]:
        """Extract a JSON object from the model's reply on a best-effort basis.

        Attempts, in order:
          1. the whole reply, with a surrounding Markdown code fence removed;
          2. the first fenced json block embedded anywhere in the reply;
          3. a flat object that contains a ``"metadata"`` key holding one
             nested object.

        Args:
            content: Raw reply text. ``None`` or an empty string is tolerated.

        Returns:
            The first candidate that decodes to a dict, or ``{}`` when none
            does.
        """
        # Kept function-local, as in the original, so the file's import block
        # does not have to change.
        import re

        if not content:
            return {}

        # Attempt 1: the reply is nothing but JSON, optionally fenced.
        cleaned = content.strip()
        if cleaned.startswith("```json"):
            cleaned = cleaned[7:]
        elif cleaned.startswith("```"):
            # Fence without a language tag.
            cleaned = cleaned[3:]
        if cleaned.endswith("```"):
            cleaned = cleaned[:-3]
        try:
            parsed = json.loads(cleaned.strip())
        except json.JSONDecodeError:
            parsed = None
        # Only an object is usable; callers rely on ``.get``.
        if isinstance(parsed, dict):
            return parsed

        # Attempt 2: a fenced json block surrounded by other text.
        fence_pos = content.find("```json")
        if fence_pos != -1:
            json_start = fence_pos + 7
            json_end = content.find("```", json_start)
            if json_end > json_start:
                try:
                    parsed = json.loads(content[json_start:json_end].strip())
                except json.JSONDecodeError as e:
                    print(f"JSON parse error: {e}")
                else:
                    if isinstance(parsed, dict):
                        return parsed

        # Attempt 3: an unfenced object embedded in prose. The pattern must
        # include the outer closing brace, otherwise the match is unbalanced
        # and can never be decoded.
        json_match = re.search(
            r'\{[^{}]*"metadata"[^{}]*\{[^{}]*\}[^{}]*\}', content
        )
        if json_match:
            try:
                parsed = json.loads(json_match.group())
            except json.JSONDecodeError:
                # Deliberate best-effort: fall through to the empty result.
                parsed = None
            if isinstance(parsed, dict):
                return parsed

        return {}

    async def generate_caption(self, image_bytes: bytes, prompt: str, metadata_instructions: str = "") -> Dict[str, Any]:
        """Generate caption using GPT-4 Vision.

        Args:
            image_bytes: Raw image data. It is base64-encoded and sent as a
                data URL that is always labelled ``image/jpeg``.
            prompt: Instruction text sent alongside the image.
            metadata_instructions: Extra text appended to ``prompt`` after a
                blank line.

        Returns:
            A dict with the keys ``caption`` (the three sections joined into
            one string), ``raw_response``, ``metadata``, ``description``,
            ``analysis`` and ``recommended_actions``. The section values are
            empty strings when the reply held no parsable JSON object.

        Raises:
            Exception: Any failure while calling the API or processing the
                reply, re-raised with a "GPT-4 Vision API error" prefix and
                the original exception attached as the cause.
        """
        try:
            image_base64 = base64.b64encode(image_bytes).decode('utf-8')

            # The OpenAI client call is blocking; run it in a worker thread so
            # the event loop is not stalled.
            response = await asyncio.to_thread(
                self.client.chat.completions.create,
                model="gpt-4o",
                messages=[
                    {
                        "role": "user",
                        "content": [
                            {"type": "text", "text": prompt + "\n\n" + metadata_instructions},
                            {
                                "type": "image_url",
                                "image_url": {
                                    "url": f"data:image/jpeg;base64,{image_base64}"
                                }
                            }
                        ]
                    }
                ],
                max_tokens=800
            )

            # The message content can be absent; treat that as an empty reply
            # instead of failing on ``None.strip()``.
            content = response.choices[0].message.content or ""
            metadata = self._parse_metadata(content)

            # Extract the three parts from the parsed JSON
            description = metadata.get("description", "")
            analysis = metadata.get("analysis", "")
            recommended_actions = metadata.get("recommended_actions", "")

            # Combine all three parts for backward compatibility
            combined_content = f"Description: {description}\n\nAnalysis: {analysis}\n\nRecommended Actions: {recommended_actions}"

            return {
                "caption": combined_content,
                "raw_response": {
                    "content": content,
                    "metadata": metadata,
                    "extracted_metadata": metadata
                },
                "metadata": metadata,
                "description": description,
                "analysis": analysis,
                "recommended_actions": recommended_actions
            }
        except Exception as e:
            # Chain the cause so the original traceback is preserved.
            raise Exception(f"GPT-4 Vision API error: {str(e)}") from e