# Image description service — generates tags/description/caption for an image
# via Hugging Face's OpenAI-compatible router (Qwen3-VL).
# (Scraper page-chrome removed: Space status lines, file size, commit hashes,
# and line-number gutter were extraction artifacts, not source code.)
import os
import json
from openai import OpenAI
from dotenv import load_dotenv
load_dotenv()
class ImageDescriber:
    """
    Class for generating descriptive metadata (tags, description, caption)
    for an image using Hugging Face's inference endpoint via OpenAI client.
    """

    def __init__(self):
        """Initialize the OpenAI-compatible client against the HF router.

        Raises:
            ValueError: if the HF_TOKEN_1 environment variable is not set.
        """
        # Read token from environment variable (load_dotenv at module top
        # may have populated it from a .env file).
        api_key = os.getenv("HF_TOKEN_1")
        if not api_key:
            raise ValueError("Environment variable HF_TOKEN_1 is not set.")
        # Initialize client against the Hugging Face router endpoint.
        self.client = OpenAI(
            base_url="https://router.huggingface.co/v1",
            api_key=api_key,
        )
        # Vision-language model, served through the "novita" provider.
        self.model = "Qwen/Qwen3-VL-8B-Instruct:novita"

    def describe_image(self, image_url: str) -> dict:
        """
        Sends the image to the model and returns a structured dictionary:
        {
            "tags": [...],
            "description": "...",
            "caption": "..."
        }

        Args:
            image_url: Publicly reachable URL of the image to describe.

        Returns:
            The parsed dict extracted from the model's reply.

        Raises:
            ValueError: if the model returns an empty reply or the reply
                cannot be parsed as a JSON object.
        """
        # Prompt for structured output
        prompt = """
Describe this image in the following exact format:
result: {
"tags": [list of tags related to the image],
"description": "a 5-line descriptive description for the image",
"caption": "a short description for the image"
}
"""
        # Send request (text prompt + image URL in one user message).
        completion = self.client.chat.completions.create(
            model=self.model,
            messages=[
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": prompt},
                        {"type": "image_url", "image_url": {"url": image_url}},
                    ],
                }
            ],
        )
        # Extract message text with robust type handling: content may be
        # None or a non-string payload depending on provider.
        message = completion.choices[0].message
        if message.content is None:
            text_content = ""
        else:
            text_content = str(message.content).strip()
        if not text_content:
            raise ValueError("Model returned empty response")
        return self._extract_result(text_content)

    @staticmethod
    def _extract_result(text_content: str) -> dict:
        """Extract and parse the first {...} JSON object from model output.

        The model is prompted to answer as ``result: { ... }``, so we locate
        the first opening brace and the last closing brace and parse the
        slice between them. A missing closing brace is repaired by appending
        one (best effort for truncated replies).

        Raises:
            ValueError: if no opening brace is present or the slice is not
                valid JSON.
        """
        try:
            if "{" not in text_content:
                raise ValueError("Response does not contain valid JSON structure (missing opening brace)")
            start = text_content.index("{")
            # Try to find closing brace
            if "}" not in text_content[start:]:
                # No closing brace found, try adding one (truncated reply).
                print("[Warning] No closing brace found in response, attempting to add closing brace...")
                json_str = text_content[start:] + "}"
            else:
                end = text_content.rindex("}") + 1
                json_str = text_content[start:end]
            return json.loads(json_str)
        # BUG FIX: json.JSONDecodeError is a subclass of ValueError, so it
        # must be caught BEFORE ValueError — in the original ordering this
        # clause was unreachable and JSON errors were mis-labelled.
        except json.JSONDecodeError as je:
            raise ValueError(f"Invalid JSON in model output: {text_content}\nError: {je}") from je
        except ValueError as ve:
            raise ValueError(f"Failed to parse model output: {text_content}\nError: {ve}") from ve
        except Exception as e:
            raise ValueError(f"Failed to parse model output: {text_content}\nError: {e}") from e
def main():
    """
    Entry point: takes image URL as input and prints parsed description.
    """
    # Sample image hosted on the project's Space (implicit string
    # concatenation keeps the URL identical at runtime).
    image_url = (
        "https://userx2000-cloudzy-ai-challenge.hf.space/uploads/"
        "img_2_20251024_082115_102.jpeg"
    )
    describer = ImageDescriber()
    parsed = describer.describe_image(image_url)
    print("\n✅ Extracted Result:\n")
    print(json.dumps(parsed, indent=2))


if __name__ == "__main__":
    main()