Adanbalf committed
Commit c3fead6 · verified · 1 Parent(s): b5ead13

Update app.py

Files changed (1): app.py (+91 -131)
app.py CHANGED
@@ -1,143 +1,103 @@
- import base64
- import mimetypes
  import os
- from pathlib import Path
- from typing import Any, Dict, List
-
  import gradio as gr
- from openai import OpenAI
-
-
-
-
- # Default model
- DEFAULT_MODEL = "LLaVA-OneVision-1.5-8B-Instruct"
-
- # OpenAI-compatible client (uses the Hugging Face endpoint or your own)
- _client = OpenAI(
-     base_url=os.getenv("BASE_URL", ""),
-     api_key=os.getenv("API_KEY", ""),
- )
-
-
- def _data_url(path: str) -> str:
-     mime, _ = mimetypes.guess_type(path)
-     mime = mime or "application/octet-stream"
-     data = base64.b64encode(Path(path).read_bytes()).decode("utf-8")
-     return f"data:{mime};base64,{data}"
-
-
- def _image_content(path: str) -> Dict[str, Any]:
-     return {"type": "image_url", "image_url": {"url": _data_url(path)}}
-
-
- def _text_content(text: str) -> Dict[str, Any]:
-     return {"type": "text", "text": text}
-
-
- def _message(role: str, content: Any) -> Dict[str, Any]:
-     return {"role": role, "content": content}
-
-
- def _build_user_message(message: Dict[str, Any]) -> Dict[str, Any]:
-     files = message.get("files") or []
-     text = (message.get("text") or "").strip()
-
-     # 🔹 If there is no text, add a default nutrition prompt
-     if not text:
-         text = (
-             "Analiza la imagen del plato de comida y describe los alimentos que contiene. "
-             "Indica una estimación de calorías, proteínas, carbohidratos y grasas. "
-             "Responde en formato breve y estructurado."
          )
 
-     content: List[Dict[str, Any]] = [_image_content(p) for p in files]
-     if text:
-         content.append(_text_content(text))
-     return _message("user", content)
-
-
- def _convert_history(history: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
-     msgs: List[Dict[str, Any]] = []
-     user_content: List[Dict[str, Any]] = []
-
-     for turn in history or []:
-         role, content = turn.get("role"), turn.get("content")
-         if role == "user":
-             if isinstance(content, str):
-                 user_content.append(_text_content(content))
-             elif isinstance(content, tuple):
-                 user_content.extend(_image_content(path) for path in content if path)
-         elif role == "assistant":
-             msgs.append(_message("user", user_content.copy()))
-             user_content.clear()
-             msgs.append(_message("assistant", content))
-     return msgs
-
-
- def stream_response(message: Dict[str, Any], history: List[Dict[str, Any]], model_name: str = DEFAULT_MODEL):
-     messages = _convert_history(history)
-     messages.append(_build_user_message(message))
      try:
-         stream = _client.chat.completions.create(
-             model=model_name,
-             messages=messages,
-             temperature=0.1,
-             top_p=1,
-             extra_body={
-                 "repetition_penalty": 1.05,
-                 "frequency_penalty": 0,
-                 "presence_penalty": 0
-             },
-             stream=True
-         )
-         partial = ""
-         for chunk in stream:
-             delta = chunk.choices[0].delta.content
-             if delta:
-                 partial += delta
-                 yield partial
-     except Exception as e:
-         yield f"⚠️ Error al obtener respuesta: {e}"
 
 
- def build_demo() -> gr.Blocks:
-     chatbot = gr.Chatbot(type="messages", allow_tags=["think"])
-     textbox = gr.MultimodalTextbox(
-         show_label=False,
-         placeholder="Subí una foto de tu comida para analizarla...",
-         file_types=["image"],
-         file_count="single",
-         max_plain_text_length=32768
-     )
-     model_selector = gr.Dropdown(
-         label="Modelo",
-         choices=[
-             ("LLaVA-OneVision-1.5-8B-Instruct", "LLaVA-OneVision-1.5-8B-Instruct"),
-             ("LLaVA-OneVision-1.5-4B-Instruct", "LLaVA-OneVision-1.5-4B-Instruct"),
-         ],
-         value=DEFAULT_MODEL,
-     )
-     return gr.ChatInterface(
-         fn=stream_response,
-         type="messages",
-         multimodal=True,
-         chatbot=chatbot,
-         textbox=textbox,
-         title="🍽️ NasFit Vision AI",
-         description=(
-             "Subí una foto de tu comida y NasFit IA estimará su contenido nutricional. "
-             "Basado en **LLaVA-OneVision-1.5**, modelo multimodal open source con análisis visual avanzado. "
-             "Ideal para tracking nutricional inteligente."
-         ),
-         additional_inputs=[model_selector],
-         additional_inputs_accordion=gr.Accordion("Opciones avanzadas", open=False),
-     ).queue(default_concurrency_limit=8)
-
-
- def main():
-     build_demo().launch()
 
 
  if __name__ == "__main__":
-     main()

  import os
  import gradio as gr
+ import torch
+ from PIL import Image
+ from transformers import AutoProcessor, AutoModelForVision2Seq
+ import requests
+
+ # Configuration
+ LOCAL_MODEL_ID = "lmms-lab/llava-onevision-1.5-8b-instruct"
+ API_MODEL_ID = "lmms-lab/llava-onevision-1.5-8b-instruct"
+ HF_API_URL = f"https://api-inference.huggingface.co/models/{API_MODEL_ID}"
+ HF_API_KEY = os.getenv("API_KEY")
+
+ # Initialize the local model (if a GPU is available)
+ model, processor = None, None
+ use_local = False
+
+ try:
+     print("⏳ Intentando cargar modelo local...")
+     processor = AutoProcessor.from_pretrained(LOCAL_MODEL_ID)
+     model = AutoModelForVision2Seq.from_pretrained(
+         LOCAL_MODEL_ID,
+         torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
+         device_map="auto"
+     )
+     use_local = True
+     print("✅ Modelo local cargado correctamente.")
+ except Exception as e:
+     print(f"⚠️ No se pudo cargar el modelo local: {e}")
+     print("➡️ Se usará la API de Hugging Face para inferencia remota.")
+
+ # Main function
+ def analyze_food(image, text_prompt=""):
+     if image is None:
+         return "Por favor, subí una imagen del plato."
+
+     if not text_prompt.strip():
+         text_prompt = (
+             "Analiza esta comida. Describe los alimentos, "
+             "y estima las calorías, proteínas, carbohidratos y grasas totales."
          )
 
      try:
+         if use_local:
+             # Local processing
+             inputs = processor(text=text_prompt, images=image, return_tensors="pt").to(model.device)
+             output = model.generate(**inputs, max_new_tokens=300)
+             answer = processor.decode(output[0], skip_special_tokens=True)
+             return answer
+
+         else:
+             # Fallback: use the Hugging Face Inference API
+             headers = {"Authorization": f"Bearer {HF_API_KEY}"}
+             data = {
+                 "inputs": {"image": image, "text": text_prompt},
+                 "parameters": {"max_new_tokens": 300},
+             }
+             response = requests.post(HF_API_URL, headers=headers, json=data)
+             if response.status_code != 200:
+                 return f"❌ Error remoto ({response.status_code}): {response.text}"
+             result = response.json()
+             if isinstance(result, dict) and "error" in result:
+                 return f"⚠️ Error remoto: {result['error']}"
+             return str(result)
 
+     except Exception as e:
+         return f"⚠️ Ocurrió un error al procesar la imagen: {e}"
+
+ # Gradio interface
+ def build_interface():
+     with gr.Blocks() as demo:
+         gr.Markdown(
+             """
+             # 🍽️ NasFit Vision AI
+             Subí una foto de tu comida y NasFit IA estimará su contenido nutricional.
+             Basado en **LLaVA-OneVision-1.5**, modelo multimodal open source con análisis visual avanzado.
+             *(El sistema usa GPU local si está disponible, o la API de Hugging Face si no lo está.)*
+             """
+         )
 
+         with gr.Row():
+             with gr.Column(scale=1):
+                 image_input = gr.Image(label="📸 Imagen del plato", type="pil")
+                 text_input = gr.Textbox(
+                     label="💬 Instrucción (opcional)",
+                     placeholder="Ejemplo: Cuántas proteínas tiene este plato?",
+                 )
+                 analyze_btn = gr.Button("🔍 Analizar comida")
 
+             with gr.Column(scale=1):
+                 output_text = gr.Textbox(
+                     label="🧠 Resultado del análisis",
+                     placeholder="Aquí aparecerá la descripción nutricional...",
+                     lines=8
+                 )
 
+         analyze_btn.click(fn=analyze_food, inputs=[image_input, text_input], outputs=output_text)
+     return demo
 
 
  if __name__ == "__main__":
+     demo = build_interface()
+     demo.launch()
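
One caveat with the new analyze_food: requests cannot JSON-serialize a PIL image, so the remote fallback will fail before the request is sent, and the local branch passes the raw prompt to the processor without the model's chat template. Below is a minimal sketch of an alternative (a hypothetical analyze_food_sketch that reuses the module globals defined above); the base64 "image" payload field and the availability of apply_chat_template on this processor are assumptions, not something this commit confirms.

# Sketch only — assumes processor.apply_chat_template exists for this checkpoint
# and that the Inference API accepts a base64-encoded "image" field.
import base64
import io


def analyze_food_sketch(image, text_prompt):
    if use_local:
        # Let the processor insert the image placeholder token via its chat template.
        messages = [{"role": "user",
                     "content": [{"type": "image"},
                                 {"type": "text", "text": text_prompt}]}]
        prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
        inputs = processor(text=prompt, images=image, return_tensors="pt").to(model.device)
        output = model.generate(**inputs, max_new_tokens=300)
        return processor.decode(output[0], skip_special_tokens=True)

    # Remote fallback: PIL images are not JSON-serializable, so send base64-encoded bytes.
    buf = io.BytesIO()
    image.save(buf, format="PNG")
    payload = {
        "inputs": {"image": base64.b64encode(buf.getvalue()).decode("utf-8"),  # assumed field
                   "text": text_prompt},
        "parameters": {"max_new_tokens": 300},
    }
    response = requests.post(HF_API_URL,
                             headers={"Authorization": f"Bearer {HF_API_KEY}"},
                             json=payload)
    response.raise_for_status()
    return str(response.json())

Wiring this in would only require pointing analyze_btn.click at the alternative function.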