|
|
|
|
|
readme_content = '''# 🦙 Llama 3.2 3B Chat - Hugging Face Space

A Hugging Face Space for chatting with Meta Llama 3.2 3B Instruct, with a request queue, real-time streaming, and an API for a Python client.

## ✨ Features

- 🔄 **Request queue**: Only one request is processed at a time, so the model is never overloaded
- 📡 **Real-time streaming**: Watch the response as it is being generated
- 🐍 **Python client**: Full API for integrating with Python applications
- 💬 **Web interface**: Interactive chat with system prompts and configurable settings
- 📊 **Monitoring**: Live queue status
- 🔐 **Authentication**: Support for gated models via an HF token
|
## 🚀 Space Setup

### 1. Create the Space

1. Go to [Hugging Face Spaces](https://huggingface.co/new-space)
2. Choose **Gradio** as the SDK
3. Select **T4 small** or better as the hardware
4. Name your Space (e.g. `tu-usuario/llama-chat`)

### 2. Configure the HF token

1. Open your Space's **Settings**
2. Under **Repository secrets**, add:
   - **Name**: `HF_TOKEN`
   - **Value**: your Hugging Face token (with access to Llama)
|
### 3. Upload the files

Upload these files to your Space:

- `app.py` (main application)
- `requirements.txt` (dependencies)

A minimal `requirements.txt` sketch follows;
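the package list below is an assumption (no pinned versions), so match it to what your `app.py` actually imports:

```text
gradio
transformers
torch
accelerate
```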
|
|
|
|
|
### 4. Verify the deployment

Once the Space is running, you should see:

- A chat interface in the main tab
- An API endpoint in the second tab
- The queue status refreshing automatically
|
## 📱 Using the web interface

### Main Chat

- **System Prompt**: Defines the assistant's behavior
- **Message**: Your question or message
- **Max Tokens**: Maximum response length (50-1024)
- **Temperature**: Response creativity (0.1-2.0)

### Queue Status

- **queue_size**: Number of pending requests
- **is_processing**: Whether a request is currently being processed
- **timestamp**: Time of the last update
|
## 🐍 Python Client

### Installation

```bash
pip install requests
```

The examples below assume the project's `client.py` (which provides `LlamaClient`) sits next to your script.

### Basic Usage

```python
from client import LlamaClient

# Initialize the client with your Space URL
client = LlamaClient("https://tu-usuario-llama-chat.hf.space")

# Simple chat
response = client.chat(
    message="What is artificial intelligence?",
    system_prompt="You are an expert teacher."
)

print(response["response"])
```
|
### Streaming Chat

```python
# Watch the response being generated in real time
for chunk in client.chat_stream(
    message="Explain quantum physics",
    system_prompt="You are a science communicator.",
    max_tokens=300
):
    print(f"\\r{chunk['response']}", end="", flush=True)

    if chunk.get("is_complete", False):
        print("\\n[Done]")
        break
```
|
### Chat with History

```python
# Keep a running conversation
history = [
    ["Hi", "Hello! How can I help you?"],
    ["Explain machine learning", "Machine learning is..."]
]

response = client.chat(
    message="Can you give a practical example?",
    history=history
)
```
|
## 🔧 API Endpoints

### POST /call/api_chat

Full response, no streaming.

**Payload:**

```json
{
  "data": [
    "system_prompt",
    "message",
    [["user", "assistant"], ...],
    512,
    0.7
  ]
}
```

**Response:**

```json
{
  "data": [{
    "response": "Model response",
    "queue_status": {
      "queue_size": 0,
      "is_processing": false,
      "timestamp": "2025-10-16T17:30:00"
    }
  }]
}
```
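For reference, here is a minimal `requests` sketch for calling this endpoint directly. It assumes the endpoint returns the JSON above in a single POST; depending on the Gradio version, the `/call` API may instead return an `event_id` that has to be fetched with a follow-up GET, which is exactly what `LlamaClient` abstracts away:

```python
import requests

SPACE_URL = "https://tu-usuario-llama-chat.hf.space"  # replace with your Space URL

payload = {
    "data": [
        "You are a helpful assistant.",      # system_prompt
        "What is artificial intelligence?",  # message
        [],                                  # history: [["user", "assistant"], ...]
        512,                                 # max_tokens
        0.7,                                 # temperature
    ]
}

resp = requests.post(f"{SPACE_URL}/call/api_chat", json=payload, timeout=600)
resp.raise_for_status()

result = resp.json()["data"][0]
print(result["response"])
print(result["queue_status"])
```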
|
|
|
|
|
### POST /call/api_chat_stream

Streaming response.

Same payload structure, but the result is delivered as SSE events.
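Below is a minimal sketch of consuming that stream with `requests`; the exact event format is an assumption that depends on `app.py` and the Gradio version, and in practice `client.chat_stream()` shown earlier handles this for you:

```python
import json
import requests

SPACE_URL = "https://tu-usuario-llama-chat.hf.space"  # replace with your Space URL
payload = {"data": ["You are a helpful assistant.", "Explain quantum physics", [], 300, 0.7]}

with requests.post(f"{SPACE_URL}/call/api_chat_stream", json=payload, stream=True, timeout=600) as resp:
    resp.raise_for_status()
    for raw_line in resp.iter_lines():
        line = raw_line.decode("utf-8") if raw_line else ""
        if line.startswith("data:"):
            chunk = json.loads(line[len("data:"):])
            print(chunk)  # partial response plus queue_status
```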
|
|
|
|
|
## 📊 Monitoring and Debugging

### Space Logs

Check the logs in the HF Spaces interface when debugging.

### Queue Status

Use `client.get_queue_status()` to monitor the queue:

```python
status = client.get_queue_status()
print(f"Queue: {status['queue_size']} pending requests")
print(f"Processing: {status['is_processing']}")
```

### Error Handling

```python
response = client.chat("Hello")

if "error" in response:
    print(f"Error: {response['error']}")
else:
    print(f"Response: {response['response']}")
```
|
## ⚙️ Advanced Configuration

### Model Parameters

- **max_tokens**: 50-1024 (recommended: 512)
- **temperature**: 0.1-2.0 (recommended: 0.7)
- **repetition_penalty**: Automatic (1.1)

### Performance Optimization

1. **Hardware**: Use at least a T4 small GPU
2. **Batch size**: The request queue prevents memory issues
3. **Context length**: Up to 2048 input tokens

A quick way to pre-check that input limit is sketched below;
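the ~4 characters per token heuristic mirrors `estimate_tokens` in `utils.py`, while exact counts would require the model's tokenizer:

```python
def estimate_tokens(text: str) -> int:
    # Rough heuristic: ~4 characters per token
    return len(text) // 4

MAX_CONTEXT_LENGTH = 2048  # same limit as config.py

prompt = "Explain quantum physics in detail. " * 400  # deliberately oversized example
if estimate_tokens(prompt) > MAX_CONTEXT_LENGTH:
    print("Prompt likely exceeds the context window; trim the message or the history")
```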
|
|
|
|
|
### Useful System Prompts

```python
# For academic tasks
system_prompt = "You are an expert tutor who explains complex concepts clearly and pedagogically."

# For programming
system_prompt = "You are a senior developer who helps with Python code, explaining it step by step."

# For creative work
system_prompt = "You are a creative writer who helps generate original ideas and engaging content."
```
|
## 🐛 Troubleshooting

### Error: HF_TOKEN not found

- Check that you added the token under Repository secrets
- Make sure the name is exactly `HF_TOKEN`

### Error: Model not available

- Your token must have access to Llama 3.2 3B
- Request access on the model page if needed

### Timeouts in the Python client

- Increase the timeout, e.g. `requests.post(..., timeout=600)`
- The model can take a while to load the first time

### Long queue

- The system processes one request at a time
- Consider using more powerful hardware
|
## 🤝 Contributions

Have suggestions for improvements?

1. Fork the code
2. Implement your changes
3. Test them on your own Space
4. Share your version

## 📝 License

This code is free to use. Respect the terms of:

- Hugging Face Spaces
- the Meta Llama 3.2 License
- the Gradio License
|
## 🔗 Useful Links

- [Hugging Face Spaces](https://huggingface.co/spaces)
- [Meta Llama 3.2 3B Instruct](https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct)
- [Gradio Documentation](https://gradio.app/docs/)
- [Transformers Library](https://huggingface.co/docs/transformers)

---

**Enjoy chatting with Llama! 🦙**
|
|
''' |
|
|
|
|
|
|
|
|
config_py_content = '''# config.py - Space configuration

import os

class Config:
    """Centralized configuration for the Space"""

    # Model
    MODEL_NAME = "meta-llama/Llama-3.2-3B-Instruct"
    DEVICE = "cuda" if os.environ.get("SPACES_GPU") else "cpu"

    # Tokens and authentication
    HF_TOKEN = os.environ.get("HF_TOKEN")

    # Generation limits
    MAX_TOKENS_LIMIT = 1024
    MIN_TOKENS_LIMIT = 50
    DEFAULT_MAX_TOKENS = 512

    # Temperature
    MAX_TEMPERATURE = 2.0
    MIN_TEMPERATURE = 0.1
    DEFAULT_TEMPERATURE = 0.7

    # Queue and concurrency
    MAX_QUEUE_SIZE = 10
    QUEUE_TIMEOUT = 300  # 5 minutes

    # Context length
    MAX_CONTEXT_LENGTH = 2048

    # Interface
    CHAT_HEIGHT = 500
    DEFAULT_SYSTEM_PROMPT = "You are a helpful, friendly AI assistant. Answer clearly and concisely."

    # API
    API_TIMEOUT = 300
    ENABLE_API_LOGGING = True
|
    @classmethod
    def validate(cls):
        """Validate the configuration"""
        errors = []

        if not cls.HF_TOKEN:
            errors.append("HF_TOKEN is not set in the environment variables")

        if cls.MAX_TOKENS_LIMIT < cls.MIN_TOKENS_LIMIT:
            errors.append("MAX_TOKENS_LIMIT must be greater than MIN_TOKENS_LIMIT")

        if cls.MAX_TEMPERATURE < cls.MIN_TEMPERATURE:
            errors.append("MAX_TEMPERATURE must be greater than MIN_TEMPERATURE")

        return errors
|
    @classmethod
    def get_model_config(cls):
        """Model-specific configuration"""
        return {
            "torch_dtype": "float16" if cls.DEVICE == "cuda" else "float32",
            "device_map": "auto" if cls.DEVICE == "cuda" else None,
            "trust_remote_code": True,
            "token": cls.HF_TOKEN
        }

    @classmethod
    def get_generation_config(cls, max_tokens=None, temperature=None):
        """Generation configuration"""
        return {
            "max_new_tokens": max_tokens or cls.DEFAULT_MAX_TOKENS,
            "temperature": temperature or cls.DEFAULT_TEMPERATURE,
            "do_sample": True,
            "repetition_penalty": 1.1,
            "top_p": 0.9,
            "top_k": 50
        }
|
# Validate configuration on import
config_errors = Config.validate()
if config_errors:
    print("⚠️ Configuration errors:")
    for error in config_errors:
        print(f"  - {error}")
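
# Example usage (sketch): shows how app.py is expected to consume these configs.
# The AutoModelForCausalLM / model.generate calls mentioned below are illustrative
# assumptions and are intentionally not imported or executed here.
if __name__ == "__main__":
    # e.g. AutoModelForCausalLM.from_pretrained(Config.MODEL_NAME, **Config.get_model_config())
    # e.g. model.generate(**inputs, **Config.get_generation_config(max_tokens=256, temperature=0.5))
    print("Model config:", Config.get_model_config())
    print("Generation config:", Config.get_generation_config(max_tokens=256, temperature=0.5))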
|
|
''' |
|
|
|
|
|
|
|
|
utils_py_content = '''# utils.py - Utilities for the Space

import time
import functools
import logging
from typing import List, Dict, Callable, Any
from datetime import datetime

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
|
def timing_decorator(func: Callable) -> Callable:
    """Decorator that measures execution time"""
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        start_time = time.time()
        result = func(*args, **kwargs)
        end_time = time.time()

        logger.info(f"{func.__name__} executed in {end_time - start_time:.2f}s")
        return result
    return wrapper
|
def sanitize_input(text: str, max_length: int = 2000) -> str:
    """Sanitize user input"""
    if not isinstance(text, str):
        return ""

    # Truncate overly long input
    text = text[:max_length]

    # Strip problematic characters
    text = text.replace('\\x00', '')  # Null bytes
    text = text.strip()

    return text
|
def format_history(history: List[List[str]]) -> List[List[str]]:
    """Format and validate the chat history"""
    if not history:
        return []

    formatted_history = []
    for item in history:
        if isinstance(item, list) and len(item) == 2:
            user_msg = sanitize_input(str(item[0]))
            assistant_msg = sanitize_input(str(item[1]))

            if user_msg and assistant_msg:
                formatted_history.append([user_msg, assistant_msg])

    # Keep only the last 10 exchanges
    return formatted_history[-10:]
|
def estimate_tokens(text: str) -> int:
    """Rough token count estimate"""
    # Approximation: ~4 characters per token in Spanish
    return len(text) // 4
|
def validate_parameters(max_tokens: int, temperature: float) -> Dict[str, Any]:
    """Validate generation parameters"""
    from config import Config

    errors = []

    # Validate max_tokens
    if not isinstance(max_tokens, int):
        max_tokens = Config.DEFAULT_MAX_TOKENS
        errors.append("max_tokens must be an integer")
    elif max_tokens < Config.MIN_TOKENS_LIMIT:
        max_tokens = Config.MIN_TOKENS_LIMIT
        errors.append(f"max_tokens minimum is {Config.MIN_TOKENS_LIMIT}")
    elif max_tokens > Config.MAX_TOKENS_LIMIT:
        max_tokens = Config.MAX_TOKENS_LIMIT
        errors.append(f"max_tokens maximum is {Config.MAX_TOKENS_LIMIT}")

    # Validate temperature
    if not isinstance(temperature, (int, float)):
        temperature = Config.DEFAULT_TEMPERATURE
        errors.append("temperature must be a number")
    elif temperature < Config.MIN_TEMPERATURE:
        temperature = Config.MIN_TEMPERATURE
        errors.append(f"temperature minimum is {Config.MIN_TEMPERATURE}")
    elif temperature > Config.MAX_TEMPERATURE:
        temperature = Config.MAX_TEMPERATURE
        errors.append(f"temperature maximum is {Config.MAX_TEMPERATURE}")

    return {
        "max_tokens": max_tokens,
        "temperature": float(temperature),
        "errors": errors
    }
|
def create_error_response(error_msg: str) -> Dict[str, Any]:
    """Create a standardized error response"""
    return {
        "response": f"Error: {error_msg}",
        "queue_status": {
            "queue_size": 0,
            "is_processing": False,
            "timestamp": datetime.now().isoformat(),
            "error": True
        }
    }
|
def truncate_context(text: str, max_length: int = 1800) -> str:
    """Truncate the context while keeping it coherent"""
    if len(text) <= max_length:
        return text

    # Prefer truncating at paragraph boundaries
    paragraphs = text.split('\\n\\n')
    truncated = ""

    for paragraph in paragraphs:
        if len(truncated + paragraph) <= max_length:
            truncated += paragraph + '\\n\\n'
        else:
            break

    # If there are no paragraphs, truncate at sentence boundaries
    if not truncated:
        sentences = text.split('. ')
        for sentence in sentences:
            if len(truncated + sentence) <= max_length:
                truncated += sentence + '. '
            else:
                break

    # Last resort: hard truncation
    if not truncated:
        truncated = text[:max_length]

    return truncated.strip()
|
class PerformanceMonitor:
    """Simple performance monitor"""

    def __init__(self):
        self.stats = {
            "total_requests": 0,
            "successful_requests": 0,
            "failed_requests": 0,
            "total_tokens_generated": 0,
            "average_response_time": 0,
            "start_time": datetime.now()
        }

    def record_request(self, success: bool, tokens_generated: int = 0, response_time: float = 0):
        """Record a single request"""
        self.stats["total_requests"] += 1

        if success:
            self.stats["successful_requests"] += 1
            self.stats["total_tokens_generated"] += tokens_generated
        else:
            self.stats["failed_requests"] += 1

        # Update the running average response time
        if response_time > 0:
            current_avg = self.stats["average_response_time"]
            total_requests = self.stats["total_requests"]

            self.stats["average_response_time"] = (
                (current_avg * (total_requests - 1) + response_time) / total_requests
            )
|
    def get_stats(self) -> Dict[str, Any]:
        """Get aggregated statistics"""
        uptime = datetime.now() - self.stats["start_time"]

        return {
            **self.stats,
            "uptime_seconds": uptime.total_seconds(),
            "success_rate": (
                self.stats["successful_requests"] / max(self.stats["total_requests"], 1)
            ) * 100,
            "tokens_per_minute": (
                self.stats["total_tokens_generated"] / max(uptime.total_seconds() / 60, 1)
            )
        }

# Global monitor instance
performance_monitor = PerformanceMonitor()
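
# Example usage (sketch): shows how app.py might combine these helpers.
# fake_generation is a stand-in for the real model call, not part of the application.
if __name__ == "__main__":
    @timing_decorator
    def fake_generation(prompt: str) -> str:
        time.sleep(0.1)  # simulate model latency
        return f"Echo: {prompt}"

    reply = fake_generation(sanitize_input("  hello world  "))
    performance_monitor.record_request(
        success=True,
        tokens_generated=estimate_tokens(reply),
        response_time=0.1,
    )
    print(performance_monitor.get_stats())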
|
|
''' |
|
|
|
|
|
|
|
|
with open("README.md", "w", encoding="utf-8") as f: |
|
|
f.write(readme_content) |
|
|
|
|
|
with open("config.py", "w", encoding="utf-8") as f: |
|
|
f.write(config_py_content) |
|
|
|
|
|
with open("utils.py", "w", encoding="utf-8") as f: |
|
|
f.write(utils_py_content) |
|
|
|
|
|
print("Additional files created:")
print("- README.md (full instructions)")
print("- config.py (centralized configuration)")
print("- utils.py (utilities and monitoring)")
print("\nEverything is ready to upload to Hugging Face Spaces! 🚀")