import gradio as gr
import PyPDF2
import pandas as pd
import numpy as np
import io
import os
import json
import zipfile
import tempfile
from typing import Dict, List, Tuple, Union, Optional, Generator
import re
from pathlib import Path
import openpyxl
from dataclasses import dataclass, asdict
from enum import Enum
from docx import Document
from docx.shared import Inches, Pt, RGBColor
from docx.enum.text import WD_ALIGN_PARAGRAPH
from reportlab.lib import colors
from reportlab.lib.pagesizes import letter, A4
from reportlab.platypus import SimpleDocTemplate, Table, TableStyle, Paragraph, Spacer, PageBreak
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from reportlab.lib.units import inch
from reportlab.pdfbase import pdfmetrics
from reportlab.pdfbase.ttfonts import TTFont
import matplotlib.pyplot as plt
from datetime import datetime
from openai import OpenAI

# --- CONFIGURATION AND CONSTANTS ---
os.environ['GRADIO_ANALYTICS_ENABLED'] = 'False'
client = OpenAI(
    base_url="https://api.studio.nebius.com/v1/",
    api_key=os.environ.get("NEBIUS_API_KEY")
)
# A new label is added for the grouping-column selector
TRANSLATIONS = {
    'en': {
        'title': '🧬 Scalable Biotech Model Analyzer',
        'subtitle': 'Analyzes large sets of model fitting results using a chunking strategy',
        'upload_files': '📁 Upload fitting results (CSV/Excel)',
        'chunk_column_label': '🔬 Select Column for Grouping Experiments',
        'chunk_column_info': 'Choose the column that identifies each unique experiment. This is used for chunking.',
        'select_model': '🤖 AI Model (editable)',
        'select_language': '🌐 Language',
        'select_theme': '🎨 Theme',
        'detail_level': '📋 Analysis detail level',
        'detailed': 'Detailed',
        'summarized': 'Summarized',
        'analyze_button': '🚀 Analyze and Compare Models',
        'export_format': '📄 Export format',
        'export_button': '💾 Export Report',
        'comparative_analysis': '📊 Comparative Analysis',
        'implementation_code': '💻 Implementation Code',
        'data_format': '📋 Expected data format',
        'loading': 'Loading...',
        'error_no_api': 'Please configure NEBIUS_API_KEY in HuggingFace Space secrets',
        'error_no_files': 'Please upload fitting result files to analyze',
        'report_exported': 'Report exported successfully as',
        'additional_specs': '📝 Additional specifications for analysis',
        'additional_specs_placeholder': 'Add any specific requirements or focus areas for the analysis...',
        'output_tokens_per_chunk': '🔢 Max output tokens per chunk (1k-32k)',
        'token_info': 'ℹ️ Token usage information',
        'input_token_count': 'Input tokens used',
        'output_token_count': 'Output tokens used',
        'total_token_count': 'Total tokens used',
        'token_cost': 'Estimated cost',
        'thinking_process': '🧠 Thinking Process',
        'analysis_report': '📊 Analysis Report',
        'code_output': '💻 Implementation Code',
        'token_usage': '💰 Token Usage'
    },
    'es': {
        'title': '🧬 Analizador Escalable de Modelos Biotecnológicos',
        'subtitle': 'Analiza grandes conjuntos de datos de ajuste de modelos usando una estrategia por partes',
        'upload_files': '📁 Subir resultados de ajuste (CSV/Excel)',
        'chunk_column_label': '🔬 Seleccionar Columna para Agrupar Experimentos',
        'chunk_column_info': 'Elige la columna que identifica cada experimento único. Se usará para dividir el análisis.',
        'select_model': '🤖 Modelo IA (editable)',
        'select_language': '🌐 Idioma',
        'select_theme': '🎨 Tema',
        'detail_level': '📋 Nivel de detalle del análisis',
        'detailed': 'Detallado',
        'summarized': 'Resumido',
        'analyze_button': '🚀 Analizar y Comparar Modelos',
        'export_format': '📄 Formato de exportación',
        'export_button': '💾 Exportar Reporte',
        'comparative_analysis': '📊 Análisis Comparativo',
        'implementation_code': '💻 Código de Implementación',
        'data_format': '📋 Formato de datos esperado',
        'loading': 'Cargando...',
        'error_no_api': 'Por favor configura NEBIUS_API_KEY en los secretos del Space',
        'error_no_files': 'Por favor sube archivos con resultados de ajuste para analizar',
        'report_exported': 'Reporte exportado exitosamente como',
        'additional_specs': '📝 Especificaciones adicionales para el análisis',
        'additional_specs_placeholder': 'Agregue cualquier requerimiento específico o áreas de enfoque para el análisis...',
        'output_tokens_per_chunk': '🔢 Máx. tokens de salida por parte (1k-32k)',
        'token_info': 'ℹ️ Información de uso de tokens',
        'input_token_count': 'Tokens de entrada usados',
        'output_token_count': 'Tokens de salida usados',
        'total_token_count': 'Total de tokens usados',
        'token_cost': 'Costo estimado',
        'thinking_process': '🧠 Proceso de Pensamiento',
        'analysis_report': '📊 Reporte de Análisis',
        'code_output': '💻 Código de Implementación',
        'token_usage': '💰 Uso de Tokens'
    }
}

THEMES = {'light': gr.themes.Soft(), 'dark': gr.themes.Base()}
QWEN_MODELS = {
    "Qwen/Qwen3-14B": {"max_context_tokens": 40960, "input_cost": 0.0000007, "output_cost": 0.0000021},
    "Qwen/Qwen3-7B": {"max_context_tokens": 40960, "input_cost": 0.00000035, "output_cost": 0.00000105},
    "Qwen/Qwen1.5-14B": {"max_context_tokens": 40960, "input_cost": 0.0000007, "output_cost": 0.0000021}
}
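# Worked cost example (hypothetical token counts, using the per-token USD rates above):
# an analysis consuming 10,000 input tokens and producing 2,000 output tokens with
# Qwen/Qwen3-14B costs roughly 10_000 * 0.0000007 + 2_000 * 0.0000021 = $0.0112.
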
# --- UTILITY CLASSES ---
class FileProcessor:
    """Processes the different supported file types."""

    @staticmethod
    def extract_text_from_pdf(pdf_file) -> str:
        """Extracts text from a PDF file."""
        try:
            pdf_reader = PyPDF2.PdfReader(io.BytesIO(pdf_file))
            text = ""
            for page in pdf_reader.pages:
                text += page.extract_text() + "\n"
            return text
        except Exception as e:
            return f"Error reading PDF: {str(e)}"

    @staticmethod
    def read_csv(csv_file) -> Optional[pd.DataFrame]:
        """Reads a CSV file from raw bytes; returns None on failure."""
        try:
            return pd.read_csv(io.BytesIO(csv_file))
        except Exception:
            return None

    @staticmethod
    def read_excel(excel_file) -> Optional[pd.DataFrame]:
        """Reads an Excel file from raw bytes; returns None on failure."""
        try:
            return pd.read_excel(io.BytesIO(excel_file))
        except Exception:
            return None

    @staticmethod
    def extract_from_zip(zip_file) -> List[Tuple[str, bytes]]:
        """Extracts files from a ZIP archive."""
        files = []
        try:
            with zipfile.ZipFile(io.BytesIO(zip_file), 'r') as zip_ref:
                for file_name in zip_ref.namelist():
                    if not file_name.startswith('__MACOSX'):
                        file_data = zip_ref.read(file_name)
                        files.append((file_name, file_data))
        except Exception as e:
            print(f"Error processing ZIP: {e}")
        return files
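
# Usage sketch for FileProcessor (hypothetical file name; all readers take raw bytes):
#   raw = Path("fitting_results.zip").read_bytes()
#   for name, data in FileProcessor.extract_from_zip(raw):
#       df = FileProcessor.read_csv(data) if name.endswith(".csv") else FileProcessor.read_excel(data)
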
class ReportExporter:
    """Exports reports to the supported output formats."""

    @staticmethod
    def export_to_docx(content: str, filename: str, language: str = 'en') -> str:
        """Exports the content to a DOCX file."""
        doc = Document()
        # Configure styles
        title_style = doc.styles['Title']
        title_style.font.size = Pt(24)
        title_style.font.bold = True
        heading_style = doc.styles['Heading 1']
        heading_style.font.size = Pt(18)
        heading_style.font.bold = True
        # Title
        title_text = {
            'en': 'Comparative Analysis Report - Biotechnological Models',
            'es': 'Informe de Análisis Comparativo - Modelos Biotecnológicos',
        }
        doc.add_heading(title_text.get(language, title_text['en']), 0)
        # Date
        date_text = {
            'en': 'Generated on',
            'es': 'Generado el',
        }
        doc.add_paragraph(f"{date_text.get(language, date_text['en'])}: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
        doc.add_paragraph()
        # Process the Markdown-style content line by line
        lines = content.split('\n')
        for line in lines:
            line = line.strip()
            if line.startswith('###'):
                doc.add_heading(line.replace('###', '').strip(), level=2)
            elif line.startswith('##'):
                doc.add_heading(line.replace('##', '').strip(), level=1)
            elif line.startswith('#'):
                doc.add_heading(line.replace('#', '').strip(), level=0)
            elif line.startswith('**') and line.endswith('**'):
                # Bold text
                p = doc.add_paragraph()
                run = p.add_run(line.replace('**', ''))
                run.bold = True
            elif line.startswith('- ') or line.startswith('* '):
                # Bullet list
                doc.add_paragraph(line[2:], style='List Bullet')
            elif line.startswith(tuple('0123456789')):
                # Numbered list
                doc.add_paragraph(line, style='List Number')
            elif line == '---' or line.startswith('==='):
                # Separator
                doc.add_paragraph('_' * 50)
            elif line:
                # Regular paragraph
                doc.add_paragraph(line)
        # Save the document
        doc.save(filename)
        return filename

    @staticmethod
    def export_to_pdf(content: str, filename: str, language: str = 'en') -> str:
        """Exports the content to a PDF file."""
        # Create the PDF document
        doc = SimpleDocTemplate(filename, pagesize=letter)
        story = []
        styles = getSampleStyleSheet()
        # Custom styles
        title_style = ParagraphStyle(
            'CustomTitle',
            parent=styles['Title'],
            fontSize=24,
            textColor=colors.HexColor('#1f4788'),
            spaceAfter=30
        )
        heading_style = ParagraphStyle(
            'CustomHeading',
            parent=styles['Heading1'],
            fontSize=16,
            textColor=colors.HexColor('#2e5090'),
            spaceAfter=12
        )
        # Title
        title_text = {
            'en': 'Comparative Analysis Report - Biotechnological Models',
            'es': 'Informe de Análisis Comparativo - Modelos Biotecnológicos',
        }
        story.append(Paragraph(title_text.get(language, title_text['en']), title_style))
        # Date
        date_text = {
            'en': 'Generated on',
            'es': 'Generado el',
        }
        story.append(Paragraph(f"{date_text.get(language, date_text['en'])}: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}", styles['Normal']))
        story.append(Spacer(1, 0.5 * inch))
        # Process the content line by line
        lines = content.split('\n')
        for line in lines:
            line = line.strip()
            if not line:
                story.append(Spacer(1, 0.2 * inch))
            elif line.startswith('###'):
                story.append(Paragraph(line.replace('###', '').strip(), styles['Heading3']))
            elif line.startswith('##'):
                story.append(Paragraph(line.replace('##', '').strip(), styles['Heading2']))
            elif line.startswith('#'):
                story.append(Paragraph(line.replace('#', '').strip(), heading_style))
            elif line.startswith('**') and line.endswith('**'):
                text = line.replace('**', '')
                story.append(Paragraph(f"<b>{text}</b>", styles['Normal']))
            elif line.startswith('- ') or line.startswith('* '):
                story.append(Paragraph(f"• {line[2:]}", styles['Normal']))
            elif line == '---' or line.startswith('==='):
                story.append(Spacer(1, 0.3 * inch))
                story.append(Paragraph("_" * 70, styles['Normal']))
                story.append(Spacer(1, 0.3 * inch))
            else:
                # Replace emoji that the default PDF fonts cannot render
                clean_line = line.replace('📊', '[GRAPH]').replace('🎯', '[TARGET]').replace('🔍', '[SEARCH]').replace('💡', '[TIP]')
                story.append(Paragraph(clean_line, styles['Normal']))
        # Build the PDF
        doc.build(story)
        return filename
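
# Usage sketch for ReportExporter (hypothetical content and output path):
#   ReportExporter.export_to_pdf("# Title\nSome **bold** line", "/tmp/report.pdf", language="en")
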
# --- AIAnalyzer CLASS (MODIFIED TO ACCEPT chunk_column) ---
class AIAnalyzer:
    """AI analysis class implementing a 'chunk-and-stitch' strategy."""

    def __init__(self, client):
        self.client = client
        self.token_usage = {}
        self.reset_token_usage()

    def reset_token_usage(self):
        self.token_usage = {'input_tokens': 0, 'output_tokens': 0, 'total_tokens': 0, 'estimated_cost': 0.0}

    def _update_token_usage(self, model_name: str, usage):
        if not usage:
            return
        self.token_usage['input_tokens'] += usage.prompt_tokens
        self.token_usage['output_tokens'] += usage.completion_tokens
        self.token_usage['total_tokens'] += usage.total_tokens
        model_info = QWEN_MODELS.get(model_name, {})
        input_cost = model_info.get('input_cost', 0.0)
        output_cost = model_info.get('output_cost', 0.0)
        self.token_usage['estimated_cost'] += (usage.prompt_tokens * input_cost) + (usage.completion_tokens * output_cost)

    def _calculate_safe_max_tokens(self, model_name: str, user_requested_tokens: int) -> int:
        model_info = QWEN_MODELS.get(model_name, {"max_context_tokens": 32768})
        context_limit = model_info['max_context_tokens']
        PROMPT_SAFETY_MARGIN = 8192
        max_allowable_output = context_limit - PROMPT_SAFETY_MARGIN
        return max(100, min(user_requested_tokens, max_allowable_output))
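
    # Worked example for _calculate_safe_max_tokens (using the 40,960-token context
    # declared above for Qwen/Qwen3-14B):
    #   max_allowable_output = 40960 - 8192 = 32768
    #   a user request of 50,000 tokens is clamped to min(50000, 32768) = 32768,
    #   and anything below 100 is raised to the floor of 100.
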
    ####
    def _analyze_single_experiment(self, experiment_df: pd.DataFrame, experiment_id: str, qwen_model: str, lang_prefix: str, max_output_tokens: int) -> Optional[Dict]:
        """
        Analyzes the fitting results of a single experiment (one data 'chunk') and returns structured JSON.
        This function is the core of the 'map' stage of the 'map-reduce' approach.
        """
        # The prompt is the most important part. It is designed to be very specific and to give a clear example.
        prompt = f"""
{lang_prefix}
You are an expert biotechnological data analyst. Your task is to analyze the provided model fitting results for a single experiment identified as: '{experiment_id}'.
The data contains different mathematical models that were fitted to experimental data for variables like Biomass, Substrate, or Product.
DATA FOR THIS SPECIFIC EXPERIMENT ('{experiment_id}'):
```
{experiment_df.to_string()}
```
YOUR INSTRUCTIONS:
1. **Identify Best Models**: For EACH variable type present in the data (e.g., 'Biomass', 'Substrate'), determine the single best-performing model. The best model is the one with the highest R² value. If R² values are equal, use the lowest RMSE as a tie-breaker.
2. **Extract Key Information**: For each of these best models, you must extract:
   - The model's name.
   - The specific metrics (R², RMSE, AIC, etc.) as key-value pairs.
   - All kinetic parameters and their fitted values (e.g., mu_max, Ks) as key-value pairs.
3. **Summarize All Tested Models**: Create a simple list of the names of ALL models that were tested in this experiment, regardless of their performance.
4. **Provide Biological Interpretation**: Write a brief, concise interpretation (2-3 sentences) of what the results for this specific experiment imply. For example, "The selection of the Monod model for biomass with a µ_max of 0.45 suggests rapid growth under these conditions, while the high R² indicates a strong fit."
**CRITICAL OUTPUT FORMAT**: You MUST respond ONLY with a single, valid JSON object. Do not add any explanatory text, markdown formatting, or anything else before or after the JSON structure.
Follow this EXACT JSON structure:
{{
  "experiment_id": "{experiment_id}",
  "best_models_by_variable": [
    {{
      "variable_type": "Biomass",
      "model_name": "Name of the best model for Biomass",
      "metrics": {{
        "R2": 0.99,
        "RMSE": 0.01,
        "AIC": -50.2
      }},
      "parameters": {{
        "mu_max": 0.5,
        "Ks": 10.2
      }}
    }},
    {{
      "variable_type": "Substrate",
      "model_name": "Name of the best model for Substrate",
      "metrics": {{
        "R2": 0.98,
        "RMSE": 0.05
      }},
      "parameters": {{
        "k_consumption": 1.5
      }}
    }}
  ],
  "all_tested_models": ["Monod", "Logistic", "Gompertz", "First_Order"],
  "interpretation": "A brief, data-driven interpretation of the kinetic behavior observed in this specific experiment."
}}
"""
        try:
            # Compute a safe number of output tokens to avoid exceeding the model's context limit.
            safe_max_tokens = self._calculate_safe_max_tokens(qwen_model, max_output_tokens)
            # Make the OpenAI/Nebius API call
            response = self.client.chat.completions.create(
                model=qwen_model,
                max_tokens=safe_max_tokens,
                temperature=0.05,  # Low temperature for a more predictable, structured output
                response_format={"type": "json_object"},  # Forces the output to be valid JSON
                messages=[
                    {"role": "system", "content": "You are a helpful assistant designed to output JSON."},
                    {"role": "user", "content": prompt}
                ]
            )
            # Update the token counter and estimated cost.
            self._update_token_usage(qwen_model, response.usage)
            # Extract the response content.
            content = response.choices[0].message.content
            # Parse the JSON string into a Python dictionary.
            # This step is error-prone if the LLM does not follow the instructions perfectly.
            parsed_json = json.loads(content)
            return parsed_json
        except json.JSONDecodeError as e:
            # Catch errors if the model's response is not valid JSON.
            print(f"CRITICAL ERROR: Failed to decode JSON for experiment '{experiment_id}'.")
            print(f"JSONDecodeError: {e}")
            print(f"LLM Raw Output that caused the error:\n---\n{content}\n---")
            return None  # Return None to signal that the analysis of this chunk failed.
        except Exception as e:
            # Catch other API errors (e.g. network problems, invalid key, etc.).
            print(f"API Error during single analysis for experiment '{experiment_id}': {e}")
            return None  # Return None so the main process can skip this chunk.
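
    # Note: if the model ever returns near-valid JSON wrapped in extra text, a defensive
    # fallback (not wired in above, shown only as a sketch) could extract the outermost
    # JSON object before parsing:
    #   match = re.search(r'\{.*\}', content, re.DOTALL)
    #   parsed_json = json.loads(match.group(0)) if match else None
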
    ####
    def _synthesize_comparative_analysis(self, individual_analyses: List[Dict], qwen_model: str, detail_level: str, lang_prefix: str, additional_specs: str, max_output_tokens: int) -> str:
        """
        Synthesizes the individual analyses (JSON objects) into a final comparative report in Markdown.
        This is the 'reduce' stage of the process.
        """
        # 1. Prepare the input data for the model.
        # Convert the list of Python dictionaries into a well-formatted JSON string.
        # This is what the LLM will see as its "knowledge base".
        analyses_summary = json.dumps(individual_analyses, indent=2)
        # 2. Build the synthesis prompt.
        # This prompt is more conceptual than the previous one: it asks the model to act as a senior scientist.
        # Section for the user's additional specifications.
        user_specs_section = f"""
## User's Additional Specifications
Please pay special attention to the following user-provided requirements during your analysis:
- {additional_specs}
""" if additional_specs else ""
        # Detail-level instruction based on the user's selection.
        detail_instruction = (
            "Your report must be highly detailed and exhaustive. Include multiple tables, in-depth parameter comparisons, and nuanced biological interpretations."
            if detail_level == "detailed" else
            "Your report should be a high-level summary. Focus on the main conclusions and key takeaways, using concise tables and bullet points."
        )
        prompt = f"""
{lang_prefix}
You are a Principal Scientist tasked with creating a final, consolidated report from a series of individual experimental analyses.
You have been provided with a JSON array, where each object represents the detailed analysis of one specific experiment.
{user_specs_section}
YOUR PRIMARY OBJECTIVE:
Synthesize all the provided information into a single, cohesive, and comparative analysis report. The report must be written in rich Markdown format.
{detail_instruction}
Your final report MUST contain the following sections:
### 1. Executive Summary & Experimental Inventory
- Start with a brief paragraph summarizing the scope of the experiments analyzed.
- Create a Markdown table that serves as an inventory of all experiments. The table should list each `experiment_id`, the `variable_type` (e.g., Biomass), and the `model_name` of the best-performing model for that variable.
### 2. In-Depth Comparative Analysis
- **Model Performance Matrix:** This is the most critical part. Create a Markdown table that compares the performance of all major models across all experiments. Use R² as the primary metric. Rows should be model names, and columns should be experiment IDs. This allows for a direct visual comparison of which models are robust across different conditions.
- **Parameter Trend Analysis:** Analyze how key kinetic parameters (e.g., `mu_max`, `Ks`, etc.) change across the different experimental conditions. Discuss any observable trends, correlations, or significant differences. For example: "We observed that `mu_max` consistently increased as temperature rose from Exp_A to Exp_C, suggesting a direct correlation in this range."
- **Model Selection Justification:** Discuss why certain models performed better under specific conditions, referencing the biological interpretations from the input data.
### 3. Overall Recommendations & Conclusions
- **Globally Recommended Models:** Based on the entire dataset, declare the best overall model for each primary variable type (Biomass, Substrate, etc.). Justify your choice based on consistent high performance and robustness across experiments.
- **Condition-Specific Guidelines:** Provide actionable recommendations. For example, "For experiments conducted under high pH conditions (similar to 'Exp_C'), the 'Gompertz' model is strongly recommended due to its superior fit."
- **Suggestions for Future Research:** Briefly suggest a few next steps or potential experiments to validate the findings or explore new hypotheses.
---
**INPUT DATA: JSON ARRAY OF INDIVIDUAL ANALYSES**
```json
{analyses_summary}
```
---
Now, generate the complete, final Markdown report based on these instructions.
"""
        try:
            # Request more output tokens for the synthesis stage, since the final
            # report can be long. Doubling the per-chunk budget is a heuristic.
            safe_max_tokens = self._calculate_safe_max_tokens(qwen_model, max_output_tokens * 2)
            # Make the API call
            response = self.client.chat.completions.create(
                model=qwen_model,
                max_tokens=safe_max_tokens,
                temperature=0.2,  # Slightly higher than the individual analyses to allow more freedom in the writing.
                messages=[
                    {"role": "user", "content": prompt}
                ]
            )
            # Update token usage and cost.
            self._update_token_usage(qwen_model, response.usage)
            # Return the generated report content.
            return response.choices[0].message.content
        except Exception as e:
            # Handle any error during the synthesis API call.
            error_message = f"CRITICAL ERROR: Failed during the final report synthesis stage. Details: {e}"
            print(error_message)
            return error_message

    def analyze_data(self, data: pd.DataFrame, chunk_column: str, qwen_model: str, detail_level: str, language: str, additional_specs: str, max_output_tokens: int) -> Generator[Union[str, Dict], None, None]:
        """
        Orchestrates the full analysis as a generator, yielding status updates.
        """
        self.reset_token_usage()
        if chunk_column not in data.columns:
            yield {"error": f"The selected chunking column '{chunk_column}' was not found in the data."}
            return
        unique_experiments = data[chunk_column].unique()
        yield f"Identified {len(unique_experiments)} groups to analyze using column '{chunk_column}': {list(unique_experiments)}"
        individual_results = []
        lang_prefix = "Please respond in English. " if language == 'en' else "Por favor responde en español. "
        for i, exp_id in enumerate(unique_experiments):
            yield f"({i + 1}/{len(unique_experiments)}) Analyzing group: '{str(exp_id)}'..."
            experiment_df = data[data[chunk_column] == exp_id]
            result = self._analyze_single_experiment(experiment_df, str(exp_id), qwen_model, lang_prefix, max_output_tokens)
            if result:
                individual_results.append(result)
                yield f"✅ Analysis for '{str(exp_id)}' complete."
            else:
                yield f"⚠️ Failed to analyze '{str(exp_id)}'. Skipping."
        if not individual_results:
            yield {"error": "Could not analyze any of the data groups. Please check data format and API status."}
            return
        yield "All groups analyzed. Synthesizing final comparative report..."
        final_analysis = self._synthesize_comparative_analysis(
            individual_results, qwen_model, detail_level, lang_prefix, additional_specs, max_output_tokens
        )
        yield "✅ Final report generated."
        yield "Generating implementation code..."
        code_result = "# Code generation is a placeholder in this version."
        yield "✅ Code generated."
        # Finally, yield the complete results dictionary.
        yield {
            "analisis_completo": final_analysis,
            "codigo_implementacion": code_result,
        }

# --- MAIN PROCESSING FUNCTION (outside any class) ---
def process_files_and_analyze(files, chunk_column: str, qwen_model: str, detail_level: str, language: str, additional_specs: str, max_output_tokens: int):
    """
    Processes uploaded files and orchestrates the analysis, updating the UI via 'yield'.
    """
    if not files:
        yield "Please upload a file first.", "", "", ""
        return
    if not chunk_column:
        yield "Please select a column for grouping before analyzing.", "", "", ""
        return
    # Initialize the values that will be updated incrementally.
    thinking_log = ["### 🚀 Starting Analysis\n"]
    analysis_result, code_result, token_report = "", "", ""

    # Helper that appends to the log and builds the tuple yielded to the UI
    def update_log_and_yield(message):
        thinking_log.append(f"- {datetime.now().strftime('%H:%M:%S')}: {message}\n")
        return "\n".join(thinking_log), gr.update(), gr.update(), gr.update()

    yield update_log_and_yield("Processing uploaded file...")
    file = files[0]
    try:
        df = pd.read_csv(file.name) if file.name.endswith('.csv') else pd.read_excel(file.name)
        yield update_log_and_yield(f"Successfully loaded data from '{Path(file.name).name}'.")
    except Exception as e:
        yield update_log_and_yield(f"Error reading file: {e}")
        return
    # Start the analyzer
    analyzer = AIAnalyzer(client)
    # Iterate over the `analyze_data` generator.
    # Each 'item' is either a status update (str) or the final result (dict).
    for item in analyzer.analyze_data(df, chunk_column, qwen_model, detail_level, language, additional_specs, max_output_tokens):
        if isinstance(item, str):
            # Status update: refresh the "thinking" log
            yield update_log_and_yield(item)
        elif isinstance(item, dict) and "error" in item:
            # Error dictionary: stop the process.
            yield update_log_and_yield(f"ANALYSIS FAILED: {item['error']}")
            return
        elif isinstance(item, dict):
            # Final results dictionary.
            analysis_result = item["analisis_completo"]
            code_result = item["codigo_implementacion"]
            # Store in the global state for export
            app_state.current_analysis = analysis_result
            app_state.current_code = code_result
            # Format the final token report
            t = TRANSLATIONS[language]
            token_info = analyzer.token_usage
            token_report = f"""
### {t['token_info']}
- **{t['input_token_count']}:** {token_info['input_tokens']}
- **{t['output_token_count']}:** {token_info['output_tokens']}
- **{t['total_token_count']}:** {token_info['total_tokens']}
- **{t['token_cost']}:** ${token_info['estimated_cost']:.6f}
"""
    # One last yield with all the final results.
    yield "\n".join(thinking_log), analysis_result, code_result, token_report

# --- STATE AND UI UTILITY FUNCTIONS ---
class AppState:
    def __init__(self):
        self.current_analysis = ""
        self.current_code = ""
        self.current_language = "en"

app_state = AppState()
app = None

def export_report(export_format: str, language: str) -> Tuple[str, Optional[str]]:
    """
    Exports the report to the selected format (DOCX or PDF) using the global state.
    The file is created in a temporary directory to avoid cluttering the working directory.
    """
    # 1. Check whether there is content to export in the global state.
    if not app_state.current_analysis:
        error_msg = TRANSLATIONS[language].get('error_no_files', 'No analysis available to export.')
        # Return the error message and None for the file path.
        return error_msg, None
    # 2. Generate a unique, timestamped file name.
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    # 3. Create a temporary directory to hold the report.
    # This is good practice so as not to fill the application's root directory.
    try:
        temp_dir = tempfile.mkdtemp()
    except Exception as e:
        return f"Error creating temporary directory: {e}", None
    # 4. Build the full file path and call the matching exporter.
    try:
        if export_format == "DOCX":
            # Build the path for the .docx file and call the static exporter.
            filename = os.path.join(temp_dir, f"biotech_analysis_report_{timestamp}.docx")
            ReportExporter.export_to_docx(
                content=app_state.current_analysis,
                filename=filename,
                language=language
            )
        elif export_format == "PDF":
            # Build the path for the .pdf file and call the static exporter.
            filename = os.path.join(temp_dir, f"biotech_analysis_report_{timestamp}.pdf")
            ReportExporter.export_to_pdf(
                content=app_state.current_analysis,
                filename=filename,
                language=language
            )
        else:
            # Handle the unlikely case of an unsupported format.
            return f"Unsupported export format: {export_format}", None
        # 5. On success, return a success message and the file path.
        success_msg_template = TRANSLATIONS[language].get('report_exported', 'Report exported successfully as')
        success_msg = f"{success_msg_template} {os.path.basename(filename)}"
        return success_msg, filename
    except Exception as e:
        # 6. Catch any error during export (e.g. missing permissions, a library failure)
        # and return a clear error message.
        error_message = f"Error during report export to {export_format}: {str(e)}"
        print(f"EXPORT ERROR: {error_message}")  # Log the error to the console for debugging.
        return error_message, None
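
# Usage sketch for export_report (runs after an analysis has populated app_state):
#   status_msg, file_path = export_report("PDF", "en")
#   # file_path is None when there is nothing to export or the export failed.
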

# --- FULL GRADIO INTERFACE ---
def create_interface():
    global app

    def update_interface_language(language):
        app_state.current_language = language
        t = TRANSLATIONS[language]
        return [
            gr.update(value=f"# {t['title']}"), gr.update(value=t['subtitle']),
            gr.update(label=t['upload_files']), gr.update(label=t['chunk_column_label'], info=t['chunk_column_info']),
            gr.update(label=t['select_model']), gr.update(label=t['select_language']), gr.update(label=t['select_theme']),
            gr.update(label=t['detail_level'], choices=[(t['detailed'], "detailed"), (t['summarized'], "summarized")]),
            gr.update(label=t['additional_specs'], placeholder=t['additional_specs_placeholder']),
            gr.update(label=t['output_tokens_per_chunk']), gr.update(value=t['analyze_button']),
            gr.update(label=t['export_format']), gr.update(value=t['export_button']),
            gr.update(label=t['thinking_process']), gr.update(label=t['analysis_report']),
            gr.update(label=t['code_output']), gr.update(label=t['token_usage']), gr.update(label=t['data_format'])
        ]

    with gr.Blocks(theme=THEMES['light'], title="Scalable Biotech Analyzer") as demo:
        with gr.Row():
            with gr.Column(scale=3):
                title_text = gr.Markdown(f"# {TRANSLATIONS['en']['title']}")
                subtitle_text = gr.Markdown(TRANSLATIONS['en']['subtitle'])
            with gr.Column(scale=1):
                language_selector = gr.Dropdown(choices=[("English", "en"), ("Español", "es")], value="en", label="Language/Idioma")
                theme_selector = gr.Dropdown(choices=["light", "dark"], value="light", label="Theme/Tema")
        with gr.Row():
            with gr.Column(scale=1):
                files_input = gr.File(label=TRANSLATIONS['en']['upload_files'], file_count="multiple", type="filepath")
                # NEW COMPONENT: grouping-column selector
                chunk_column_selector = gr.Dropdown(
                    label=TRANSLATIONS['en']['chunk_column_label'],
                    info=TRANSLATIONS['en']['chunk_column_info'],
                    interactive=False  # Enabled once a file is uploaded
                )
                model_selector = gr.Textbox(label=TRANSLATIONS['en']['select_model'], value="deepseek-ai/DeepSeek-V3-0324")
                detail_level_radio = gr.Radio(choices=[("Detailed", "detailed"), ("Summarized", "summarized")], value="detailed", label=TRANSLATIONS['en']['detail_level'])
                additional_specs = gr.Textbox(label=TRANSLATIONS['en']['additional_specs'], placeholder=TRANSLATIONS['en']['additional_specs_placeholder'], lines=3)
                output_tokens_slider = gr.Slider(minimum=1000, maximum=32000, value=4000, step=500, label=TRANSLATIONS['en']['output_tokens_per_chunk'])
                analyze_btn = gr.Button(TRANSLATIONS['en']['analyze_button'], variant="primary", interactive=False)  # Disabled by default
                gr.Markdown("---")
                export_format_radio = gr.Radio(choices=["DOCX", "PDF"], value="PDF", label=TRANSLATIONS['en']['export_format'])
                export_btn = gr.Button(TRANSLATIONS['en']['export_button'])
                export_status = gr.Textbox(label="Export Status", visible=False)
                export_file = gr.File(label="Download Report", visible=False)
            with gr.Column(scale=2):
                thinking_output = gr.Markdown(label=TRANSLATIONS['en']['thinking_process'])
                analysis_output = gr.Markdown(label=TRANSLATIONS['en']['analysis_report'])
                code_output = gr.Code(label=TRANSLATIONS['en']['code_output'], language="python")
                token_usage_output = gr.Markdown(label=TRANSLATIONS['en']['token_usage'])
                data_format_accordion = gr.Accordion(label=TRANSLATIONS['en']['data_format'], open=False)
                with data_format_accordion:
                    gr.Markdown("""...""")  # Accordion content unchanged

        # --- UI EVENT WIRING ---
        # NEW EVENT: fires on file upload to populate the column selector
        def update_chunk_column_selector(files):
            if not files:
                return gr.update(choices=[], value=None, interactive=False), gr.update(interactive=False)
            try:
                file_path = files[0].name
                df = pd.read_csv(file_path, nrows=0) if file_path.endswith('.csv') else pd.read_excel(file_path, nrows=0)
                columns = df.columns.tolist()
                # Try to pick a sensible default column
                default_candidates = ['Experiment', 'Experimento', 'Condition', 'Run', 'Batch', 'ID']
                default_selection = next((col for col in default_candidates if col in columns), None)
                return gr.update(choices=columns, value=default_selection, interactive=True), gr.update(interactive=True)
            except Exception as e:
                gr.Warning(f"Could not read columns from file: {e}")
                return gr.update(choices=[], value=None, interactive=False), gr.update(interactive=False)

        files_input.upload(
            fn=update_chunk_column_selector,
            inputs=[files_input],
            outputs=[chunk_column_selector, analyze_btn]
        )
        analyze_btn.click(
            fn=process_files_and_analyze,
            inputs=[files_input, chunk_column_selector, model_selector, detail_level_radio, language_selector, additional_specs, output_tokens_slider],
            outputs=[thinking_output, analysis_output, code_output, token_usage_output]
        )
        # Language and export events
        language_selector.change(
            fn=update_interface_language,
            inputs=[language_selector],
            outputs=[title_text, subtitle_text, files_input, chunk_column_selector, model_selector, language_selector, theme_selector, detail_level_radio, additional_specs, output_tokens_slider, analyze_btn, export_format_radio, export_btn, thinking_output, analysis_output, code_output, token_usage_output, data_format_accordion]
        )
        export_btn.click(fn=export_report, inputs=[export_format_radio, language_selector], outputs=[export_status, export_file])
    app = demo
    return demo

# --- MAIN ENTRY POINT ---
def main():
    if not os.getenv("NEBIUS_API_KEY"):
        return gr.Interface(lambda: TRANSLATIONS['en']['error_no_api'], [], gr.Textbox(label="Configuration Error"))
    return create_interface()

if __name__ == "__main__":
    demo = main()
    if demo:
        print("===== Application Startup =====")
        demo.queue().launch(server_name="0.0.0.0", server_port=7860, share=False, inbrowser=True)