import gradio as gr
import pandas as pd
import numpy as np
import spacy
import tempfile
from textblob import TextBlob
from transformers import pipeline
from langdetect import detect, DetectorFactory
from functools import lru_cache

DetectorFactory.seed = 0  # deterministic langdetect

APP_TITLE = "🚀 Análisis Épico de Sentimientos (Multimodelo + Lingüística)"
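# Assumed environment for this Space (a sketch, not pinned versions):
# requirements.txt with gradio, pandas, numpy, spacy, textblob, transformers,
# torch, langdetect and openpyxl (needed by read_excel), plus the spaCy model:
#   python -m spacy download es_core_news_sm
# or the es_core_news_sm wheel URL listed directly in requirements.txt.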
# ==============================
# Lazy model loading
# ==============================
@lru_cache(maxsize=1)
def load_spacy():
    # lru_cache makes each loader a true lazy singleton: the model loads once per process
    try:
        nlp = spacy.load("es_core_news_sm")
        return nlp, "✅ spaCy (es_core_news_sm)"
    except Exception as e:
        return None, f"❌ spaCy no disponible: {e}"

@lru_cache(maxsize=1)
def load_multilingual_sentiment():
    try:
        clf = pipeline("text-classification", model="tabularisai/multilingual-sentiment-analysis")
        return clf, "✅ Multilingual Sentiment cargado"
    except Exception as e:
        return None, f"❌ Multilingual Sentiment no disponible: {e}"

@lru_cache(maxsize=1)
def load_multilingual_bert():
    try:
        clf = pipeline("sentiment-analysis",
                       model="nlptown/bert-base-multilingual-uncased-sentiment",
                       tokenizer="nlptown/bert-base-multilingual-uncased-sentiment")
        return clf, "✅ BERT Multilingual (estrellas) cargado"
    except Exception as e:
        return None, f"❌ BERT Multilingual no disponible: {e}"
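# Illustrative raw outputs (label sets assumed from the respective model cards;
# the exact strings may differ):
#   load_multilingual_sentiment()[0]("Me encanta")[0]    # e.g. {'label': 'Positive', 'score': 0.98}
#   load_multilingual_bert()[0]("Terrible servicio")[0]  # e.g. {'label': '1 star', 'score': 0.71}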
# ==============================
# Simple Spanish lexicon + highlighting
# ==============================
PAL_POS = {
    'bueno','excelente','fantástico','maravilloso','perfecto','genial',
    'increíble','amo','encanta','feliz','contento','satisfecho','agradable',
    'recomiendo','magnífico','extraordinario','asombroso','estupendo',
    'óptimo','superior','inmejorable','ideal','brutal','espectacular'
}
PAL_NEG = {
    'malo','terrible','horrible','pésimo','odio','decepcionado','fatal',
    'triste','enojado','frustrado','pobre','deficiente','desastroso',
    'insatisfecho','decepcionante','horroroso','malísimo','inútil',
    'defectuoso','deplorable','lamentable','desagradable'
}
def lexical_score(text, nlp):
    text_low = text.lower().strip()
    if not nlp:
        # basic fallback without lemmatization: keep alphabetic tokens longer than 2 chars
        tokens = [t for t in ''.join(c if c.isalpha() or c.isspace() else ' ' for c in text_low).split() if len(t) > 2]
        lemmas = tokens
    else:
        doc = nlp(text_low)
        lemmas = [t.lemma_ for t in doc if t.is_alpha and len(t) > 2]
    pos = sum(1 for w in lemmas if w in PAL_POS)
    neg = sum(1 for w in lemmas if w in PAL_NEG)
    total = max(1, len(lemmas))
    raw = (pos - neg) / total
    norm = max(-1.0, min(1.0, raw * 5))  # amplify, then clamp to [-1, 1]
    return {"positivas": pos, "negativas": neg, "total": total, "normalized_score": norm, "lemmas": lemmas}
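# Worked examples (fallback tokenizer, nlp=None):
#   lexical_score("bueno pero defectuoso", None)  # pos=1, neg=1, total=3 -> raw=0.0, normalized_score=0.0
#   lexical_score("excelente servicio", None)     # pos=1, neg=0, total=2 -> raw=0.5, 0.5*5=2.5 clamped to 1.0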
def highlight_words(text, nlp):
    # Highlight lexicon words in the original text
    if not text:
        return ""
    original = text
    if nlp:
        doc = nlp(original)
        tokens = [t.text for t in doc]
    else:
        tokens = original.split()

    def wrap(tok):
        low = tok.lower()
        if low in PAL_POS:
            return f"<mark style='background:#D1FAE5; padding:2px 4px; border-radius:4px'>+{tok}</mark>"
        if low in PAL_NEG:
            return f"<mark style='background:#FEE2E2; padding:2px 4px; border-radius:4px'>-{tok}</mark>"
        return tok

    return " ".join(wrap(t) for t in tokens)
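# Illustrative: highlight_words("un producto excelente", None) leaves "un producto"
# untouched and wraps the match as <mark ...>+excelente</mark>. Note that joining
# tokens with " " can alter spacing around punctuation when spaCy tokenizes.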
# ==============================
# Model-based sentiment
# ==============================
STAR_MAP = {'1 star': -1.0, '2 stars': -0.5, '3 stars': 0.0, '4 stars': 0.5, '5 stars': 1.0}
def model_scores(text):
    out = {}
    clf1, status1 = load_multilingual_sentiment()
    clf2, status2 = load_multilingual_bert()
    nlp, _ = load_spacy()
    # Multilingual Sentiment
    if clf1:
        try:
            r = clf1(text)[0]
            # Map the label to a sign by substring so variants such as "POSITIVE",
            # "Positive" or "Very Negative" are all handled; any other class
            # (e.g. neutral) contributes 0 instead of being counted as negative.
            low = r['label'].lower()
            sign = 1.0 if 'positive' in low else (-1.0 if 'negative' in low else 0.0)
            out['multilingual'] = {
                "label": r['label'], "score": float(r['score']),
                "normalized_score": sign * float(r['score'])
            }
        except Exception as e:
            out['multilingual'] = {"error": str(e)}
    else:
        out['multilingual'] = {"error": status1}
    # BERT stars
    if clf2:
        try:
            r = clf2(text)[0]
            out['bert'] = {
                "label": r['label'], "score": float(r.get('score', 0.0)),
                "normalized_score": float(STAR_MAP.get(r['label'], 0.0))
            }
        except Exception as e:
            out['bert'] = {"error": str(e)}
    else:
        out['bert'] = {"error": status2}
    # Lexicon
    try:
        out['lexico'] = lexical_score(text, nlp)
    except Exception as e:
        out['lexico'] = {"error": str(e)}
    # TextBlob
    try:
        blob = TextBlob(text)
        out['textblob'] = {
            "polarity": float(blob.sentiment.polarity),
            "subjectivity": float(blob.sentiment.subjectivity),
            "normalized_score": float(blob.sentiment.polarity)
        }
    except Exception as e:
        out['textblob'] = {"error": str(e)}
    return out
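# Shape of the returned dict (values illustrative): each method reports either a
# result carrying 'normalized_score' in [-1, 1], or {'error': ...}, which
# fuse_scores() simply skips, e.g.
#   {'multilingual': {'label': 'Positive', 'score': 0.98, 'normalized_score': 0.98},
#    'bert': {'label': '4 stars', 'score': 0.62, 'normalized_score': 0.5},
#    'lexico': {...}, 'textblob': {...}}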
def fuse_scores(results, w_multi=0.4, w_bert=0.3, w_lex=0.2, w_tb=0.1, thr=0.2):
    # Weighted average over the methods that actually produced a score, so a
    # failed model redistributes its weight instead of dragging the sum toward 0.
    pairs = []
    for key, w in (('multilingual', w_multi), ('bert', w_bert), ('lexico', w_lex), ('textblob', w_tb)):
        d = results.get(key, {})
        if 'normalized_score' in d:
            pairs.append((d['normalized_score'], w))
    if not pairs or sum(w for _, w in pairs) <= 0:
        return "❓ INDETERMINADO", 0.0, "#FB923C"
    s = float(np.average([v for v, _ in pairs], weights=[w for _, w in pairs]))
    if s > thr:
        return "😊 POSITIVO", s, "#10B981"
    elif s < -thr:
        return "😠 NEGATIVO", s, "#EF4444"
    else:
        return "😐 NEUTRO", s, "#6B7280"
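# Worked example with the default weights (0.4, 0.3, 0.2, 0.1, summing to 1) and thr=0.2:
#   scores multilingual=+0.9, bert=+0.5, lexico=+1.0, textblob=+0.35
#   s = (0.9*0.4 + 0.5*0.3 + 1.0*0.2 + 0.35*0.1) / 1.0 = 0.745 > 0.2  ->  "😊 POSITIVO"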
def detect_lang(text):
    try:
        return detect(text)
    except Exception:
        return "unknown"
# ==============================
# Text analysis (UI)
# ==============================
def analyze_text(text, w_multi, w_bert, w_lex, w_tb, thr):
    text = (text or "").strip()
    if not text:
        return "❌ Ingresa un texto", "", "", ""
    lang = detect_lang(text)
    models = model_scores(text)
    label, final, color = fuse_scores(models, w_multi, w_bert, w_lex, w_tb, thr)
    nlp, _ = load_spacy()
    header = f"""
    <div style='background:{color}22; border-left:6px solid {color}; padding:16px; border-radius:10px'>
      <div style='display:flex; justify-content:space-between; align-items:center'>
        <h2 style='margin:0; color:{color}'>{label}</h2>
        <code style='opacity:0.8'>Idioma detectado: {lang}</code>
      </div>
      <p style='margin:4px 0'><b>Puntuación combinada:</b> {final:.3f}</p>
      <p style='margin:4px 0'><b>Longitud:</b> {len(text)} caracteres</p>
    </div>
    """
    # Per-model details
    def block(name, d):
        if 'error' in d:
            return f"<div><b>{name}</b><br><span style='color:#EF4444'>Error: {d['error']}</span></div>"
        rows = []
        for k, v in d.items():
            if isinstance(v, float):
                rows.append(f"{k}: {v:.3f}")
            else:
                rows.append(f"{k}: {v}")
        return f"<div style='padding:8px; border:1px solid #e5e7eb; border-radius:8px'><b>{name}</b><br>" + "<br>".join(rows) + "</div>"
    details = (
        "<h3>📊 Resultados por método</h3>"
        "<div style='display:grid; gap:10px; grid-template-columns: repeat(auto-fit,minmax(240px,1fr))'>"
        + block("Multilingual", models.get('multilingual', {}))
        + block("BERT (estrellas)", models.get('bert', {}))
        + block("Léxico (ES)", models.get('lexico', {}))
        + block("TextBlob", models.get('textblob', {}))
        + "</div>"
    )
    # Lexicon highlighting
    highlighted = highlight_words(text, nlp)
    highlight_html = f"""
    <h3>🔎 Palabras clave detectadas</h3>
    <div style='padding:12px; border:1px dashed #d1d5db; border-radius:10px'>{highlighted}</div>
    """
    # Linguistic summary
    if nlp:
        doc = nlp(text)
        ents = "<br>".join(f"• {e.text} ({e.label_})" for e in list(doc.ents)[:8]) or "—"
        ling = f"""
        <h3>📝 Análisis lingüístico (spaCy)</h3>
        <ul>
          <li>Tokens: {len(doc)}</li>
          <li>Palabras: {len([t for t in doc if t.is_alpha])}</li>
          <li>Oraciones: {len(list(doc.sents))}</li>
          <li>Entidades: {len(doc.ents)}</li>
        </ul>
        <p><b>Entidades detectadas:</b><br>{ents}</p>
        """
    else:
        ling = "<p style='color:#EF4444'>spaCy no disponible (modelo es_core_news_sm no instalado)</p>"
    return header, details, highlight_html, ling
# ==============================
# Excel/CSV
# ==============================
def analyze_file(file, max_rows, text_cols_manual, w_multi, w_bert, w_lex, w_tb, thr):
    if file is None:
        return pd.DataFrame([{"Resultado": "❌ Sube un archivo .xlsx o .csv"}])
    # gr.File may hand us a path string or a tempfile-like object with .name
    path = file if isinstance(file, str) else getattr(file, "name", "")
    try:
        if path.lower().endswith(".csv"):
            df = pd.read_csv(path)
        else:
            df = pd.read_excel(path)
    except Exception as e:
        return pd.DataFrame([{"Error": f"❌ No pude leer el archivo: {e}"}])
    # Auto-detect text columns when none are specified
    if text_cols_manual:
        cols = [c.strip() for c in text_cols_manual.split(",") if c.strip() in df.columns]
    else:
        cols = []
        for c in df.columns:
            if df[c].dtype == "object":
                sample = df[c].dropna().astype(str).head(5).tolist()
                if any(len(s.split()) >= 5 for s in sample):
                    cols.append(c)
        cols = cols[:2]  # at most 2 columns by default
    if not cols:
        return pd.DataFrame([{"Resultado": "❌ No encontré columnas de texto (o especifica manualmente)"}])
    records = []
    for c in cols:
        for i, text in enumerate(df[c].dropna().astype(str).head(max_rows), start=1):
            models = model_scores(text)
            label, s, _ = fuse_scores(models, w_multi, w_bert, w_lex, w_tb, thr)
            records.append({
                "Columna": c,
                "Fila": i,
                "Texto": (text[:140] + "...") if len(text) > 140 else text,
                "Sentimiento": label.replace("😊 ", "").replace("😠 ", "").replace("😐 ", ""),
                "Score": round(s, 3),
                "Len": len(text)
            })
    return pd.DataFrame.from_records(records)
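# Illustrative input (hypothetical file): a CSV such as
#   comentario,fecha
#   "Excelente servicio, lo recomiendo a todo el mundo",2024-01-02
# Auto-detection keeps object-dtype columns where any of the first 5 non-null
# cells has at least 5 words, so "comentario" is analyzed and "fecha" is skipped.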
# ==============================
# UI
# ==============================
with gr.Blocks(theme="soft", title=APP_TITLE, css="""
#component-0 .hover\\:bg-red-500:hover{ background: none }
.markdown-body h1, .markdown-body h2 { margin-top:0 }
""") as demo:
    gr.Markdown(f"""
# {APP_TITLE}
**Combina múltiples modelos, léxico y análisis lingüístico. Ajusta pesos y genera insights épicos.**
""")
    with gr.Tab("📝 Texto individual"):
        with gr.Row():
            with gr.Column(scale=5):
                text_in = gr.Textbox(label="Texto", lines=6, placeholder="Escribe aquí en ES/EN/FR/PT...")
                with gr.Accordion("⚙️ Pesos y umbral", open=False):
                    w_multi = gr.Slider(0, 1, value=0.4, step=0.05, label="Peso Multilingual")
                    w_bert = gr.Slider(0, 1, value=0.3, step=0.05, label="Peso BERT")
                    w_lex = gr.Slider(0, 1, value=0.2, step=0.05, label="Peso Léxico")
                    w_tb = gr.Slider(0, 1, value=0.1, step=0.05, label="Peso TextBlob")
                    thr = gr.Slider(0, 1, value=0.2, step=0.01, label="Umbral de neutro (|score| ≤ umbral)")
                btn = gr.Button("🔍 Analizar", variant="primary")
                gr.Examples(
                    examples=[
                        ["Me encanta este producto, superó mis expectativas y lo recomiendo."],
                        ["Pésimo servicio, llegó tarde y defectuoso. Muy decepcionado."],
                        ["El producto cumple, pero no destaca. Está bien por el precio."],
                        ["I absolutely love it! Great quality and fast delivery."],
                        ["C'est un service horrible, je ne le recommande à personne."],
                        ["O atendimento foi excelente e o produto é ótimo."]
                    ],
                    inputs=[text_in]
                )
            with gr.Column(scale=5):
                head = gr.HTML(label="🎯 Resultado")
                methods = gr.HTML(label="📊 Detalles por modelo")
                highlights = gr.HTML(label="🔎 Palabras clave")
                ling = gr.HTML(label="📝 Lingüística")
        btn.click(analyze_text, [text_in, w_multi, w_bert, w_lex, w_tb, thr], [head, methods, highlights, ling])
        text_in.submit(analyze_text, [text_in, w_multi, w_bert, w_lex, w_tb, thr], [head, methods, highlights, ling])
    with gr.Tab("📈 Lote (Excel/CSV)"):
        with gr.Row():
            with gr.Column(scale=5):
                f = gr.File(label="Sube .xlsx o .csv")
                max_rows = gr.Slider(5, 500, value=100, step=5, label="Filas máximas por columna")
                text_cols_manual = gr.Textbox(label="Columnas de texto (opcional, separadas por coma)")
                with gr.Accordion("⚙️ Pesos y umbral", open=False):
                    w_multi2 = gr.Slider(0, 1, value=0.4, step=0.05, label="Peso Multilingual")
                    w_bert2 = gr.Slider(0, 1, value=0.3, step=0.05, label="Peso BERT")
                    w_lex2 = gr.Slider(0, 1, value=0.2, step=0.05, label="Peso Léxico")
                    w_tb2 = gr.Slider(0, 1, value=0.1, step=0.05, label="Peso TextBlob")
                    thr2 = gr.Slider(0, 1, value=0.2, step=0.01, label="Umbral de neutro")
                btn2 = gr.Button("🚀 Analizar archivo", variant="primary")
            with gr.Column(scale=5):
                df_out = gr.Dataframe(wrap=True, label="Resultados")
                dl = gr.DownloadButton(label="⬇️ Descargar CSV", value=None)

        def _pipe(file, max_rows, text_cols_manual, w1, w2, w3, w4, thr):
            df = analyze_file(file, int(max_rows), text_cols_manual, w1, w2, w3, w4, thr)
            # DownloadButton expects a file path, so write the CSV to a temp file
            try:
                tmp = tempfile.NamedTemporaryFile(suffix=".csv", delete=False)
                tmp.close()
                df.to_csv(tmp.name, index=False, encoding="utf-8")
                return df, tmp.name
            except Exception:
                return df, None

        btn2.click(_pipe,
                   [f, max_rows, text_cols_manual, w_multi2, w_bert2, w_lex2, w_tb2, thr2],
                   [df_out, dl])
    with gr.Tab("ℹ️ Sistema & Modelos"):
        spacy_status = load_spacy()[1]
        m1_status = load_multilingual_sentiment()[1]
        m2_status = load_multilingual_bert()[1]
        gr.Markdown(f"""
### Estado de modelos
- {spacy_status}
- {m1_status}
- {m2_status}

### Cómo mejorar precisión
- Ajusta pesos según tu dominio (por ejemplo, más peso al léxico para español coloquial).
- Entrena un diccionario propio con palabras frecuentes de tus clientes.
- Limpia el texto (remueve spam, URLs, firmas) antes de analizar.
- Para grandes volúmenes, considera un modelo fine-tuned con tus datos.
""")

if __name__ == "__main__":
    demo.launch()