import os, glob, re

import gradio as gr
from docx import Document
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# --- QA (transformers) ---
from transformers import pipeline
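# Assumed runtime dependencies (e.g. in the Space's requirements.txt): gradio,
# python-docx, scikit-learn, transformers and torch (the backend the QA
# pipeline runs on).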
# ------------------ Config ------------------
DOCS_DIR = "."       # .docx files in the root of the Space
CHUNK_SIZE = 900     # chunk length (characters)
OVERLAP = 150        # overlap between consecutive chunks
TOP_K_RETRIEVE = 5   # candidate chunks passed to the QA model
TOP_K_SHOW = 3       # chunks shown in "fragments" mode
QA_MODEL = "mrm8488/distill-bert-base-spanish-wwm-cased-finetuned-spa-squad2-es"
QA_THRESHOLD = 0.25  # minimum confidence threshold for the QA model
# Stopwords (short Spanish list)
SPANISH_STOPWORDS = [
    "de","la","que","el","en","y","a","los","del","se","las","por","un","para","con",
    "no","una","su","al","lo","como","más","pero","sus","le","ya","o","fue","este",
    "ha","sí","porque","esta","son","entre","cuando","muy","sin","sobre","también",
    "me","hasta","hay","donde","quien","desde","todo","nos","durante","todos","uno",
    "les","ni","contra","otros","ese","eso","ante","ellos","e","esto","mí","antes",
    "algunos","qué","unos","yo","otro","otras","otra","él","tanto","esa","estos",
    "mucho","quienes","nada","muchos","cual","poco","ella","estar","estas","algunas",
    "algo","nosotros","mi","mis","tú","te","ti","tu","tus","ellas","nosotras","vosotros",
    "vosotras","os","mío","mía","míos","mías","tuyo","tuya","tuyos","tuyas","suyo",
    "suya","suyos","suyas","nuestro","nuestra","nuestros","nuestras","vuestro",
    "vuestra","vuestros","vuestras","esos","esas","estoy","estás","está","estamos",
    "estáis","están","ser","soy","eres","somos","sois","era","eras","éramos","erais","eran"
]
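# Note: scikit-learn's stop_words parameter only has a built-in list for
# English, so the custom Spanish list above is passed to TfidfVectorizer.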
# ------------------ Utilities ------------------
def _read_docx(path: str) -> str:
    doc = Document(path)
    parts = [p.text.strip() for p in doc.paragraphs if p.text and p.text.strip()]
    return "\n".join(parts)

def _chunk(text: str, size: int = CHUNK_SIZE, overlap: int = OVERLAP):
    text = re.sub(r"\s+", " ", text).strip()
    if not text:
        return []
    chunks, i = [], 0
    step = max(1, size - overlap)
    while i < len(text):
        chunks.append(text[i:i+size])
        i += step
    return chunks
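# Worked example (illustrative): with CHUNK_SIZE=900 and OVERLAP=150 the window
# advances in steps of 900 - 150 = 750 characters, so a 2000-character document
# yields chunks [0:900], [750:1650] and [1500:2000]; the 150-character overlap
# means text near a cut point still appears whole in one of the two chunks.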
# ------------------ Indexing ------------------
corpus, sources = [], []
indexed_files, skipped_files = [], []

def build_index():
    global corpus, sources, indexed_files, skipped_files, vectorizer, X
    corpus, sources = [], []
    indexed_files, skipped_files = [], []
    for path in sorted(glob.glob(os.path.join(DOCS_DIR, "*.docx"))):
        try:
            txt = _read_docx(path)
            chs = _chunk(txt)
            if chs:
                corpus.extend(chs)
                sources.extend([path] * len(chs))
                indexed_files.append(os.path.basename(path))
            else:
                skipped_files.append((os.path.basename(path), "No usable text"))
        except Exception as e:
            skipped_files.append((os.path.basename(path), f"Read error: {e}"))
    if not corpus:
        corpus[:] = ["(No indexed text: add .docx files with content)"]
        sources[:] = [""]
    vectorizer = TfidfVectorizer(stop_words=SPANISH_STOPWORDS, lowercase=True)
    X = vectorizer.fit_transform(corpus)

build_index()
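# At this point X is a sparse TF-IDF matrix of shape (n_chunks, vocabulary_size).
# A question is projected into the same space with vectorizer.transform() and
# compared against every row of X by cosine similarity (see answer_qa below).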
# ------------------ QA ------------------
qa = pipeline("question-answering", model=QA_MODEL)
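# Instantiating the pipeline downloads the model weights from the Hugging Face
# Hub on the first run, so the initial startup of the Space can take a while;
# subsequent startups use the local cache.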
def answer_qa(question: str):
    """Run QA over the TOP_K_RETRIEVE best chunks and return the best answer."""
    q = vectorizer.transform([question])
    sims = cosine_similarity(q, X).ravel()
    top_idx = sims.argsort()[::-1][:TOP_K_RETRIEVE]
    best = None
    for i in top_idx:
        context = corpus[i]
        res = qa(question=question, context=context)
        # res: {'score': float, 'start': int, 'end': int, 'answer': str}
        candidate = {
            "text": res.get("answer", "").strip(),
            "score": float(res.get("score", 0.0)),
            "source": os.path.basename(sources[i]),
            "context": context
        }
        if not best or candidate["score"] > best["score"]:
            best = candidate
    return best
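# Illustrative call (the question and all output values are hypothetical):
#   answer_qa("¿Cuándo vence el contrato?")
#   -> {"text": "el 31 de diciembre", "score": 0.78,
#       "source": "contrato.docx", "context": "..."}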
# ------------------ UI functions ------------------
def _top_fragments(message: str, k: int = TOP_K_SHOW):
    """Return markdown bullets for the k chunks most similar to the message."""
    q = vectorizer.transform([message])
    sims = cosine_similarity(q, X).ravel()
    top_idx = sims.argsort()[::-1][:k]
    bullets = []
    for i in top_idx:
        frag = corpus[i]
        src = os.path.basename(sources[i])
        bullets.append(f"**{src}** · …{frag[:420]}…")
    return bullets

def chat_fn(message, history, modo_qa):
    if "(No indexed text" in corpus[0]:
        return "No text indexed yet. Check that the .docx files have content."
    if modo_qa:
        best = answer_qa(message)
        if best and best["text"] and best["score"] >= QA_THRESHOLD:
            return (
                f"**Answer:** {best['text']}\n\n"
                f"**Source:** {best['source']}  \n*(confidence: {best['score']:.2f})*"
            )
        # fall back to fragments when confidence is low
        return (
            "I can't answer with enough confidence. Here are the closest fragments:\n\n- "
            + "\n- ".join(_top_fragments(message))
        )
    # fragments mode (retrieval only)
    return "Relevant fragments:\n\n- " + "\n- ".join(_top_fragments(message))
def status_fn():
    lines = []
    if indexed_files:
        lines.append("**Indexed files:**")
        for f in indexed_files:
            lines.append(f"- {f}")
    if skipped_files:
        lines.append("\n**Skipped files:**")
        for f, why in skipped_files:
            lines.append(f"- {f}: {why}")
    if not lines:
        lines.append("No .docx files found in the directory.")
    return "\n".join(lines)
# ------------------ Gradio interface ------------------
with gr.Blocks() as demo:
    gr.Markdown("## Document chat (DOCX) - with natural answers (QA)")
    gr.Markdown(
        "Enable **Natural answer (QA)** to have the system try to answer in Spanish "
        "from the most relevant chunk; if confidence is low, it shows fragments instead."
    )
    with gr.Tabs():
        with gr.Tab("Chat"):
            modo_qa = gr.Checkbox(label="Natural answer (QA)", value=True)
            # Pass the checkbox as an additional input so chat_fn receives its
            # live value; reading modo_qa.value inside a lambda would only ever
            # capture the initial default, not later changes made in the UI.
            chat = gr.ChatInterface(
                fn=chat_fn,
                additional_inputs=[modo_qa],
                title=None, description=None
            )
        with gr.Tab("Status"):
            btn = gr.Button("Refresh status")
            out = gr.Markdown(status_fn())
            btn.click(fn=status_fn, outputs=out)

if __name__ == "__main__":
    demo.launch()