import os, glob, re

import gradio as gr
from docx import Document
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# --- QA (transformers) ---
from transformers import pipeline
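# Assumed runtime dependencies (e.g. in the Space's requirements.txt): gradio,
# python-docx, scikit-learn, transformers and torch (the backend the QA
# pipeline runs on).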
# ------------------ Config ------------------
DOCS_DIR = "."       # .docx files in the root of the Space
CHUNK_SIZE = 900     # chunk length (characters)
OVERLAP = 150        # overlap between consecutive chunks
TOP_K_RETRIEVE = 5   # candidate chunks passed to the QA model
TOP_K_SHOW = 3       # chunks shown in "fragments" mode
QA_MODEL = "mrm8488/distill-bert-base-spanish-wwm-cased-finetuned-spa-squad2-es"
QA_THRESHOLD = 0.25  # minimum confidence threshold for the QA model
# Stopwords (short Spanish list)
SPANISH_STOPWORDS = [
    "de","la","que","el","en","y","a","los","del","se","las","por","un","para","con",
    "no","una","su","al","lo","como","más","pero","sus","le","ya","o","fue","este",
    "ha","sí","porque","esta","son","entre","cuando","muy","sin","sobre","también",
    "me","hasta","hay","donde","quien","desde","todo","nos","durante","todos","uno",
    "les","ni","contra","otros","ese","eso","ante","ellos","e","esto","mí","antes",
    "algunos","qué","unos","yo","otro","otras","otra","él","tanto","esa","estos",
    "mucho","quienes","nada","muchos","cual","poco","ella","estar","estas","algunas",
    "algo","nosotros","mi","mis","tú","te","ti","tu","tus","ellas","nosotras","vosotros",
    "vosotras","os","mío","mía","míos","mías","tuyo","tuya","tuyos","tuyas","suyo",
    "suya","suyos","suyas","nuestro","nuestra","nuestros","nuestras","vuestro",
    "vuestra","vuestros","vuestras","esos","esas","estoy","estás","está","estamos",
    "estáis","están","ser","soy","eres","somos","sois","era","eras","éramos","erais","eran"
]
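# Note: scikit-learn's stop_words parameter only has a built-in list for
# English, so the custom Spanish list above is passed to TfidfVectorizer.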
# ------------------ Utilities ------------------
def _read_docx(path: str) -> str:
    doc = Document(path)
    parts = [p.text.strip() for p in doc.paragraphs if p.text and p.text.strip()]
    return "\n".join(parts)

def _chunk(text: str, size: int = CHUNK_SIZE, overlap: int = OVERLAP):
    text = re.sub(r"\s+", " ", text).strip()
    if not text:
        return []
    chunks, i = [], 0
    step = max(1, size - overlap)
    while i < len(text):
        chunks.append(text[i:i+size])
        i += step
    return chunks
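# Worked example (illustrative): with CHUNK_SIZE=900 and OVERLAP=150 the window
# advances in steps of 900 - 150 = 750 characters, so a 2000-character document
# yields chunks [0:900], [750:1650] and [1500:2000]; the 150-character overlap
# means text near a cut point still appears whole in one of the two chunks.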
# ------------------ Indexing ------------------
corpus, sources = [], []
indexed_files, skipped_files = [], []

def build_index():
    global corpus, sources, indexed_files, skipped_files, vectorizer, X
    corpus, sources = [], []
    indexed_files, skipped_files = [], []
    for path in sorted(glob.glob(os.path.join(DOCS_DIR, "*.docx"))):
        try:
            txt = _read_docx(path)
            chs = _chunk(txt)
            if chs:
                corpus.extend(chs)
                sources.extend([path] * len(chs))
                indexed_files.append(os.path.basename(path))
            else:
                skipped_files.append((os.path.basename(path), "No usable text"))
        except Exception as e:
            skipped_files.append((os.path.basename(path), f"Read error: {e}"))
    if not corpus:
        corpus[:] = ["(No indexed text: add .docx files with content)"]
        sources[:] = [""]
    vectorizer = TfidfVectorizer(stop_words=SPANISH_STOPWORDS, lowercase=True)
    X = vectorizer.fit_transform(corpus)

build_index()
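# At this point X is a sparse TF-IDF matrix of shape (n_chunks, vocabulary_size).
# A question is projected into the same space with vectorizer.transform() and
# compared against every row of X by cosine similarity (see answer_qa below).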
# ------------------ QA ------------------
qa = pipeline("question-answering", model=QA_MODEL)
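# Instantiating the pipeline downloads the model weights from the Hugging Face
# Hub on the first run, so the initial startup of the Space can take a while;
# subsequent startups use the local cache.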
def answer_qa(question: str):
    """Run QA over the TOP_K_RETRIEVE best chunks and return the best answer."""
    q = vectorizer.transform([question])
    sims = cosine_similarity(q, X).ravel()
    top_idx = sims.argsort()[::-1][:TOP_K_RETRIEVE]
    best = None
    for i in top_idx:
        context = corpus[i]
        res = qa(question=question, context=context)
        # res: {'score': float, 'start': int, 'end': int, 'answer': str}
        candidate = {
            "text": res.get("answer", "").strip(),
            "score": float(res.get("score", 0.0)),
            "source": os.path.basename(sources[i]),
            "context": context
        }
        if not best or candidate["score"] > best["score"]:
            best = candidate
    return best
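# Illustrative call (the question and all output values are hypothetical):
#   answer_qa("¿Cuándo vence el contrato?")
#   -> {"text": "el 31 de diciembre", "score": 0.78,
#       "source": "contrato.docx", "context": "..."}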
# ------------------ UI functions ------------------
def _top_fragments(message: str, k: int = TOP_K_SHOW):
    """Return markdown bullets for the k chunks most similar to the message."""
    q = vectorizer.transform([message])
    sims = cosine_similarity(q, X).ravel()
    top_idx = sims.argsort()[::-1][:k]
    bullets = []
    for i in top_idx:
        frag = corpus[i]
        src = os.path.basename(sources[i])
        bullets.append(f"**{src}** · …{frag[:420]}…")
    return bullets

def chat_fn(message, history, modo_qa):
    if "(No indexed text" in corpus[0]:
        return "No text indexed yet. Check that the .docx files have content."
    if modo_qa:
        best = answer_qa(message)
        if best and best["text"] and best["score"] >= QA_THRESHOLD:
            return (
                f"**Answer:** {best['text']}\n\n"
                f"**Source:** {best['source']}  \n*(confidence: {best['score']:.2f})*"
            )
        # fall back to fragments when confidence is low
        return (
            "I can't answer with enough confidence. Here are the closest fragments:\n\n- "
            + "\n- ".join(_top_fragments(message))
        )
    # fragments mode (retrieval only)
    return "Relevant fragments:\n\n- " + "\n- ".join(_top_fragments(message))
def status_fn():
    lines = []
    if indexed_files:
        lines.append("**Indexed files:**")
        for f in indexed_files:
            lines.append(f"- {f}")
    if skipped_files:
        lines.append("\n**Skipped files:**")
        for f, why in skipped_files:
            lines.append(f"- {f}: {why}")
    if not lines:
        lines.append("No .docx files found in the directory.")
    return "\n".join(lines)
# ------------------ Gradio interface ------------------
with gr.Blocks() as demo:
    gr.Markdown("## Document chat (DOCX) - with natural answers (QA)")
    gr.Markdown(
        "Enable **Natural answer (QA)** to have the system try to answer in Spanish "
        "from the most relevant chunk; if confidence is low, it shows fragments instead."
    )
    with gr.Tabs():
        with gr.Tab("Chat"):
            modo_qa = gr.Checkbox(label="Natural answer (QA)", value=True)
            # Pass the checkbox as an additional input so chat_fn receives its
            # live value; reading modo_qa.value inside a lambda would only ever
            # capture the initial default, not later changes made in the UI.
            chat = gr.ChatInterface(
                fn=chat_fn,
                additional_inputs=[modo_qa],
                title=None, description=None
            )
        with gr.Tab("Status"):
            btn = gr.Button("Refresh status")
            out = gr.Markdown(status_fn())
            btn.click(fn=status_fn, outputs=out)

if __name__ == "__main__":
    demo.launch()