#!/usr/bin/env python
# coding: utf-8

# In[1]:

get_ipython().system('pip install docling chromadb sentence-transformers')

# In[2]:

get_ipython().system('pip install pymupdf tqdm spacy')
get_ipython().system('python -m spacy download it_core_news_sm')

# In[3]:

get_ipython().system('pip install transformers')

# In[4]:
import re

import fitz  # PyMuPDF
from tqdm.auto import tqdm
import pandas as pd


def text_formatter(text: str) -> str:
    # Simple cleanup: collapse whitespace and strip recurring headers/footers
    text = text.replace("\n", " ").strip()
    text = re.sub(r"[ \t]{2,}", " ", text)
    text = re.sub(r"\.{2,}", " ", text)  # replace runs of dots (dot leaders) with a space
    text = re.sub(r"Pagina\s+\d+\s+di\s+\d+", "", text, flags=re.IGNORECASE)
    text = re.sub(r"Creazione VM su Cloud INSIEL", "", text)
    text = re.sub(r"IO_XX_00_XX ISTRUZIONE OPERATIVA 22/10/2024", "", text)
    return text.strip()
def open_and_read_pdf(pdf_path: str):
    doc = fitz.open(pdf_path)
    pages = []
    for page_number, page in tqdm(enumerate(doc), total=len(doc), desc="📄 Reading PDF pages"):
        text = text_formatter(page.get_text())
        pages.append({
            "page_number": page_number + 1,
            "page_char_count": len(text),
            "page_word_count": len(text.split()),
            "page_token_estimate": len(text) // 4,  # rough heuristic: ~4 characters per token
            "text": text
        })
    return pages


pdf_path = "data/insiel.pdf"  # Change this if your file lives elsewhere
pages_and_texts = open_and_read_pdf(pdf_path)
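# The ~4-chars-per-token estimate above is only a heuristic. A minimal, optional
# sanity check (an assumption, not part of the original pipeline): compare it
# against the tokenizer of the embedding model used later in this notebook.
from transformers import AutoTokenizer

_tok = AutoTokenizer.from_pretrained("sentence-transformers/distiluse-base-multilingual-cased-v1")
sample = pages_and_texts[0]["text"]
print("heuristic:", len(sample) // 4, "tokenizer:", len(_tok.tokenize(sample)))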
# In[5]:

import spacy

nlp = spacy.load("it_core_news_sm")

CHUNK_SIZE = 10  # groups of 10 sentences

# Split each page's text into sentences, then group them into chunks
for page in tqdm(pages_and_texts, desc="✂️ Splitting into sentences"):
    doc = nlp(page["text"])
    sentences = [sent.text.strip() for sent in doc.sents if sent.text.strip()]
    page["sentence_chunks"] = []
    for i in range(0, len(sentences), CHUNK_SIZE):
        chunk = sentences[i:i + CHUNK_SIZE]
        page["sentence_chunks"].append(chunk)
# In[6]:

pages_and_texts[65]

# In[7]:

df = pd.DataFrame(pages_and_texts)
df["chunk_id"] = df.index.astype(str)

# Show the last rows
df.tail()
# In[8]:

df.shape

# In[9]:

df[df['page_token_estimate'] < 60].count()

# In[10]:

# Keep only pages with more than ~60 estimated tokens
final = df[df['page_token_estimate'] > 60]

# In[11]:

final.describe().round(2)
# In[12]:

get_ipython().system('pip install sentence-transformers chromadb')

# In[13]:

from sentence_transformers import SentenceTransformer

embedding_model = SentenceTransformer("sentence-transformers/distiluse-base-multilingual-cased-v1")

texts = final["text"].tolist()
chunk_ids = final["chunk_id"].tolist()
metadatas = [{"page": int(p)} for p in final["page_number"]]

embeddings = embedding_model.encode(texts, show_progress_bar=True)
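# Quick shape check (illustrative, not in the original notebook):
# distiluse-base-multilingual-cased-v1 produces 512-dimensional vectors, and
# Chroma will infer this dimensionality from the first add() below.
print(embeddings.shape)  # expected: (len(texts), 512)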
# In[14]:

import chromadb

# New persistent client
client = chromadb.PersistentClient(path="./vectorstore")

# Collection
collection = client.get_or_create_collection("insiel_chunks")

# Add the page texts with their embeddings, metadata, and ids
collection.add(
    documents=texts,
    embeddings=embeddings.tolist(),
    metadatas=metadatas,
    ids=chunk_ids
)
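# Optional sanity check (an assumption, purely illustrative): confirm the
# collection is populated and that a probe phrase retrieves something sensible.
print("indexed chunks:", collection.count())
probe = collection.query(
    query_embeddings=embedding_model.encode(["creazione di una VM"]).tolist(),
    n_results=1,
)
print(probe["metadatas"][0])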
# In[16]:

# Dry-run retrieval: embed a question and fetch the closest chunks
query = input("Question: ")
query_embedding = embedding_model.encode([query])

results = collection.query(
    query_embeddings=query_embedding.tolist(),
    n_results=3  # you can raise this to 5, 10, etc.
)

# In[17]:

for i, (doc, meta) in enumerate(zip(results["documents"][0], results["metadatas"][0])):
    print(f"\n🔹 RESULT {i+1} (page {meta['page']}):")
    print(doc[:500] + "...\n---")
# In[18]:

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id).to(torch.device("cpu"))

rag_chat = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=200, device=-1)
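# Optional smoke test (an assumption, not in the original notebook): make sure
# the model actually generates on CPU before wiring it into the RAG loop.
print(rag_chat("Hello, how are you?")[0]["generated_text"][:200])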
# In[ ]:

def generate_rag_response_local(query, retrieved_chunks):
    context = "\n\n".join(retrieved_chunks)
    # TinyLlama-Chat is trained on the Zephyr-style chat format, so build the
    # prompt with the tokenizer's chat template rather than Llama-2 [INST] tags
    messages = [
        {"role": "system", "content": (
            "Use only the information provided in the context below to answer the question; "
            "the answer must always end with a period. If the answer is not present, "
            "state clearly that it is not specified in the document."
        )},
        {"role": "user", "content": f"Context:\n{context}\n\nQuestion: {query}"},
    ]
    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    result = rag_chat(prompt)[0]["generated_text"]
    # The pipeline echoes the prompt, so keep only the newly generated part
    return result[len(prompt):].strip()
# In[ ]:

# 🧠 Ask a question
query = input("Question: ")

# 🔎 Embed the query (use sentence-transformers, NOT the generative model!)
query_embedding = embedding_model.encode([query])

# 🔍 Retrieve the most similar chunks from Chroma
results = collection.query(
    query_embeddings=query_embedding.tolist(),
    n_results=3
)

# 🧱 Extract the context chunks
retrieved_chunks = results["documents"][0]

# 🤖 Generate the answer with the local open-source model
response = generate_rag_response_local(query, retrieved_chunks)

# 🖨️ Print the answer
print("🤖 Answer:\n", response)
# In[ ]:

retrieved_chunks

# In[ ]:

results

# In[ ]: