#!/usr/bin/env python
# coding: utf-8

# In[1]:

get_ipython().system('pip install docling chromadb sentence-transformers')

# In[2]:

get_ipython().system('pip install pymupdf tqdm spacy')
get_ipython().system('python -m spacy download it_core_news_sm')

# In[3]:

get_ipython().system('pip install transformers')

# In[4]:
import re

import fitz  # PyMuPDF
from tqdm.auto import tqdm
import pandas as pd


def text_formatter(text: str) -> str:
    # Simple cleanup: collapse whitespace and strip recurring headers/footers
    text = text.replace("\n", " ").strip()
    text = re.sub(r"[ \t]{2,}", " ", text)
    text = re.sub(r"\.{2,}", " ", text)  # replace runs of dots (dot leaders) with a space
    text = re.sub(r"Pagina\s+\d+\s+di\s+\d+", "", text, flags=re.IGNORECASE)
    text = re.sub(r"Creazione VM su Cloud INSIEL", "", text)
    text = re.sub(r"IO_XX_00_XX ISTRUZIONE OPERATIVA 22/10/2024", "", text)
    return text.strip()
def open_and_read_pdf(pdf_path: str):
    doc = fitz.open(pdf_path)
    pages = []
    for page_number, page in tqdm(enumerate(doc), total=len(doc), desc="📄 Reading PDF pages"):
        text = text_formatter(page.get_text())
        pages.append({
            "page_number": page_number + 1,
            "page_char_count": len(text),
            "page_word_count": len(text.split()),
            "page_token_estimate": len(text) // 4,  # rough heuristic: ~4 characters per token
            "text": text
        })
    return pages


pdf_path = "data/insiel.pdf"  # Change this if your file lives elsewhere
pages_and_texts = open_and_read_pdf(pdf_path)
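# The ~4-chars-per-token estimate above is only a heuristic. A minimal, optional
# sanity check (an assumption, not part of the original pipeline): compare it
# against the tokenizer of the embedding model used later in this notebook.
from transformers import AutoTokenizer

_tok = AutoTokenizer.from_pretrained("sentence-transformers/distiluse-base-multilingual-cased-v1")
sample = pages_and_texts[0]["text"]
print("heuristic:", len(sample) // 4, "tokenizer:", len(_tok.tokenize(sample)))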
# In[5]:

import spacy

nlp = spacy.load("it_core_news_sm")

CHUNK_SIZE = 10  # groups of 10 sentences

# Split each page's text into sentences, then group them into chunks
for page in tqdm(pages_and_texts, desc="✂️ Splitting into sentences"):
    doc = nlp(page["text"])
    sentences = [sent.text.strip() for sent in doc.sents if sent.text.strip()]
    page["sentence_chunks"] = []
    for i in range(0, len(sentences), CHUNK_SIZE):
        chunk = sentences[i:i + CHUNK_SIZE]
        page["sentence_chunks"].append(chunk)
# In[6]:

pages_and_texts[65]

# In[7]:

df = pd.DataFrame(pages_and_texts)
df["chunk_id"] = df.index.astype(str)

# Show the last rows
df.tail()
# In[8]:

df.shape

# In[9]:

df[df['page_token_estimate'] < 60].count()

# In[10]:

# Keep only pages with more than ~60 estimated tokens
final = df[df['page_token_estimate'] > 60]

# In[11]:

final.describe().round(2)
# In[12]:

get_ipython().system('pip install sentence-transformers chromadb')

# In[13]:

from sentence_transformers import SentenceTransformer

embedding_model = SentenceTransformer("sentence-transformers/distiluse-base-multilingual-cased-v1")

texts = final["text"].tolist()
chunk_ids = final["chunk_id"].tolist()
metadatas = [{"page": int(p)} for p in final["page_number"]]

embeddings = embedding_model.encode(texts, show_progress_bar=True)
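# Quick shape check (illustrative, not in the original notebook):
# distiluse-base-multilingual-cased-v1 produces 512-dimensional vectors, and
# Chroma will infer this dimensionality from the first add() below.
print(embeddings.shape)  # expected: (len(texts), 512)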
# In[14]:

import chromadb

# New persistent client
client = chromadb.PersistentClient(path="./vectorstore")

# Collection
collection = client.get_or_create_collection("insiel_chunks")

# Add the page texts with their embeddings, metadata, and ids
collection.add(
    documents=texts,
    embeddings=embeddings.tolist(),
    metadatas=metadatas,
    ids=chunk_ids
)
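# Optional sanity check (an assumption, purely illustrative): confirm the
# collection is populated and that a probe phrase retrieves something sensible.
print("indexed chunks:", collection.count())
probe = collection.query(
    query_embeddings=embedding_model.encode(["creazione di una VM"]).tolist(),
    n_results=1,
)
print(probe["metadatas"][0])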
# In[16]:

# Dry-run retrieval: embed a question and fetch the closest chunks
query = input("Question: ")
query_embedding = embedding_model.encode([query])

results = collection.query(
    query_embeddings=query_embedding.tolist(),
    n_results=3  # you can raise this to 5, 10, etc.
)

# In[17]:

for i, (doc, meta) in enumerate(zip(results["documents"][0], results["metadatas"][0])):
    print(f"\n🔹 RESULT {i+1} (page {meta['page']}):")
    print(doc[:500] + "...\n---")
# In[18]:

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id).to(torch.device("cpu"))

rag_chat = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=200, device=-1)
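# Optional smoke test (an assumption, not in the original notebook): make sure
# the model actually generates on CPU before wiring it into the RAG loop.
print(rag_chat("Hello, how are you?")[0]["generated_text"][:200])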
# In[ ]:

def generate_rag_response_local(query, retrieved_chunks):
    context = "\n\n".join(retrieved_chunks)
    # TinyLlama-Chat is trained on the Zephyr-style chat format, so build the
    # prompt with the tokenizer's chat template rather than Llama-2 [INST] tags
    messages = [
        {"role": "system", "content": (
            "Use only the information provided in the context below to answer the question; "
            "the answer must always end with a period. If the answer is not present, "
            "state clearly that it is not specified in the document."
        )},
        {"role": "user", "content": f"Context:\n{context}\n\nQuestion: {query}"},
    ]
    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    result = rag_chat(prompt)[0]["generated_text"]
    # The pipeline echoes the prompt, so keep only the newly generated part
    return result[len(prompt):].strip()
# In[ ]:

# 🧠 Ask a question
query = input("Question: ")

# 🔎 Embed the query (use sentence-transformers, NOT the generative model!)
query_embedding = embedding_model.encode([query])

# 🔍 Retrieve the most similar chunks from Chroma
results = collection.query(
    query_embeddings=query_embedding.tolist(),
    n_results=3
)

# 🧱 Extract the context chunks
retrieved_chunks = results["documents"][0]

# 🤖 Generate the answer with the local open-source model
response = generate_rag_response_local(query, retrieved_chunks)

# 🖨️ Print the answer
print("🤖 Answer:\n", response)
# In[ ]:

retrieved_chunks

# In[ ]:

results

# In[ ]: