# Modified RAG Pipeline for General Document Q&A (Khmer & English)

import os
import logging

import torch
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from openai import OpenAI

logging.basicConfig(level=logging.INFO)

use_gpu = torch.cuda.is_available()
if use_gpu:
    print("CUDA device in use:", torch.cuda.get_device_name(0))
else:
    print("Running on CPU. No GPU detected.")

# Load the API key from HF Space secrets.
SEALION_API_KEY = os.environ.get("SEALION_API_KEY")
client = OpenAI(
    api_key=SEALION_API_KEY,
    base_url="https://api.sea-lion.ai/v1",
)

# Use Hugging Face's writable home directory for data and the vector store.
WRITABLE_DIR = os.environ.get("HOME", "/app")
DATA_PATH = os.path.join(WRITABLE_DIR, "src", "data")
CHROMA_PATH = os.path.join(WRITABLE_DIR, "src", "chroma")

embedding_model = HuggingFaceEmbeddings(model_name="intfloat/multilingual-e5-base")

# Generic assistant prompt for dual Khmer/English answering.
PROMPT_TEMPLATE = """
Respond ONLY in the same language as the question.
If the question is in English, answer in English.
If the question is in Khmer, answer in Khmer.

You are a helpful assistant. Use only the provided context below to answer the question.
Do not mention the context or that you used it.

Context:
{context}

Question: {question}

Answer:
"""


def load_documents():
    """Load every PDF in DATA_PATH."""
    loader = PyPDFDirectoryLoader(DATA_PATH)
    return loader.load()


def split_text(documents: list[Document]):
    """Split documents into overlapping chunks for embedding."""
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=512,
        chunk_overlap=100,
        length_function=len,
        add_start_index=True,
    )
    chunks = splitter.split_documents(documents)
    logging.info(f"Split {len(documents)} documents into {len(chunks)} chunks.")
    return chunks


def save_to_chroma(chunks: list[Document]):
    """Add chunks to the existing Chroma DB, or create a new one if none exists."""
    if os.path.exists(CHROMA_PATH):
        db = Chroma(persist_directory=CHROMA_PATH, embedding_function=embedding_model)
        db.add_documents(chunks)
        logging.info("Added documents to existing Chroma DB.")
    else:
        db = Chroma.from_documents(chunks, embedding_model, persist_directory=CHROMA_PATH)
        logging.info("Created new Chroma DB.")
    db.persist()
    logging.info(f"Saved {len(chunks)} chunks to Chroma.")


def generate_data_store():
    """Full ingestion pipeline: load PDFs, chunk them, persist the embeddings."""
    documents = load_documents()
    chunks = split_text(documents)
    save_to_chroma(chunks)


def ask_question(query_text: str, k: int = 3):
    """Retrieve the top-k chunks for the query and ask SEA-LION to answer from them."""
    logging.info("Processing user question...")
    db = Chroma(persist_directory=CHROMA_PATH, embedding_function=embedding_model)
    results = db.similarity_search(query_text, k=k)

    context_chunks = []
    for doc in results:
        meta = doc.metadata or {}
        context_chunks.append({
            "filename": os.path.basename(meta.get("source", "unknown.pdf")),
            "page": meta.get("page", 1),
            "text": doc.page_content.strip(),
        })

    context_text = "\n\n".join(chunk["text"] for chunk in context_chunks)
    prompt = PROMPT_TEMPLATE.format(context=context_text, question=query_text)
    messages = [{"role": "user", "content": prompt}]

    try:
        logging.info("Sending prompt to SEA-LION API...")
        completion = client.chat.completions.create(
            model="aisingapore/Llama-SEA-LION-v3.5-8B-R",
            messages=messages,
            extra_body={
                "chat_template_kwargs": {"thinking_mode": "off"},
                "cache": {"no-cache": True},
            },
            max_tokens=512,
        )
        answer = completion.choices[0].message.content.strip()
    except Exception as e:
        logging.error(f"Error calling SEA-LION API: {e}")
answer = "Sorry, something went wrong when contacting the language model." return answer, context_chunks