# Modified RAG Pipeline for General Document Q&A (Khmer & English)
import os
import logging
import torch
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from langchain.vectorstores.chroma import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.document_loaders import PyPDFDirectoryLoader
from openai import OpenAI
logging.basicConfig(level=logging.INFO)
use_gpu = torch.cuda.is_available()
if use_gpu:
    print("CUDA device in use:", torch.cuda.get_device_name(0))
else:
    print("Running on CPU. No GPU detected.")
# Load API key from HF Space secrets
SEALION_API_KEY = os.environ.get("SEALION_API_KEY")
if not SEALION_API_KEY:
    logging.warning("SEALION_API_KEY is not set; API calls will fail.")
client = OpenAI(
    api_key=SEALION_API_KEY,
    base_url="https://api.sea-lion.ai/v1"
)
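# The SEA-LION endpoint speaks the OpenAI chat-completions protocol, so the
# stock OpenAI client works once base_url points at https://api.sea-lion.ai/v1.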
# Use Hugging Face's writable directory
WRITABLE_DIR = os.environ.get("HOME", "/app")
DATA_PATH = os.path.join(WRITABLE_DIR, "src", "data")
CHROMA_PATH = os.path.join(WRITABLE_DIR, "src", "chroma")
embedding_model = HuggingFaceEmbeddings(model_name="intfloat/multilingual-e5-base")
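# Note: the multilingual-e5 family is trained with "query: " / "passage: "
# prefixes; retrieval quality may improve if documents and queries are
# prefixed accordingly (not done in this pipeline).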
# Generic assistant prompt for dual Khmer/English
PROMPT_TEMPLATE = """
Respond ONLY in the same language as the question.
If the question is in English, answer in English.
If the question is in Khmer, answer in Khmer.
You are a helpful assistant.
Use only the provided context below to answer the question.
Do not mention the context or that you used it.
Context:
{context}
Question:
{question}
Answer:
"""
def load_documents():
    """Load every PDF under DATA_PATH as LangChain documents."""
    loader = PyPDFDirectoryLoader(DATA_PATH)
    return loader.load()
def split_text(documents: list[Document]):
    """Split documents into overlapping chunks for embedding."""
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=512, chunk_overlap=100, length_function=len, add_start_index=True
    )
    chunks = splitter.split_documents(documents)
    logging.info(f"Split {len(documents)} documents into {len(chunks)} chunks.")
    return chunks
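# Note: chunk_size above counts characters (length_function=len), not tokens.
# Khmer script does not put spaces between words, so character-based splitting
# is a reasonable default for mixed Khmer/English text.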
def save_to_chroma(chunks: list[Document]):
    """Embed chunks and persist them to the Chroma store."""
    if os.path.exists(CHROMA_PATH):
        # Append to the existing index rather than rebuilding it.
        db = Chroma(persist_directory=CHROMA_PATH, embedding_function=embedding_model)
        db.add_documents(chunks)
        logging.info("Added documents to existing Chroma DB.")
    else:
        db = Chroma.from_documents(
            chunks, embedding_model, persist_directory=CHROMA_PATH
        )
        logging.info("Created new Chroma DB.")
    db.persist()
    logging.info(f"Saved {len(chunks)} chunks to Chroma.")
def generate_data_store():
    """Full ingestion pipeline: load PDFs, split, embed, and persist."""
    documents = load_documents()
    chunks = split_text(documents)
    save_to_chroma(chunks)
def ask_question(query_text: str, k: int = 3):
    """Retrieve the top-k chunks for the query and answer via the SEA-LION API."""
    logging.info("Processing user question...")
    db = Chroma(persist_directory=CHROMA_PATH, embedding_function=embedding_model)
    results = db.similarity_search(query_text, k=k)
    context_chunks = []
    for doc in results:
        meta = doc.metadata or {}
        context_chunks.append({
            "filename": os.path.basename(meta.get("source", "unknown.pdf")),
            "page": meta.get("page", 1),  # note: PyPDF's "page" metadata is 0-indexed
            "text": doc.page_content.strip()
        })
    context_text = "\n\n".join(chunk["text"] for chunk in context_chunks)
    prompt = PROMPT_TEMPLATE.format(context=context_text, question=query_text)
    messages = [{"role": "user", "content": prompt}]
    try:
        logging.info("Sending prompt to SEA-LION API...")
        completion = client.chat.completions.create(
            model="aisingapore/Llama-SEA-LION-v3.5-8B-R",
            messages=messages,
            extra_body={
                "chat_template_kwargs": {
                    "thinking_mode": "off"
                },
                "cache": {
                    "no-cache": True
                }
            },
            max_tokens=512
        )
        answer = completion.choices[0].message.content.strip()
    except Exception as e:
        logging.error(f"Error calling SEA-LION API: {e}")
        answer = "Sorry, something went wrong when contacting the language model."
    return answer, context_chunks
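# Minimal usage sketch (assumed entry point; the Space may instead call these
# functions from a separate UI module, and the sample question is illustrative):
if __name__ == "__main__":
    if not os.path.exists(CHROMA_PATH):
        generate_data_store()
    answer, sources = ask_question("What is this document about?")
    print(answer)
    for chunk in sources:
        print(f"- {chunk['filename']} (page {chunk['page']})")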