# Modified RAG Pipeline for General Document Q&A (Khmer & English)

import os
import logging
import torch
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from langchain.vectorstores.chroma import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.document_loaders import PyPDFDirectoryLoader
from openai import OpenAI

logging.basicConfig(level=logging.INFO)

use_gpu = torch.cuda.is_available()

if use_gpu:
    print("CUDA device in use:", torch.cuda.get_device_name(0))
else:
    print("Running on CPU. No GPU detected.")

# Load API key from HF Space secrets
SEALION_API_KEY = os.environ.get("SEALION_API_KEY")
if not SEALION_API_KEY:
    logging.warning("SEALION_API_KEY is not set; requests to the SEA-LION API will fail.")

client = OpenAI(
    api_key=SEALION_API_KEY,
    base_url="https://api.sea-lion.ai/v1"
)

# Use Hugging Face's writable directory
WRITABLE_DIR = os.environ.get("HOME", "/app")

DATA_PATH = os.path.join(WRITABLE_DIR, "src", "data")
CHROMA_PATH = os.path.join(WRITABLE_DIR, "src", "chroma")
embedding_model = HuggingFaceEmbeddings(model_name="intfloat/multilingual-e5-base")

# Generic assistant prompt for dual Khmer/English
PROMPT_TEMPLATE = """
Respond ONLY in the same language as the question. 
If the question is in English, answer in English. 
If the question is in Khmer, answer in Khmer.

You are a helpful assistant. 
Use only the provided context below to answer the question. 
Do not mention the context or that you used it.

Context:
{context}

Question:
{question}

Answer:
"""


def load_documents():
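    """Load every PDF in DATA_PATH into a list of LangChain Documents."""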
    loader = PyPDFDirectoryLoader(DATA_PATH)
    return loader.load()

def split_text(documents: list[Document]):
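    """Split documents into overlapping ~512-character chunks for embedding."""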
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=512, chunk_overlap=100, length_function=len, add_start_index=True
    )
    chunks = splitter.split_documents(documents)
    logging.info(f"Split {len(documents)} documents into {len(chunks)} chunks.")
    return chunks

def save_to_chroma(chunks: list[Document]):
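    """Embed the chunks and add them to the Chroma store at CHROMA_PATH,
    creating the store on first run."""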
    if os.path.exists(CHROMA_PATH):
        db = Chroma(persist_directory=CHROMA_PATH, embedding_function=embedding_model)
        db.add_documents(chunks)
        logging.info("Added documents to existing Chroma DB.")
    else:
        db = Chroma.from_documents(
            chunks, embedding_model, persist_directory=CHROMA_PATH
        )
        logging.info("Created new Chroma DB.")
    db.persist()
    logging.info(f"Saved {len(chunks)} chunks to Chroma.")

def generate_data_store():
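    """Run the full ingestion pipeline: load PDFs, split them, index in Chroma."""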
    documents = load_documents()
    chunks = split_text(documents)
    save_to_chroma(chunks)

def ask_question(query_text: str, k: int = 3):
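    """Retrieve the top-k most similar chunks from Chroma and answer the question
    with the SEA-LION chat model, using only that retrieved context.

    Returns the model's answer and the list of retrieved context chunks."""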
    logging.info("Processing user question...")

    db = Chroma(persist_directory=CHROMA_PATH, embedding_function=embedding_model)
    results = db.similarity_search(query_text, k=k)

    context_chunks = []
    for doc in results:
        meta = doc.metadata or {}
        context_chunks.append({
            "filename": os.path.basename(meta.get("source", "unknown.pdf")),
            "page": meta.get("page", 1),
            "text": doc.page_content.strip()
        })

    context_text = "\n\n".join(chunk["text"] for chunk in context_chunks)
    prompt = PROMPT_TEMPLATE.format(context=context_text, question=query_text)

    messages = [{"role": "user", "content": prompt}]

    try:
        logging.info("Sending prompt to SEA-LION API...")
        completion = client.chat.completions.create(
            model="aisingapore/Llama-SEA-LION-v3.5-8B-R",
            messages=messages,
            extra_body={
                "chat_template_kwargs": {
                    "thinking_mode": "off"
                },
                "cache": {
                    "no-cache": True
                }
            },
            max_tokens=512
        )
        answer = completion.choices[0].message.content.strip()
    except Exception as e:
        logging.error(f"Error calling SEA-LION API: {e}")
        answer = "Sorry, something went wrong when contacting the language model."

    return answer, context_chunks
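

# Minimal usage sketch, assuming PDFs are already placed under DATA_PATH and
# SEALION_API_KEY is configured; the sample question is purely illustrative.
if __name__ == "__main__":
    generate_data_store()
    answer, sources = ask_question("What topics does this document cover?")
    print("Answer:", answer)
    print("Sources:")
    for chunk in sources:
        print(f'- {chunk["filename"]} (page {chunk["page"]})')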