Spaces:
Sleeping
Sleeping
| import streamlit as st | |
| import pdfplumber | |
| from transformers import AutoTokenizer, AutoModelForQuestionAnswering | |
| import torch | |
| from langchain.text_splitter import RecursiveCharacterTextSplitter | |
| from langchain.vectorstores import Chroma | |
| from langchain.chains import ConversationalRetrievalChain | |
| from langchain.memory import ConversationBufferMemory | |
# Load the ThaiBERT (WangchanBERTa) tokenizer and question-answering model
# from the Hugging Face Hub.
QA_MODEL_NAME = "airesearch/wangchanberta-base-att-spm-uncased"


@st.cache_resource
def _load_qa_components(model_name):
    """Load the question-answering tokenizer and model, cached by Streamlit.

    Streamlit re-executes the whole script on every widget interaction;
    ``st.cache_resource`` returns the already-loaded objects on later reruns
    instead of loading the checkpoint again each time.

    Args:
        model_name: Hugging Face Hub identifier of the checkpoint to load.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    qa_tokenizer = AutoTokenizer.from_pretrained(model_name)
    qa_model = AutoModelForQuestionAnswering.from_pretrained(model_name)
    return qa_tokenizer, qa_model


# NOTE(review): this looks like a base (pre-trained) checkpoint rather than one
# fine-tuned for extractive QA -- confirm that it ships question-answering head
# weights; otherwise the head is freshly initialised and answers are unreliable.
tokenizer, model = _load_qa_components(QA_MODEL_NAME)
# Read the text content of a PDF.
def extract_text_from_pdf(pdf_file):
    """Return the text of every page in *pdf_file* as a single string.

    Args:
        pdf_file: Whatever ``pdfplumber.open`` accepts; the app passes the
            object returned by Streamlit's file uploader.

    Returns:
        The non-empty page texts joined with newlines, or an empty string
        when no page yields any text.
    """
    with pdfplumber.open(pdf_file) as pdf:
        # extract_text() may yield None for a page without extractable text
        # (NOTE(review): depends on the installed pdfplumber version), which
        # made the original `text += ...` raise TypeError; `or ""` covers it.
        page_texts = [page.extract_text() or "" for page in pdf.pages]
    # Join with a newline so the last word of one page is not glued to the
    # first word of the next page.
    return "\n".join(chunk for chunk in page_texts if chunk)
# Answer a question with ThaiBERT (extractive question answering).
def answer_question(question, context, max_length=416):
    """Extract the most likely answer span for *question* from *context*.

    Uses the module-level ``tokenizer`` and ``model``.

    Args:
        question: The question text.
        context: The passage the answer is extracted from.
        max_length: Maximum number of tokens for the encoded
            question + context pair; the context is truncated to fit.
            NOTE(review): 416 is assumed to be the sequence length
            WangchanBERTa was trained with -- confirm against the model card.

    Returns:
        The predicted answer text. It is an empty string when the predicted
        end position falls before the predicted start position.
    """
    # Truncate only the context (second sequence) so the question is always
    # kept intact and the input never exceeds the model's position limit.
    inputs = tokenizer(
        question,
        context,
        truncation="only_second",
        max_length=max_length,
        return_tensors="pt",
    )
    # Inference only: skip gradient tracking.
    with torch.no_grad():
        outputs = model(**inputs)
    # The model returns ONE output object carrying both logit tensors; the
    # original tuple-unpacked it and then read `.start_logits`/`.end_logits`
    # on the unpacked pieces, which fails.
    answer_start = int(torch.argmax(outputs.start_logits))
    answer_end = int(torch.argmax(outputs.end_logits)) + 1  # slice end is exclusive
    answer_ids = inputs["input_ids"][0][answer_start:answer_end]
    answer_tokens = tokenizer.convert_ids_to_tokens(answer_ids)
    return tokenizer.convert_tokens_to_string(answer_tokens)
# Build the Streamlit web interface: upload a PDF, index its text, then answer
# questions about it with answer_question().
EMBEDDING_MODEL_NAME = "sentence-transformers/paraphrase-xlm-r-multilingual-v1"


class _MeanPooledEmbeddings:
    """Adapter exposing ``embed_documents`` / ``embed_query`` for an encoder.

    LangChain vector stores call these two methods on the object passed as
    ``embedding``; a bare ``transformers`` model does not provide them.
    """

    def __init__(self, model_name, batch_size=16):
        # Function-scope import: AutoModel is not in the file-level imports.
        from transformers import AutoModel

        # Kept on the instance so the module-level QA `tokenizer` that
        # answer_question() relies on is never rebound.
        self._tokenizer = AutoTokenizer.from_pretrained(model_name)
        self._encoder = AutoModel.from_pretrained(model_name)
        self._batch_size = batch_size

    def _encode(self, texts):
        """Return one embedding (list of floats) per string in *texts*."""
        vectors = []
        for start in range(0, len(texts), self._batch_size):
            batch = texts[start:start + self._batch_size]
            encoded = self._tokenizer(
                batch, padding=True, truncation=True, return_tensors="pt"
            )
            with torch.no_grad():
                hidden = self._encoder(**encoded).last_hidden_state
            # Mean-pool over real (non-padding) tokens only.
            # NOTE(review): mean pooling is assumed to be the pooling this
            # sentence-transformers checkpoint expects -- confirm on its card.
            mask = encoded["attention_mask"].unsqueeze(-1).to(hidden.dtype)
            summed = (hidden * mask).sum(dim=1)
            counts = mask.sum(dim=1).clamp(min=1e-9)
            vectors.extend((summed / counts).tolist())
        return vectors

    def embed_documents(self, texts):
        """Embed a sequence of document chunks."""
        return self._encode(list(texts))

    def embed_query(self, text):
        """Embed a single query string."""
        return self._encode([text])[0]


@st.cache_resource
def _load_embeddings():
    """Create the embedding adapter once and reuse it across script reruns."""
    return _MeanPooledEmbeddings(EMBEDDING_MODEL_NAME)


def _build_vector_store(pdf_text):
    """Split *pdf_text* into chunks and index them in a new Chroma collection.

    Returns:
        The Chroma vector store, or ``None`` when the text yields no chunks.
    """
    import uuid

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
    docs = text_splitter.create_documents([pdf_text])
    if not docs:
        return None
    # A unique collection name per upload keeps chunks of a previously indexed
    # PDF out of the search results.
    # NOTE(review): only matters if the in-memory Chroma client is shared
    # between instances -- confirm for the installed chromadb version.
    return Chroma.from_documents(
        documents=docs,
        embedding=_load_embeddings(),
        collection_name=f"pdf-{uuid.uuid4().hex}",
    )


st.title("ThaiBERT PDF QA System")
uploaded_file = st.file_uploader("Upload a PDF", type="pdf")
if uploaded_file:
    # Streamlit reruns this script on every interaction, so the index is kept
    # in session state and rebuilt only when a different file is uploaded.
    file_key = (uploaded_file.name, uploaded_file.size)
    if st.session_state.get("indexed_file") != file_key:
        # Read the PDF content and index it.
        pdf_text = extract_text_from_pdf(uploaded_file)
        st.session_state["vector_store"] = _build_vector_store(pdf_text)
        st.session_state["indexed_file"] = file_key
    vector_store = st.session_state["vector_store"]

    if vector_store is None:
        st.warning("No extractable text was found in this PDF.")
    else:
        # Input box for the user's question.
        user_question = st.text_input("Ask a question about the PDF content")
        if user_question:
            # No LLM is configured, so a ConversationalRetrievalChain cannot be
            # built; retrieve the closest chunks and let the extractive QA
            # model pick the answer span from them instead.
            matches = vector_store.similarity_search(user_question)
            context = "\n".join(doc.page_content for doc in matches)
            response = answer_question(user_question, context)
            st.write("Answer:", response)