Update app.py
app.py CHANGED
@@ -56,7 +56,7 @@ repo2 = Repository(
     clone_from="Anne31415/Chat_Store", # Replace with your repository URL
     token=os.environ["HUB_TOKEN"] # Use the secret token to authenticate
 )
-
+repo2.git_pull() # Pull the latest changes (if any)


 # Step 2: Load the PDF File
@@ -69,16 +69,6 @@ pdf_path3 = "Private_Book/Kosten_Strukturdaten_RAG_vorbereited.pdf"
 api_key = os.getenv("OPENAI_API_KEY")
 # Retrieve the API key from st.secrets

-@st.cache_data
-def extract_text_from_pdf(pdf_path):
-    text = ""
-    reader = PdfReader(pdf_path)
-    for page in reader.pages:
-        text += page.extract_text() + " " # Concatenate text from each page
-    return text
-
-# Use the function to get pdf_text
-pdf_text = extract_text_from_pdf(pdf_path3)


 @st.cache_resource
@@ -126,6 +116,8 @@ def load_vector_store(file_path, store_name, force_reload=False):
     return VectorStore


+
+
 # Utility function to load text from a PDF
 def load_pdf_text(file_path):
     pdf_reader = PdfReader(file_path)
@@ -134,6 +126,22 @@ def load_pdf_text(file_path):
         text += page.extract_text() or "" # Add fallback for pages where text extraction fails
     return text

+
+
+# Utility function to load text from a PDF and split it into pages
+def load_pdf_text_by_page(file_path):
+    pdf_reader = PdfReader(file_path)
+    pages_text = []
+    for page in pdf_reader.pages:
+        # Extract text for each page and add it to the list
+        page_text = page.extract_text() or "" # Add fallback for pages where text extraction fails
+        pages_text.append(page_text)
+    return pages_text
+
+# Use the new function to get a list of texts, each representing a page
+pdf_pages = load_pdf_text_by_page(pdf_path3)
+
+
 def load_chatbot():
     #return load_qa_chain(llm=OpenAI(), chain_type="stuff")
     return load_qa_chain(llm=OpenAI(model_name="gpt-3.5-turbo-instruct"), chain_type="stuff")
@@ -245,7 +253,17 @@ def display_session_id():
     session_id = st.session_state['session_id']
     st.sidebar.markdown(f"**Ihre Session ID:** `{session_id}`")
     st.sidebar.markdown("Verwenden Sie diese ID als Referenz bei Mitteilungen oder Rückmeldungen.")
+
+def preprocess_and_store_pdf_text(pdf_path, collection, text_splitter):
+
+    # Load and split the PDF text
+    text = load_pdf_text(pdf_path)
+    chunks = text_splitter.split_text(text=text)

+    # Store each chunk as a separate document in CromA DB
+    for i, chunk in enumerate(chunks):
+        document_id = f"Chunk_{i+1}"
+        collection.add(documents=[chunk], ids=[document_id])

 def page1():
     try:
@@ -489,11 +507,19 @@ def page2():



+
+
 def page3():
     try:
         # Basic layout setup
         st.title("Kosten- und Strukturdaten der Krankenhäuser")

+
+        # Initialize text splitter
+        text_splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=200, length_function=len)
+
+
+
         # Initialize CromA client and handle collection
         chroma_client = chromadb.Client()
         try:
@@ -506,10 +532,7 @@ def page3():

         # Add documents to the collection if not already done
         if "documents_added" not in st.session_state:
-            collection
-            documents=[pdf_text],
-            ids=[("Kosten_Strukturdaten0602204")]
-            )
+            preprocess_and_store_pdf_text(pdf_path3, collection, text_splitter)
             st.session_state["documents_added"] = True

         # Display chat history
@@ -522,25 +545,14 @@ def page3():
             full_query = ask_bot(query)
             st.session_state['chat_history_page3'].append(("User", query, "new"))

+            # Query the CromA collection with error handling
+            try:
+                results = collection.query(query_texts=[full_query], n_results=5)
+                response = process_croma_results(results)
+            except Exception as query_exception:
+                log_error(f"CromA DB query error: {query_exception}") # Logging function to be implemented
+                response = "An error occurred while processing your query."

-            # Query the CromA collection
-            results = collection.query(
-                query_texts=[full_query],
-                n_results=5
-            )
-
-            # Process and display response from CromA results
-            if results and results['documents']:
-                try:
-                    # Accessing the first document of the first result
-                    top_document = results['documents'][0][0] # Adjusted access
-                    response = f"Top result: {top_document}"
-                except KeyError as ke:
-                    st.error(f"KeyError encountered: {ke}")
-                    response = "Error in processing the response."
-            else:
-                response = "No results found for your query."
-
             st.session_state['chat_history_page3'].append(("Eve", response, "new"))


@@ -551,7 +563,38 @@ def page3():
             st.markdown(f"<div style='background-color: {background_color}; padding: 10px; border-radius: 10px; margin: 10px;'>{chat[0]}: {chat[1]}</div>", unsafe_allow_html=True)

     except Exception as e:
-
+        log_error(f"General error in page3: {e}") # Log general errors
+        st.error(f"An unexpected error occurred: {repr(e)}")
+
+def log_error(message):
+    """
+    Logs an error message. Can be enhanced to write to a file or external logging service.
+    """
+    # Example: Print to console, can be replaced with file logging or external service logging
+    print(message)
+
+def process_croma_results(results):
+    """
+    Process the query results from CromA DB and generate a response.
+    """
+    if results and results['documents']:
+        try:
+            # Example processing: Extract and concatenate texts from top documents
+            top_documents = results['documents'][0] # Adjusted access
+            response_texts = [doc['text'] for doc in top_documents if 'text' in doc]
+            response = " ".join(response_texts[:3]) # Limiting to top 3 documents for brevity
+        except KeyError as ke:
+            response = "Error in processing the response."
+    else:
+        response = "No results found for your query."
+    return response
+
+# TODO: Implement additional error handling and logging
+# TODO: Review for security and performance improvements
+
+# This is a modified snippet focusing on the querying and response handling for CromA DB.
+# The full integration requires updating the main application code.
+


 def page4():