LIDA2and1_csv

Sleeping

App Files Files Community

LIDA2and1_csv / app.py

Anne31415

Update app.py

161353e about 2 years ago

raw

history blame

6.27 kB

	import os
	import streamlit.components.v1 as components
	from datasets import load_dataset
	import random
	import pickle
	from nltk.tokenize import sent_tokenize
	import nltk
	from PyPDF2 import PdfReader
	import streamlit as st
	from streamlit_extras.add_vertical_space import add_vertical_space
	from langchain.text_splitter import RecursiveCharacterTextSplitter
	from langchain.embeddings.openai import OpenAIEmbeddings
	from langchain.vectorstores import FAISS
	from langchain.llms import OpenAI
	from langchain.chains.question_answering import load_qa_chain
	from langchain.callbacks import get_openai_callback
	from my_component import my_component

	# Download the NLTK 'punkt' resource when the script runs.
	# Presumably required by sent_tokenize, which the sentence helpers in this file call — TODO confirm.
	nltk.download('punkt')



	# Sidebar contents
	# NOTE(review): the indentation in this listing is flattened, so it is not visible
	# which of the statements below belong to the `with st.sidebar:` body — confirm
	# against the original file before re-indenting.
	with st.sidebar:
	st.title(':orange_book: BinDoc GmbH')


	api_key = os.getenv("OPENAI_API_KEY")
	# The API key is read from the OPENAI_API_KEY environment variable (not from st.secrets).
	# It is only checked for presence here; nothing in this section passes it on explicitly.


	if not api_key:
	st.warning('API key is required to proceed.')
	st.stop() # Stop the app if the API key is not provided

	# Marketing blurb, emitted as one st.markdown call per line of text.
	st.markdown("Experience the future of document interaction with the revolutionary")
	st.markdown("BinDocs Chat App.")
	st.markdown("Harnessing the power of a Large Language Model and AI technology,")
	st.markdown("this innovative platform redefines PDF engagement,")
	st.markdown("enabling dynamic conversations that bridge the gap between")
	st.markdown("human and machine intelligence.")

	add_vertical_space(3) # Add more vertical space between text blocks
	st.write('Made with ❤️ by BinDoc GmbH')

	def load_pdf(file_path):
	    """Return a FAISS vector store for a PDF, using an on-disk pickle cache.

	    The cache file is ``<name without .pdf>.pkl`` in the current working
	    directory. On a cache hit the PDF is not parsed at all; on a miss the
	    text of every page is embedded and the resulting store is cached.

	    Args:
	        file_path: Object accepted by ``PdfReader``. If it has a ``name``
	            attribute (as the uploaded file passed in by ``main`` does),
	            that name is used for the cache file; otherwise the value
	            itself is treated as the name.

	    Returns:
	        The vector store (freshly built or unpickled from the cache).

	    Raises:
	        ValueError: If no page of the PDF yields any text.
	    """
	    source_name = str(getattr(file_path, "name", file_path))
	    # Strip a trailing ".pdf" (any case) rather than blindly cutting four characters.
	    if source_name.lower().endswith(".pdf"):
	        store_name = source_name[:-4]
	    else:
	        store_name = source_name
	    cache_path = f"{store_name}.pkl"

	    # Check the cache first so a hit does not pay for parsing the whole PDF.
	    # NOTE(review): the cache key is the file name only, so a different PDF
	    # uploaded under the same name gets the stale store.
	    if os.path.exists(cache_path):
	        # NOTE(security): pickle.load executes arbitrary code from the file;
	        # only safe while the .pkl files are written exclusively by this app.
	        with open(cache_path, "rb") as f:
	            return pickle.load(f)

	    pdf_reader = PdfReader(file_path)
	    page_texts = (page.extract_text() for page in pdf_reader.pages)
	    # Pages without extractable text are skipped.
	    chunks = [text for text in page_texts if text]
	    if not chunks:
	        # Fail with a clear message instead of sending an empty list to the embedder.
	        raise ValueError(f"No extractable text found in {source_name!r}")

	    embeddings = OpenAIEmbeddings()
	    vector_store = FAISS.from_texts(chunks, embedding=embeddings)
	    with open(cache_path, "wb") as f:
	        pickle.dump(vector_store, f)

	    return vector_store

	def load_chatbot(max_tokens=300):
	    """Create the question-answering chain used to answer PDF questions.

	    Args:
	        max_tokens: Value forwarded as ``max_tokens`` to the ``OpenAI`` LLM.

	    Returns:
	        The chain produced by ``load_qa_chain`` with ``chain_type="stuff"``.
	    """
	    language_model = OpenAI(temperature=0.1, max_tokens=max_tokens)
	    qa_chain = load_qa_chain(llm=language_model, chain_type="stuff")
	    return qa_chain


	def display_chat_history(chat_history):
	    """Render each chat entry as a coloured, rounded HTML box.

	    Args:
	        chat_history: Iterable of entries indexable as ``[0]`` sender,
	            ``[1]`` message text and ``[2]`` status. Status ``"new"`` gets
	            the highlight colour; otherwise the colour depends on whether
	            the sender is ``"User"``.
	    """
	    from html import escape  # local import keeps this block self-contained

	    for chat in chat_history:
	        sender, message, status = chat[0], chat[1], chat[2]
	        if status == "new":
	            background_color = "#FFA07A"
	        elif sender == "User":
	            background_color = "#acf"
	        else:
	            background_color = "#caf"
	        # The markup is emitted with unsafe_allow_html=True, so user- and
	        # model-supplied text must be escaped or it is interpreted as HTML.
	        safe_sender = escape(str(sender))
	        safe_message = escape(str(message))
	        st.markdown(
	            f"<div style='background-color: {background_color}; padding: 10px; border-radius: 10px; margin: 10px;'>{safe_sender}: {safe_message}</div>",
	            unsafe_allow_html=True,
	        )

	def remove_incomplete_sentences(text):
	    """Drop sentences that do not end with terminal punctuation.

	    A sentence counts as complete when it ends with ``.``, ``!`` or ``?``,
	    optionally followed by closing quotes or brackets (e.g. ``"Stop."``).

	    Args:
	        text: Text to filter; it is split with ``sent_tokenize``.

	    Returns:
	        The complete sentences joined with single spaces. If no sentence
	        qualifies (e.g. a short answer without punctuation, or a single
	        truncated sentence), the stripped input is returned unchanged so the
	        caller never ends up with an empty answer for non-empty input.
	    """
	    terminators = ('.', '!', '?')
	    trailing_closers = '"\')]}\u201d\u2019\u00bb'

	    complete_sentences = [
	        sentence
	        for sentence in sent_tokenize(text)
	        if sentence.rstrip().rstrip(trailing_closers).endswith(terminators)
	    ]
	    if not complete_sentences:
	        # Filtering would erase the whole answer; keep the raw text instead.
	        return text.strip()
	    return ' '.join(complete_sentences)

	def remove_redundant_information(text):
	    """Remove repeated sentences while keeping the original sentence order.

	    Args:
	        text: Text to de-duplicate; it is split with ``sent_tokenize``.

	    Returns:
	        The first occurrence of each distinct sentence, in the order they
	        appear in ``text``, joined with single spaces.
	    """
	    sentences = sent_tokenize(text)
	    # dict.fromkeys keeps insertion order; a plain set() would return the
	    # sentences in arbitrary order and scramble the answer.
	    unique_sentences = dict.fromkeys(sentences)
	    return ' '.join(unique_sentences)

	# Maximum token limit (originally described as a guard against infinite loops).
	# NOTE(review): nothing in the visible code reads this constant and there is no
	# loop it could bound; main() passes max_tokens=120 directly — confirm whether it is still needed.
	MAX_TOKEN_LIMIT = 400

	import random


	def main():
	    """Render the chat page and answer one question per script run.

	    Flow: show the stored chat history, offer a PDF uploader and a question
	    box, and when a question is submitted (button click or changed input)
	    build/load the vector store, query the QA chain, post-process the answer
	    and show the new question/answer pair. While the history is empty,
	    example questions are shown.
	    """
	    from html import escape  # local import keeps this block self-contained

	    st.title("BinDocs Chat App")

	    # Step 1: Adding CSS for rounded boxes
	    st.markdown("""
	<style>
	.question-box {
	border: 1px solid orange;
	border-radius: 15px;
	padding: 10px;
	text-align: center;
	cursor: pointer;
	display: inline-block;
	width: 45%;
	margin: 2%;
	}
	</style>
	""", unsafe_allow_html=True)

	    if "chat_history" not in st.session_state:
	        st.session_state['chat_history'] = []

	    display_chat_history(st.session_state['chat_history'])

	    # Reserved slot, placed right after the history, for this run's new messages.
	    new_messages_placeholder = st.empty()

	    pdf = st.file_uploader("Upload your PDF", type="pdf")

	    query = st.text_input("Ask questions about your PDF file (in any preferred language):")

	    # The button is always evaluated first, as in the original `or` expression.
	    ask_clicked = st.button("Ask")
	    input_changed = bool(query) and query != st.session_state.get('last_input', '')

	    if ask_clicked or input_changed:
	        if pdf is None:
	            st.warning("Please upload a PDF file before asking questions.")
	        elif not query.strip():
	            # Clicking "Ask" with an empty box used to send an empty question to the model.
	            st.warning("Please enter a question before asking.")
	        else:
	            st.session_state['last_input'] = query
	            st.session_state['chat_history'].append(("User", query, "new"))

	            loading_message = st.empty()
	            loading_message.text('Bot is thinking...')
	            try:
	                vector_store = load_pdf(pdf)
	                chain = load_chatbot(max_tokens=120)
	                docs = vector_store.similarity_search(query=query, k=2)

	                with get_openai_callback():
	                    response = chain.run(input_documents=docs, question=query)
	            finally:
	                # Clear the indicator even when loading or querying fails.
	                loading_message.empty()

	            # Post-processing to remove incomplete sentences and redundant information
	            filtered_response = remove_incomplete_sentences(response)
	            filtered_response = remove_redundant_information(filtered_response)

	            st.session_state['chat_history'].append(("Bot", filtered_response, "new"))

	            # Build the markup for both new messages and write it in ONE call:
	            # each .markdown() on the placeholder replaces its previous content,
	            # so calling it per message left only the last message visible.
	            rendered_messages = []
	            for chat in st.session_state['chat_history'][-2:]:
	                if chat[2] == "new":
	                    background_color = "#FFA07A"
	                elif chat[0] == "User":
	                    background_color = "#acf"
	                else:
	                    background_color = "#caf"
	                # Escape user/model text because the markup is emitted as raw HTML.
	                rendered_messages.append(
	                    f"<div style='background-color: {background_color}; padding: 10px; border-radius: 10px; margin: 10px;'>{escape(str(chat[0]))}: {escape(str(chat[1]))}</div>"
	                )
	            new_messages_placeholder.markdown("\n".join(rendered_messages), unsafe_allow_html=True)

	            # NOTE(review): no element with id 'response' is created in this file,
	            # so this scroll request is presumably a no-op — confirm or remove.
	            st.write("<script>document.getElementById('response').scrollIntoView();</script>", unsafe_allow_html=True)

	    # Everything shown in this run loses its highlight on the next rerun.
	    # NOTE(review): the nesting of this statement was not recoverable from the
	    # flattened listing; it is idempotent, so running it on every pass is safe.
	    st.session_state['chat_history'] = [(sender, msg, "old") for sender, msg, _ in st.session_state['chat_history']]

	    # Displaying example questions
	    if not st.session_state['chat_history']:
	        st.markdown("""
	<div class="question-box" id="question1">Was genau ist ein Belegarzt?</div>
	<div class="question-box" id="question2">Wofür wird die Alpha-ID verwendet?</div>
	<br>
	<div class="question-box" id="question3">Was sind die Vorteile des ambulanten operierens?</div>
	""", unsafe_allow_html=True)

	    # NOTE(review): placed at function level; the flattened listing does not show
	    # whether this call belonged to the example-questions branch — confirm.
	    my_component()


	if __name__ == "__main__":
	    main()