chatbot-pdf-gpt4key-langchain-chroma-prompttemp-tabs-dataframe-ocrmypdf-sqlite-csv-returns-json

Sleeping

App Files Files Community

lekkalar committed on Aug 23, 2023

Commit

bead70d

1 Parent(s): 928a91a

Update app.py

Browse files

Files changed (1) hide show

app.py +51 -48

app.py CHANGED Viewed

@@ -14,6 +14,55 @@ from langchain.chat_models import ChatOpenAI # the LLM model we'll use (ChatGPT)
 from langchain import PromptTemplate
 def create_db_connection():
     DB_FILE = "./questionset.db"
     connection = sqlite3.connect(DB_FILE,check_same_thread=False)
@@ -162,53 +211,6 @@ def add_questionset(data, document_type, tag_for_questionset):
     connection.commit()
     connection.close()
-def load_pdf_and_generate_embeddings(pdf_doc, relevant_pages):
-    os.environ['OPENAI_API_KEY'] = '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'
-    #OCR Conversion - skips conversion of pages that already contain text
-    pdf_doc = ocr_converter(pdf_doc)
-    #Load the pdf file
-    loader = OnlinePDFLoader(pdf_doc)
-    pages = loader.load_and_split()
-    print('pages loaded:', len(pages))
-    #Create an instance of OpenAIEmbeddings, which is responsible for generating embeddings for text
-    embeddings = OpenAIEmbeddings()
-    pages_to_be_loaded =[]
-    if relevant_pages:
-        page_numbers = relevant_pages.split(",")
-        if len(page_numbers) != 0:
-            for page_number in page_numbers:
-                if page_number.isdigit():
-                    pageIndex = int(page_number)-1
-                    if pageIndex >=0 and pageIndex <len(pages):
-                        pages_to_be_loaded.append(pages[pageIndex])
-    #In the scenario where none of the page numbers supplied exist in the PDF, we will revert to using the entire PDF.
-    if len(pages_to_be_loaded) ==0:
-        pages_to_be_loaded = pages.copy()
-    #To create a vector store, we use the Chroma class, which takes the documents (pages in our case) and the embeddings instance
-    vectordb = Chroma.from_documents(pages_to_be_loaded, embedding=embeddings)
-    #Finally, we create the bot using the RetrievalQA class
-    global pdf_qa
-    prompt_template = """Use the following pieces of context to answer the question at the end. If you do not know the answer, just return N/A. If you encounter a date, return it in mm/dd/yyyy format.
-    {context}
-    Question: {question}
-    Return just the answer. Provide the answer in the JSON format and extract the key from the question :"""
-    PROMPT = PromptTemplate(template=prompt_template, input_variables=["context", "question"])
-    chain_type_kwargs = {"prompt": PROMPT}
-    pdf_qa = RetrievalQA.from_chain_type(llm=ChatOpenAI(temperature=0, model_name="gpt-4"),chain_type="stuff", retriever=vectordb.as_retriever(search_kwargs={"k": 5}), chain_type_kwargs=chain_type_kwargs, return_source_documents=False)
-    return "Ready"
 def load_csv_and_store_questionset_into_sqlite(csv_file, document_type, tag_for_questionset):
     print('document type is:',document_type)
     print('tag_for_questionset is:',tag_for_questionset)
@@ -270,7 +272,7 @@ title = """
     <h1>Chatbot for PDFs - GPT-4</h1>
     <p style="text-align: center;">Upload a .PDF, click the "Upload PDF and generate embeddings" button, <br />
     Wait for the Status to show Ready. You can chose to get answers to the pre-defined question set OR ask your own question <br />
-    The app is built on GPT-4 and leverages PromptTemplate</p>
 </div>
 """
@@ -280,6 +282,7 @@ with gr.Blocks(css=css,theme=gr.themes.Monochrome()) as demo:
     with gr.Tab("Chatbot"):
         with gr.Column():
             pdf_doc = gr.File(label="Load a pdf",file_types=['.pdf'],type='file')
             relevant_pages = gr.Textbox(label="*Optional - List comma separated page numbers to load or leave this field blank to use the entire PDF")

 from langchain import PromptTemplate
def _select_pages(pages, relevant_pages):
    """Return the pages named in a comma-separated, 1-based page list.

    Tokens that are not plain digits or that fall outside the document are
    ignored. If nothing valid remains, a copy of the full page list is
    returned so the caller always has something to index.
    """
    selected = []
    for token in (relevant_pages or "").split(","):
        # Strip so that "1, 2" is treated the same as "1,2".
        token = token.strip()
        if not token.isdigit():
            continue
        page_index = int(token) - 1
        if 0 <= page_index < len(pages):
            selected.append(pages[page_index])
    return selected or list(pages)


def load_pdf_and_generate_embeddings(pdf_doc, open_ai_key, relevant_pages):
    """Index a PDF into a vector store and build the global ``pdf_qa`` chain.

    Args:
        pdf_doc: The uploaded PDF, passed through ``ocr_converter`` and then
            to ``OnlinePDFLoader``.
        open_ai_key: OpenAI API key entered by the user.
        relevant_pages: Optional comma-separated 1-based page numbers; when
            empty or when no number matches a page, the whole PDF is used.

    Returns:
        ``"Ready"`` once the chain is built, or a message asking for an API
        key when none was supplied.
    """
    global pdf_qa

    # A blank textbox is submitted as an empty string rather than None, so
    # test for emptiness instead of identity with None.
    api_key = (open_ai_key or "").strip()
    if not api_key:
        return "Please provide an OpenAI gpt-4 API key"

    os.environ['OPENAI_API_KEY'] = api_key
    # OCR Conversion - skips conversion of pages that already contain text
    pdf_doc = ocr_converter(pdf_doc)
    # Load the pdf file
    loader = OnlinePDFLoader(pdf_doc)
    pages = loader.load_and_split()
    print('pages loaded:', len(pages))
    # Create an instance of OpenAIEmbeddings, which is responsible for generating embeddings for text
    embeddings = OpenAIEmbeddings()
    # Falls back to the entire PDF when none of the supplied page numbers exist.
    pages_to_be_loaded = _select_pages(pages, relevant_pages)
    # To create a vector store, we use the Chroma class, which takes the documents (pages in our case) and the embeddings instance
    vectordb = Chroma.from_documents(pages_to_be_loaded, embedding=embeddings)
    # Prompt text (including its embedded indentation) is kept exactly as in
    # the original triple-quoted literal.
    prompt_template = (
        "Use the following pieces of context to answer the question at the end. If you do not know the answer, just return N/A. If you encounter a date, return it in mm/dd/yyyy format.\n"
        "        {context}\n"
        "        Question: {question}\n"
        "        Return just the answer. Provide the answer in the JSON format and extract the key from the question :"
    )
    PROMPT = PromptTemplate(template=prompt_template, input_variables=["context", "question"])
    chain_type_kwargs = {"prompt": PROMPT}
    # Finally, we create the bot using the RetrievalQA class
    pdf_qa = RetrievalQA.from_chain_type(
        llm=ChatOpenAI(temperature=0, model_name="gpt-4"),
        chain_type="stuff",
        retriever=vectordb.as_retriever(search_kwargs={"k": 5}),
        chain_type_kwargs=chain_type_kwargs,
        return_source_documents=False,
    )
    return "Ready"
 def create_db_connection():
     DB_FILE = "./questionset.db"
     connection = sqlite3.connect(DB_FILE,check_same_thread=False)
     connection.commit()
     connection.close()
 def load_csv_and_store_questionset_into_sqlite(csv_file, document_type, tag_for_questionset):
     print('document type is:',document_type)
     print('tag_for_questionset is:',tag_for_questionset)
     <h1>Chatbot for PDFs - GPT-4</h1>
     <p style="text-align: center;">Upload a .PDF, click the "Upload PDF and generate embeddings" button, <br />
     Wait for the Status to show Ready. You can chose to get answers to the pre-defined question set OR ask your own question <br />
+    The app is built on GPT-4 and leverages the magic of PromptTemplate</p>
 </div>
 """
     with gr.Tab("Chatbot"):
         with gr.Column():
+            openai_key = gr.Textbox(label="Your GPT-4 OpenAI API key", type="password")
             pdf_doc = gr.File(label="Load a pdf",file_types=['.pdf'],type='file')
             relevant_pages = gr.Textbox(label="*Optional - List comma separated page numbers to load or leave this field blank to use the entire PDF")