Spaces:

Pixeltable
/

AI-Chatbot-With-Retrieval-Augmented-Generation

Running

App Files Files Community

PierreBrunelle committed on Oct 7, 2024

Commit

e1aa0dd

verified ·

1 Parent(s): 514c787

Create app.py

Browse files

Files changed (1) hide show

app.py +167 -0

app.py ADDED Viewed

	@@ -0,0 +1,167 @@

+import gradio as gr
+import pandas as pd
+import io
+import base64
+import uuid
+import pixeltable as pxt
+from pixeltable.iterators import DocumentSplitter
+import numpy as np
+from pixeltable.functions.huggingface import sentence_transformer
+from pixeltable.functions import openai
+from gradio.themes import Monochrome
+import os
+import getpass
+# Store API keys
+if 'OPENAI_API_KEY' not in os.environ:
+    os.environ['OPENAI_API_KEY'] = getpass.getpass('OpenAI API key:')
+# Set up embedding function
+@pxt.expr_udf
+def e5_embed(text: str) -> np.ndarray:
+    return sentence_transformer(text, model_id='intfloat/e5-large-v2')
+# Create prompt function
+@pxt.udf
+def create_prompt(top_k_list: list[dict], question: str) -> str:
+    concat_top_k = '\n\n'.join(
+        elt['text'] for elt in reversed(top_k_list)
+    )
+    return f'''
+    PASSAGES:
+    {concat_top_k}
+    QUESTION:
+    {question}'''
+def process_files(pdf_files, chunk_limit, chunk_separator):
+    # Initialize Pixeltable
+    pxt.drop_dir('chatbot_demo', force=True)
+    pxt.create_dir('chatbot_demo')
+    # Create a table to store the uploaded PDF documents
+    t = pxt.create_table(
+    'chatbot_demo.documents',
+    {'document': pxt.DocumentType(nullable=True),
+     'question': pxt.StringType(nullable=True)}
+    )
+    # Insert the PDF files into the documents table
+    t.insert({'document': file.name} for file in pdf_files if file.name.endswith('.pdf'))
+    # Create a view that splits the documents into smaller chunks
+    chunks_t = pxt.create_view(
+        'chatbot_demo.chunks',
+        t,
+        iterator=DocumentSplitter.create(
+            document=t.document,
+            separators=chunk_separator,
+            limit=chunk_limit if chunk_separator in ["token_limit", "char_limit"] else None,
+            metadata='title,heading,sourceline'
+        )
+    )
+    # Add an embedding index to the chunks for similarity search
+    chunks_t.add_embedding_index('text', string_embed=e5_embed)
+    try:
+      @chunks_t.query
+      def top_k(query_text: str):
+          sim = chunks_t.text.similarity(query_text)
+          return (
+              chunks_t.order_by(sim, asc=False)
+                  .select(chunks_t.text, sim=sim)
+                  .limit(5)
+          )
+    except Exception:
+      pass
+     # Add computed columns to the table for context retrieval and prompt creation
+    t['question_context'] = chunks_t.top_k(t.question)
+    t['prompt'] = create_prompt(
+        t.question_context, t.question
+    )
+    # Prepare messages for the API
+    msgs = [
+        {
+            'role': 'system',
+            'content': 'Read the following passages and answer the question based on their contents.'
+        },
+        {
+            'role': 'user',
+            'content': t.prompt
+        }
+    ]
+    # Add OpenAI response column
+    t['response'] = openai.chat_completions(
+        model='gpt-4o-mini-2024-07-18',
+        messages=msgs,
+        max_tokens=300,
+        top_p=0.9,
+        temperature=0.7
+    )
+    # Extract the answer text from the API response
+    t['gpt4omini'] = t.response.choices[0].message.content
+    return "Files processed successfully!"
+def get_answer(msg):
+    t = pxt.get_table('chatbot_demo.documents')
+    chunks_t = pxt.get_table('chatbot_demo.chunks')
+    # Insert the question into the table
+    t.insert([{'question': msg}])
+    answer = t.select(t.gpt4omini).tail(1)['gpt4omini'][0]
+    return answer
+# Gradio interface
+with gr.Blocks(theme=Monochrome()) as demo:
+    # Page header: logo and title
+    gr.Markdown(
+        """
+        <div>
+            <img src="https://raw.githubusercontent.com/pixeltable/pixeltable/main/docs/source/data/pixeltable-logo-large.png" alt="Pixeltable" style="max-width: 200px; margin-bottom: 20px;" />
+            <h1 style="margin-bottom: 0.5em;">AI Chatbot With Retrieval-Augmented Generation (RAG)</h1>
+        </div>
+        """
+    )
+    # Short description of Pixeltable with a link to the project
+    gr.HTML(
+        """
+        <p>
+            <a href="https://github.com/pixeltable/pixeltable" target="_blank" style="color: #F25022; text-decoration: none; font-weight: bold;">Pixeltable</a> is a declarative interface for working with text, images, embeddings, and even video, enabling you to store, transform, index, and iterate on data.
+        </p>
+        """
+    )
+    with gr.Row():
+        # First column: document upload and chunking controls
+        with gr.Column():
+            pdf_files = gr.File(label="Upload PDF Documents", file_count="multiple")
+            chunk_limit = gr.Slider(minimum=100, maximum=500, value=300, step=5, label="Chunk Size Limit (only used when the separator is token_/char_limit)")
+            chunk_separator = gr.Dropdown(
+                choices=["token_limit", "char_limit", "sentence", "paragraph", "heading"],
+                value="token_limit",
+                label="Chunk Separator"
+            )
+            process_button = gr.Button("Process Files")
+            process_output = gr.Textbox(label="Processing Output")
+        # Second column: chat history and question input
+        with gr.Column():
+            chatbot = gr.Chatbot(label="Chat History")
+            msg = gr.Textbox(label="Your Question")
+            submit = gr.Button("Submit")
+    def respond(message, chat_history):
+        """Answer `message` via `get_answer` and append the exchange to `chat_history`.
+
+        Returns an empty string (written back to the question box, clearing it)
+        and the updated chat history.
+        """
+        bot_message = get_answer(message)
+        chat_history.append((message, bot_message))
+        return "", chat_history
+    # Wire the buttons to their handlers
+    submit.click(respond, inputs=[msg, chatbot], outputs=[msg, chatbot])
+    process_button.click(process_files, inputs=[pdf_files, chunk_limit, chunk_separator], outputs=[process_output])
+# Launch the app only when this file is run as a script
+if __name__ == "__main__":
+    demo.launch(debug=True)