Spaces:
Sleeping
Sleeping
Pranjal Gupta
committed on
Commit
·
1d93192
1
Parent(s):
6c68252
pdf uploader
Browse files
app.py
CHANGED
|
@@ -11,14 +11,14 @@ from langchain_core.output_parsers import StrOutputParser
|
|
| 11 |
from langchain_ollama import ChatOllama
|
| 12 |
from langchain_core.documents import Document
|
| 13 |
from langchain_community.llms import HuggingFacePipeline
|
| 14 |
-
|
|
|
|
| 15 |
|
| 16 |
# Initialize in-memory ChromaDB client
|
| 17 |
-
# This client runs entirely within the app.py script.
|
| 18 |
client = chromadb.Client()
|
| 19 |
|
| 20 |
# Load your embeddings model
|
| 21 |
-
model_kwargs = {"device": "cpu"}
|
| 22 |
encode_kwargs = {"normalize_embeddings": True}
|
| 23 |
embeddings = HuggingFaceEmbeddings(
|
| 24 |
model_name="sentence-transformers/paraphrase-distilroberta-base-v1",
|
|
@@ -27,18 +27,28 @@ embeddings = HuggingFaceEmbeddings(
|
|
| 27 |
)
|
| 28 |
|
| 29 |
# Initialize the vector DB using the in-memory client
|
| 30 |
-
# You'll need to embed your documents here. In a real-world app, you'd load them from a file.
|
| 31 |
-
# For a demo, let's create a dummy document.
|
| 32 |
vectorDB = Chroma(
|
| 33 |
client=client,
|
| 34 |
collection_name="embeddings",
|
| 35 |
embedding_function=embeddings,
|
| 36 |
)
|
| 37 |
-
# Example of adding a document. You would replace this with your actual documents.
|
| 38 |
-
sample_doc = "This is a sample document about the history of artificial intelligence. It was created to demonstrate the RAG pipeline."
|
| 39 |
-
vectorDB.add_documents([Document(page_content=sample_doc, metadata={"docId": "my_doc_id"})])
|
| 40 |
|
| 41 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 42 |
def using_ollama_model(retriever, query, results, conversation_history):
|
| 43 |
history_text = ""
|
| 44 |
for item in conversation_history:
|
|
@@ -60,8 +70,7 @@ def using_ollama_model(retriever, query, results, conversation_history):
|
|
| 60 |
)
|
| 61 |
|
| 62 |
doc_texts = "\\n".join([doc.page_content for doc in results])
|
| 63 |
-
model_id = "meta-llama/Llama-3.2-1B-Instruct"
|
| 64 |
-
# llm = ChatOllama(model="llama3.2", temperature=0.4, num_predict=512)
|
| 65 |
tokenizer = AutoTokenizer.from_pretrained(model_id)
|
| 66 |
model = AutoModelForCausalLM.from_pretrained(model_id)
|
| 67 |
pipe = pipeline(
|
|
@@ -75,10 +84,8 @@ def using_ollama_model(retriever, query, results, conversation_history):
|
|
| 75 |
repetition_penalty=1.2
|
| 76 |
)
|
| 77 |
|
| 78 |
-
# Use the pipeline with LangChain's HuggingFacePipeline
|
| 79 |
llm = HuggingFacePipeline(pipeline=pipe)
|
| 80 |
|
| 81 |
-
|
| 82 |
rag_chain = template | llm | StrOutputParser()
|
| 83 |
|
| 84 |
answer = rag_chain.invoke({"history": history_text, "results": doc_texts, "query": query})
|
|
@@ -107,22 +114,48 @@ def retrievingReponse(docId, query, conversation_history):
|
|
| 107 |
llm_result = using_ollama_model(retriever, query, results, conversation_history)
|
| 108 |
return llm_result
|
| 109 |
|
| 110 |
-
# Gradio
|
| 111 |
-
def gradio_rag_wrapper(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 112 |
rag_history = []
|
| 113 |
for user_msg, bot_msg in history:
|
| 114 |
-
|
|
|
|
|
|
|
|
|
|
| 115 |
|
| 116 |
-
docId = "
|
| 117 |
-
response = retrievingReponse(docId,
|
| 118 |
|
| 119 |
return response
|
| 120 |
|
|
|
|
| 121 |
demo = gr.ChatInterface(
|
| 122 |
fn=gradio_rag_wrapper,
|
|
|
|
| 123 |
title="Contextual RAG Chatbot on Hugging Face Spaces",
|
| 124 |
-
description="
|
|
|
|
| 125 |
)
|
| 126 |
|
| 127 |
if __name__ == "__main__":
|
| 128 |
-
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
from langchain_ollama import ChatOllama
|
| 12 |
from langchain_core.documents import Document
|
| 13 |
from langchain_community.llms import HuggingFacePipeline
|
| 14 |
+
from langchain_community.document_loaders import PyPDFLoader
|
| 15 |
+
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
| 16 |
|
| 17 |
# Initialize in-memory ChromaDB client
|
|
|
|
| 18 |
client = chromadb.Client()
|
| 19 |
|
| 20 |
# Load your embeddings model
|
| 21 |
+
model_kwargs = {"device": "cpu"}
|
| 22 |
encode_kwargs = {"normalize_embeddings": True}
|
| 23 |
embeddings = HuggingFaceEmbeddings(
|
| 24 |
model_name="sentence-transformers/paraphrase-distilroberta-base-v1",
|
|
|
|
| 27 |
)
|
| 28 |
|
| 29 |
# Initialize the vector DB using the in-memory client
|
|
|
|
|
|
|
| 30 |
vectorDB = Chroma(
|
| 31 |
client=client,
|
| 32 |
collection_name="embeddings",
|
| 33 |
embedding_function=embeddings,
|
| 34 |
)
|
|
|
|
|
|
|
|
|
|
| 35 |
|
| 36 |
+
# Function to process and ingest a PDF file
|
| 37 |
+
def process_pdf(file_path):
    """Load a PDF, split it into chunks, and index the chunks in ``vectorDB``.

    Every chunk is tagged with ``docId="uploaded_doc"`` in its metadata so the
    chunks from uploaded files can be addressed as one group.

    Args:
        file_path: Path of the PDF file, handed to ``PyPDFLoader``.

    Returns:
        None. The chunks are written to the module-level ``vectorDB`` and a
        Gradio notification is raised.
    """
    # Use PyPDFLoader to load the PDF
    loader = PyPDFLoader(file_path)
    documents = loader.load()

    # Split the documents for better retrieval
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=400, chunk_overlap=30)
    texts = text_splitter.split_documents(documents)

    if not texts:
        # Nothing came out of the splitter (e.g. a PDF with no extractable
        # text), so there is nothing to index; tell the user instead of
        # handing the store an empty batch.
        gr.Warning("No extractable text was found in the PDF.")
        return

    # add_documents() takes only the document list positionally, so the docId
    # has to live in each chunk's own metadata rather than in a second
    # positional argument. Existing metadata set by the loader is kept.
    for chunk in texts:
        chunk.metadata["docId"] = "uploaded_doc"

    # Ingest into the vector store
    vectorDB.add_documents(texts)
    gr.Info("PDF processed and ready for questions!")
|
| 50 |
+
|
| 51 |
+
# Your existing functions
|
| 52 |
def using_ollama_model(retriever, query, results, conversation_history):
|
| 53 |
history_text = ""
|
| 54 |
for item in conversation_history:
|
|
|
|
| 70 |
)
|
| 71 |
|
| 72 |
doc_texts = "\\n".join([doc.page_content for doc in results])
|
| 73 |
+
model_id = "meta-llama/Llama-3.2-1B-Instruct"
|
|
|
|
| 74 |
tokenizer = AutoTokenizer.from_pretrained(model_id)
|
| 75 |
model = AutoModelForCausalLM.from_pretrained(model_id)
|
| 76 |
pipe = pipeline(
|
|
|
|
| 84 |
repetition_penalty=1.2
|
| 85 |
)
|
| 86 |
|
|
|
|
| 87 |
llm = HuggingFacePipeline(pipeline=pipe)
|
| 88 |
|
|
|
|
| 89 |
rag_chain = template | llm | StrOutputParser()
|
| 90 |
|
| 91 |
answer = rag_chain.invoke({"history": history_text, "results": doc_texts, "query": query})
|
|
|
|
| 114 |
llm_result = using_ollama_model(retriever, query, results, conversation_history)
|
| 115 |
return llm_result
|
| 116 |
|
| 117 |
+
# The revised Gradio wrapper function
|
| 118 |
+
def gradio_rag_wrapper(message, history):
    """Handle one chat turn: ingest uploaded PDFs, then answer the text query.

    Args:
        message: The submitted turn. With ``multimodal=True`` this is expected
            to be a dict with ``"text"`` and ``"files"`` entries; a plain
            string is also accepted and treated as a text-only message.
        history: Earlier turns, iterated as ``(user_msg, bot_msg)`` pairs.

    Returns:
        The reply string: an upload confirmation when only files were sent, a
        prompt for input when the turn is empty, otherwise the result of
        ``retrievingReponse`` for the text query.
    """
    if isinstance(message, dict):
        # `or` also covers keys that are present but set to None.
        uploaded_files = message.get("files") or []
        text_query = message.get("text") or ""
    else:
        uploaded_files = []
        text_query = message or ""

    # Index every attached PDF before looking at the question, so a question
    # sent together with a file is answered against that file.
    for uploaded in uploaded_files:
        # NOTE(review): some Gradio releases may deliver file entries as dicts
        # with a "path" key instead of plain path strings — confirm.
        file_path = uploaded.get("path") if isinstance(uploaded, dict) else uploaded
        process_pdf(file_path)

    if not text_query.strip():
        if uploaded_files:
            # Only a file was uploaded: confirm and wait for a question.
            return "PDF uploaded and processed. You can now ask questions about the content."
        # Neither a file nor any text was supplied.
        return "Please upload a document or enter a text query."

    rag_history = []
    for user_msg, bot_msg in history:
        # User messages may be dicts (text plus files) rather than plain text.
        user_text = user_msg.get("text", "") if isinstance(user_msg, dict) else user_msg
        # NOTE(review): file-only turns presumably show up here with a
        # non-string or empty user message; they carry no question, so they
        # are left out of the conversational context — confirm against the
        # Gradio version in use.
        if not isinstance(user_text, str) or not user_text.strip():
            continue
        rag_history.append({"question": user_text, "answer": bot_msg})

    docId = "uploaded_doc"  # Same docId that process_pdf stamps on uploaded chunks
    response = retrievingReponse(docId, text_query, rag_history)

    return response
|
| 147 |
|
| 148 |
+
# Create the Gradio interface with multimodal input
|
| 149 |
# Chat UI. The multimodal textbox lets a user attach PDFs next to their text;
# the handler then receives the message as a dict instead of a plain string.
demo = gr.ChatInterface(
    title="Contextual RAG Chatbot on Hugging Face Spaces",
    description="Upload a PDF file to start chatting!",
    fn=gradio_rag_wrapper,
    textbox=gr.MultimodalTextbox(file_types=[".pdf"]),  # only PDFs may be attached
    multimodal=True,  # enables file upload
)
|
| 156 |
|
| 157 |
if __name__ == "__main__":
    # Seed the store with one placeholder document so there is something to
    # retrieve for initial testing before any PDF has been uploaded. It shares
    # the "uploaded_doc" docId used for uploaded files.
    placeholder_doc = Document(
        page_content=(
            "This is a sample document about the history of artificial intelligence. "
            "It was created to demonstrate the RAG pipeline."
        ),
        metadata={"docId": "uploaded_doc"},
    )
    vectorDB.add_documents([placeholder_doc])

    demo.launch()
|