Srikesh committed on
Commit 4f2bd66 · verified · 1 Parent(s): e4410fd

Update app.py

Files changed (1)
  1. app.py +145 -84
app.py CHANGED
@@ -1,29 +1,60 @@
 import gradio as gr
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain_community.vectorstores import FAISS
-from langchain_huggingface import HuggingFaceEmbeddings
+from langchain_community.embeddings import HuggingFaceEmbeddings
+from langchain_community.llms import HuggingFacePipeline
 from langchain.chains import ConversationalRetrievalChain
-from langchain_huggingface import HuggingFaceEndpoint
+from langchain.memory import ConversationBufferMemory
+from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
 from pypdf import PdfReader
-import os
-from huggingface_hub import login
+import torch
 
 # Initialize global variables
 vectorstore = None
 qa_chain = None
-chat_history = []
+llm_pipeline = None
 
-def process_pdf(pdf_file, hf_token):
+def initialize_llm():
+    """Initialize the language model (done once at startup)"""
+    global llm_pipeline
+
+    if llm_pipeline is not None:
+        return
+
+    print("Loading language model...")
+
+    # Use a smaller, efficient model that works without API
+    model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
+
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
+    model = AutoModelForCausalLM.from_pretrained(
+        model_name,
+        torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
+        device_map="auto",
+        low_cpu_mem_usage=True
+    )
+
+    pipe = pipeline(
+        "text-generation",
+        model=model,
+        tokenizer=tokenizer,
+        max_new_tokens=512,
+        temperature=0.7,
+        top_p=0.95,
+        repetition_penalty=1.15
+    )
+
+    llm_pipeline = HuggingFacePipeline(pipeline=pipe)
+    print("Model loaded successfully!")
+
+def process_pdf(pdf_file):
     """Process uploaded PDF and create vector store"""
-    global vectorstore, qa_chain, chat_history
+    global vectorstore, qa_chain
 
-    if not hf_token:
-        return "Please provide your Hugging Face API token!", None
+    if pdf_file is None:
+        return "Please upload a PDF file!", None, None
 
     try:
-        # Login to Hugging Face
-        login(token=hf_token)
-
         # Extract text from PDF
         pdf_reader = PdfReader(pdf_file.name)
         text = ""
@@ -31,7 +62,7 @@ def process_pdf(pdf_file, hf_token):
             text += page.extract_text()
 
         if not text.strip():
-            return "Could not extract text from PDF. Please ensure it's a valid PDF with text content.", None
+            return "Could not extract text from PDF. Please ensure it's a valid PDF with text content.", None, None
 
         # Split text into chunks
         text_splitter = RecursiveCharacterTextSplitter(
@@ -41,69 +72,68 @@
         )
         chunks = text_splitter.split_text(text)
 
-        # Create embeddings
+        # Create embeddings (using a lightweight model)
         embeddings = HuggingFaceEmbeddings(
-            model_name="sentence-transformers/all-MiniLM-L6-v2"
+            model_name="sentence-transformers/all-MiniLM-L6-v2",
+            model_kwargs={'device': 'cpu'}
         )
 
         # Create vector store
        vectorstore = FAISS.from_texts(chunks, embeddings)
 
-        # Initialize LLM
-        llm = HuggingFaceEndpoint(
-            repo_id="mistralai/Mistral-7B-Instruct-v0.2",
-            temperature=0.7,
-            max_new_tokens=512,
-            huggingfacehub_api_token=hf_token
+        # Initialize LLM if not already done
+        initialize_llm()
+
+        # Create memory for conversation
+        memory = ConversationBufferMemory(
+            memory_key="chat_history",
+            return_messages=True,
+            output_key="answer"
         )
 
         # Create conversational chain
         qa_chain = ConversationalRetrievalChain.from_llm(
-            llm=llm,
+            llm=llm_pipeline,
             retriever=vectorstore.as_retriever(search_kwargs={"k": 3}),
+            memory=memory,
             return_source_documents=True,
             verbose=False
         )
 
-        # Reset chat history
-        chat_history = []
-
-        return f"PDF processed successfully! Extracted {len(chunks)} text chunks. You can now ask questions!", None
+        return f"✅ PDF processed successfully! Extracted {len(chunks)} text chunks. You can now ask questions!", None, None
 
     except Exception as e:
-        return f"Error processing PDF: {str(e)}", None
+        return f"❌ Error processing PDF: {str(e)}", None, None
 
 def chat(message, history):
     """Handle chat interactions"""
-    global qa_chain, chat_history
+    global qa_chain
 
     if qa_chain is None:
-        return "Please upload and process a PDF first!"
+        return history + [[message, "⚠️ Please upload and process a PDF first!"]]
 
     if not message.strip():
-        return "Please enter a question!"
+        return history
 
     try:
         # Get response from chain
-        result = qa_chain({
-            "question": message,
-            "chat_history": chat_history
-        })
-
+        result = qa_chain({"question": message})
         answer = result["answer"]
 
-        # Update chat history
-        chat_history.append((message, answer))
+        # Clean up the answer (remove any system prompts)
+        if "Answer:" in answer:
+            answer = answer.split("Answer:")[-1].strip()
 
-        return answer
+        return history + [[message, answer]]
 
     except Exception as e:
-        return f"Error: {str(e)}"
+        return history + [[message, f"❌ Error: {str(e)}"]]
 
 def clear_chat():
-    """Clear chat history"""
-    global chat_history
-    chat_history = []
+    """Clear chat history and reset chain"""
+    global qa_chain
+    if qa_chain is not None and hasattr(qa_chain, 'memory'):
+        qa_chain.memory.clear()
     return None
 
 # Create Gradio interface
@@ -111,82 +141,110 @@ with gr.Blocks(theme=gr.themes.Soft(), title="Chat with PDF") as demo:
     gr.Markdown(
         """
         # 📄 Chat with PDF using AI
-        Upload a PDF document and ask questions about its content!
+        Upload a PDF document and ask questions about its content - No API key required!
 
         **Instructions:**
-        1. Enter your Hugging Face API token (get one from [huggingface.co/settings/tokens](https://huggingface.co/settings/tokens))
-        2. Upload a PDF file
-        3. Click "Process PDF"
-        4. Start asking questions about your document!
+        1. Upload a PDF file
+        2. Click "Process PDF" and wait for confirmation
+        3. Start asking questions about your document!
         """
     )
 
     with gr.Row():
         with gr.Column(scale=1):
-            hf_token = gr.Textbox(
-                label="Hugging Face API Token",
-                type="password",
-                placeholder="hf_..."
-            )
             pdf_input = gr.File(
-                label="Upload PDF",
-                file_types=[".pdf"]
+                label="📎 Upload PDF",
+                file_types=[".pdf"],
+                type="filepath"
             )
-            process_btn = gr.Button("Process PDF", variant="primary")
+            process_btn = gr.Button("🔄 Process PDF", variant="primary", size="lg")
             status_output = gr.Textbox(
-                label="Status",
-                interactive=False
+                label="📊 Status",
+                interactive=False,
+                lines=3
+            )
+
+            gr.Markdown(
+                """
+                ### 💡 Tips:
+                - Processing may take 30-60 seconds
+                - Ask specific questions about the content
+                - You can ask follow-up questions
+                - Best with text-based PDFs (not scanned images)
+                """
            )
 
         with gr.Column(scale=2):
             chatbot = gr.Chatbot(
-                label="Chat History",
-                height=400
-            )
-            msg = gr.Textbox(
-                label="Your Question",
-                placeholder="Ask a question about your PDF...",
-                lines=2
+                label="💬 Chat History",
+                height=500,
+                bubble_full_width=False
             )
             with gr.Row():
-                submit_btn = gr.Button("Send", variant="primary")
-                clear_btn = gr.Button("Clear Chat")
+                msg = gr.Textbox(
+                    label="Your Question",
+                    placeholder="Ask a question about your PDF...",
+                    lines=2,
+                    scale=4
+                )
+            with gr.Row():
+                submit_btn = gr.Button("📤 Send", variant="primary", scale=1)
+                clear_btn = gr.Button("🗑️ Clear Chat", scale=1)
 
     gr.Markdown(
         """
-        ### Tips:
-        - Ask specific questions about the content
-        - You can ask follow-up questions
-        - The AI will use context from previous messages
-        - For best results, ensure your PDF has extractable text (not scanned images)
+        ---
+        ### 🔌 API Access
+        Once deployed on Hugging Face Spaces, you can access this via API:
+        ```python
+        # Python example
+        from gradio_client import Client
+
+        client = Client("YOUR_USERNAME/YOUR_SPACE_NAME")
+
+        # Process PDF
+        result = client.predict("path/to/file.pdf", api_name="/process_pdf")
+
+        # Ask questions
+        result = client.predict("What is this document about?", [], api_name="/chat")
+        ```
+
+        ```javascript
+        // JavaScript example
+        const response = await fetch("https://YOUR_USERNAME-YOUR_SPACE_NAME.hf.space/api/predict", {
+            method: "POST",
+            headers: { "Content-Type": "application/json" },
+            body: JSON.stringify({
+                data: ["What is this document about?", []]
+            })
+        });
+        ```
         """
     )
 
     # Event handlers
     process_btn.click(
         fn=process_pdf,
-        inputs=[pdf_input, hf_token],
-        outputs=[status_output, chatbot]
+        inputs=[pdf_input],
+        outputs=[status_output, chatbot, msg]
    )
 
-    submit_btn.click(
+    msg.submit(
         fn=chat,
         inputs=[msg, chatbot],
-        outputs=[msg]
+        outputs=[chatbot]
     ).then(
-        fn=lambda m, h: (h + [[m, chat(m, h)]], ""),
-        inputs=[msg, chatbot],
-        outputs=[chatbot, msg]
+        fn=lambda: "",
+        outputs=[msg]
     )
 
-    msg.submit(
+    submit_btn.click(
         fn=chat,
         inputs=[msg, chatbot],
-        outputs=[msg]
+        outputs=[chatbot]
     ).then(
-        fn=lambda m, h: (h + [[m, chat(m, h)]], ""),
-        inputs=[msg, chatbot],
-        outputs=[chatbot, msg]
+        fn=lambda: "",
+        outputs=[msg]
    )
 
     clear_btn.click(
@@ -194,5 +252,8 @@ with gr.Blocks(theme=gr.themes.Soft(), title="Chat with PDF") as demo:
         outputs=[chatbot]
     )
 
+# Initialize model on startup
+initialize_llm()
+
 if __name__ == "__main__":
-    demo.launch()
+    demo.launch(share=False)
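
Because this commit moves from the hosted `HuggingFaceEndpoint` to running TinyLlama locally through `transformers`, the Space's dependency manifest must cover the new imports. A minimal `requirements.txt` sketch inferred from the import list in the diff above (package names only; exact version pins are left out, and whether `accelerate` is required for `device_map="auto"` should be verified):

```
gradio
langchain
langchain-community
transformers
torch
accelerate
sentence-transformers
faiss-cpu
pypdf
```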
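The other behavioral change worth noting: the chain now owns a `ConversationBufferMemory` (with `memory_key="chat_history"` and `output_key="answer"`), so callers pass only the question instead of threading the history through every call as the old `chat_history` global did. A minimal before/after sketch, assuming `qa_chain` is the chain built in `process_pdf()`:

```python
# New pattern (this commit): the chain's ConversationBufferMemory fills in
# "chat_history" itself, so only the question is supplied.
result = qa_chain({"question": "What is this document about?"})
print(result["answer"])            # final answer (output_key="answer")
print(result["source_documents"])  # retrieved chunks (return_source_documents=True)

# Old pattern (before this commit): history was threaded manually.
# result = qa_chain({"question": message, "chat_history": chat_history})
```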