Refat81 committed
Commit a72a934 · verified · 1 Parent(s): 069aef5

Update pages/linkedin_extractor.py

Files changed (1):
  1. pages/linkedin_extractor.py +294 -190
pages/linkedin_extractor.py CHANGED
@@ -20,19 +20,36 @@ st.set_page_config(
 )
 
 def get_embeddings():
-    """Initialize HuggingFace embeddings with fallback"""
     try:
-        embeddings = HuggingFaceEmbeddings(
-            model_name="sentence-transformers/all-MiniLM-L6-v2"
-        )
-        return embeddings
     except Exception as e:
-        st.error(f"❌ Failed to load embeddings: {e}")
-        st.info("🔧 Please make sure 'sentence-transformers' is in requirements.txt")
         return None
 
 def get_llm():
-    """Initialize HuggingFace LLM"""
     try:
         api_key = os.getenv('HUGGINGFACEHUB_API_TOKEN')
         if not api_key:
@@ -43,40 +60,55 @@ def get_llm():
             1. Go to Space Settings → Variables and Secrets
             2. Add: HUGGINGFACEHUB_API_TOKEN = "your_hf_token_here"
             3. Restart the Space
             """)
             return None
 
         llm = HuggingFaceHub(
-            repo_id="google/flan-t5-large",
             huggingfacehub_api_token=api_key,
             model_kwargs={
                 "temperature": 0.7,
-                "max_length": 512,
-                "max_new_tokens": 256
             }
         )
         return llm
     except Exception as e:
-        st.error(f"❌ HuggingFace error: {e}")
         return None
 
 def extract_linkedin_data(url, data_type):
     """Extract data from LinkedIn URLs"""
     try:
         headers = {
-            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
         }
 
         st.info(f"🌐 Accessing: {url}")
-        response = requests.get(url, headers=headers, timeout=20)
 
         if response.status_code != 200:
-            return f"❌ Failed to access page (Status: {response.status_code})"
 
         soup = BeautifulSoup(response.text, 'html.parser')
 
         # Remove scripts and styles
-        for script in soup(["script", "style", "meta", "link"]):
             script.decompose()
 
         # Extract and clean text
@@ -86,75 +118,96 @@ def extract_linkedin_data(url, data_type):
         clean_text = ' '.join(chunk for chunk in chunks if chunk)
 
         # Extract meaningful content
-        paragraphs = [p.strip() for p in clean_text.split('.') if len(p.strip()) > 30]
 
         if not paragraphs:
-            return "❌ No meaningful content found. The page might require login."
-
-        # Structure the result
-        result = f"🔗 LINKEDIN DATA EXTRACTION\n"
-        result += "=" * 60 + "\n\n"
-        result += f"📄 URL: {url}\n"
-        result += f"📊 Type: {data_type.upper()}\n"
-        result += f"⏰ Extracted: {time.strftime('%Y-%m-%d %H:%M:%S')}\n"
-        result += f"📝 Content Blocks: {len(paragraphs)}\n"
-        result += "=" * 60 + "\n\n"
 
-        # Add extracted content
-        for i, content in enumerate(paragraphs[:15], 1):
-            result += f"📄 Block {i}:\n"
-            result += f"{content}\n"
-            result += "-" * 40 + "\n\n"
 
-        result += "=" * 60 + "\n"
-        result += f"✅ Successfully extracted {len(paragraphs)} content blocks\n"
-        result += f"📊 Total characters: {len(clean_text):,}\n"
 
-        return result
 
     except requests.exceptions.Timeout:
-        return "❌ Error: Request timed out. Please try again."
     except requests.exceptions.ConnectionError:
-        return "❌ Error: Connection failed. Please check the URL."
     except Exception as e:
-        return f"❌ Error: {str(e)}"
 
-def get_text_chunks(text):
-    """Split text into chunks"""
-    if not text.strip():
-        return []
 
     splitter = CharacterTextSplitter(
         separator="\n",
-        chunk_size=800,
-        chunk_overlap=150,
         length_function=len
     )
-    return splitter.split_text(text)
-
-def get_vectorstore(text_chunks):
-    """Create vector store from text chunks"""
-    if not text_chunks:
-        return None
 
     try:
-        documents = [Document(page_content=chunk) for chunk in text_chunks]
         embeddings = get_embeddings()
-
         if embeddings is None:
-            return None
-
         vectorstore = FAISS.from_documents(documents, embeddings)
-        return vectorstore
     except Exception as e:
-        st.error(f"❌ Vector store creation failed: {e}")
-        return None
 
-def get_conversation_chain(vectorstore):
-    """Create conversational chain"""
-    if vectorstore is None:
-        return None
-
    try:
        llm = get_llm()
        if llm is None:
@@ -168,53 +221,63 @@ def get_conversation_chain(vectorstore):
 
        chain = ConversationalRetrievalChain.from_llm(
            llm=llm,
-            retriever=vectorstore.as_retriever(search_kwargs={"k": 3}),
            memory=memory,
            return_source_documents=True,
            output_key="answer"
        )
        return chain
    except Exception as e:
-        st.error(f"❌ Conversation chain error: {e}")
        return None
 
 def clear_chat_history():
     """Clear chat history while keeping extracted data"""
     if "vectorstore" in st.session_state and st.session_state.vectorstore:
-        st.session_state.chatbot = get_conversation_chain(st.session_state.vectorstore)
         st.session_state.chat_history = []
         st.success("🔄 Chat history cleared! Starting fresh conversation.")
 
 def main():
     st.title("💼 LinkedIn AI Analyzer")
 
     if st.button("← Back to Main Dashboard"):
         st.switch_page("app.py")
 
-    # Check API key
-    if not os.getenv('HUGGINGFACEHUB_API_TOKEN'):
-        st.error("""
-        🔑 **HuggingFace API Key Required**
-
-        To enable AI features:
-        1. Go to **Space Settings** → **Variables and Secrets**
-        2. Add: `HUGGINGFACEHUB_API_TOKEN = "your_hf_token_here"`
-        3. **Restart** the Space
-
-        Get free API key from: https://huggingface.co/settings/tokens
-        """)
-
     # Initialize session state
-    if "conversation" not in st.session_state:
-        st.session_state.conversation = None
-    if "chat_history" not in st.session_state:
-        st.session_state.chat_history = []
-    if "processed" not in st.session_state:
-        st.session_state.processed = False
     if "extracted_data" not in st.session_state:
-        st.session_state.extracted_data = ""
     if "vectorstore" not in st.session_state:
         st.session_state.vectorstore = None
     if "current_url" not in st.session_state:
         st.session_state.current_url = ""
 
@@ -229,7 +292,7 @@ def main():
         help="Select the type of LinkedIn content"
     )
 
-    # URL input with examples
     url_placeholder = {
         "profile": "https://www.linkedin.com/in/username/",
         "company": "https://www.linkedin.com/company/companyname/",
@@ -243,11 +306,12 @@
     )
 
     # Suggested URLs
-    st.markdown("### 💡 Try These:")
     suggested_urls = {
         "Microsoft": "https://www.linkedin.com/company/microsoft/",
         "Google": "https://www.linkedin.com/company/google/",
-        "Apple": "https://www.linkedin.com/company/apple/"
     }
 
     for name, url in suggested_urls.items():
@@ -256,145 +320,185 @@ def main():
             st.rerun()
 
     # Extract button
-    col1, col2 = st.columns(2)
-    with col1:
-        if st.button("🚀 Extract & Analyze", type="primary", use_container_width=True):
-            url_to_use = linkedin_url.strip() or st.session_state.current_url
-
-            if not url_to_use:
-                st.warning("⚠️ Please enter a LinkedIn URL")
-            elif not url_to_use.startswith('https://www.linkedin.com/'):
-                st.error("❌ Please enter a valid LinkedIn URL")
-            else:
-                with st.spinner("🔄 Extracting data from LinkedIn..."):
-                    extracted_data = extract_linkedin_data(url_to_use, data_type)
-
-                    if extracted_data and not extracted_data.startswith("❌"):
-                        # Process for AI
-                        chunks = get_text_chunks(extracted_data)
-                        if chunks:
-                            vectorstore = get_vectorstore(chunks)
-                            conversation = get_conversation_chain(vectorstore)
-
-                            if conversation:
-                                st.session_state.conversation = conversation
-                                st.session_state.vectorstore = vectorstore
-                                st.session_state.processed = True
-                                st.session_state.extracted_data = extracted_data
-                                st.session_state.chat_history = []
-                                st.session_state.current_url = url_to_use
-                                st.success(f"✅ Ready to analyze {len(chunks)} content chunks!")
-                            else:
-                                st.error("❌ Failed to initialize AI")
-                        else:
-                            st.error("❌ No content extracted")
                     else:
-                        st.error(extracted_data)
-
-    with col2:
-        if st.session_state.processed:
-            if st.button("🗑️ Clear Chat", type="secondary", use_container_width=True):
-                clear_chat_history()
 
-    # Display extraction info
-    if st.session_state.processed:
        st.markdown("---")
-        st.markdown("### 📊 Extraction Info")
-        st.write(f"**Type:** {data_type.title()}")
-        st.write(f"**URL:** {st.session_state.current_url[:50]}...")
-        if st.session_state.extracted_data:
-            chunks = get_text_chunks(st.session_state.extracted_data)
-            st.write(f"**Chunks:** {len(chunks)}")
-            st.write(f"**Characters:** {len(st.session_state.extracted_data):,}")
 
    # Main content area
-    col1, col2 = st.columns([2, 1])
 
    with col1:
-        st.markdown("### 💬 AI Conversation")
 
-        # Display chat history
-        for i, chat in enumerate(st.session_state.chat_history):
-            if chat["role"] == "user":
-                with st.chat_message("user"):
-                    st.write(chat["content"])
-            elif chat["role"] == "assistant":
-                with st.chat_message("assistant"):
-                    st.write(chat["content"])
 
-        # Chat input
-        if st.session_state.processed and st.session_state.conversation:
-            user_input = st.chat_input("Ask about the LinkedIn data...")
 
-            if user_input:
-                # Add user message
-                st.session_state.chat_history.append({"role": "user", "content": user_input})
-
-                with st.chat_message("user"):
-                    st.write(user_input)
-
-                # Generate AI response
-                with st.chat_message("assistant"):
-                    with st.spinner("🤔 Analyzing..."):
-                        try:
-                            response = st.session_state.conversation.invoke({"question": user_input})
-                            answer = response.get("answer", "I couldn't generate a response based on the available data.")
-
-                            st.write(answer)
-                            st.session_state.chat_history.append({"role": "assistant", "content": answer})
-                        except Exception as e:
-                            error_msg = f"❌ Error generating response: {str(e)}"
-                            st.write(error_msg)
-                            st.session_state.chat_history.append({"role": "assistant", "content": error_msg})
 
-        elif st.session_state.processed:
-            st.info("💬 Extract data first to start chatting with AI")
        else:
            st.info("""
            👋 **Welcome to LinkedIn AI Analyzer!**
 
            **To get started:**
-            1. Select content type in sidebar
            2. Enter a LinkedIn URL or click a suggested company
            3. Click "Extract & Analyze"
            4. Chat with AI about the extracted content
 
            **Supported URLs:**
-            - 👤 Profiles: `https://www.linkedin.com/in/username/`
-            - 🏢 Companies: `https://www.linkedin.com/company/companyname/`
-            - 📝 Posts: `https://www.linkedin.com/posts/username_postid/`
 
-            **Note:** Only public profiles and content are accessible.
            """)
 
    with col2:
-        st.markdown("### 📈 Analytics")
 
-        if st.session_state.processed:
-            data = st.session_state.extracted_data
-            chunks = get_text_chunks(data)
 
-            st.metric("Content Type", data_type.title())
-            st.metric("Content Chunks", len(chunks))
-            st.metric("Total Characters", f"{len(data):,}")
-            st.metric("Conversation Turns", len(st.session_state.chat_history) // 2)
 
        # Suggested questions
        if not st.session_state.chat_history:
-            st.markdown("### 💡 Suggested Questions")
            suggestions = [
-                "Summarize the main information",
-                "What are the key skills or experiences mentioned?",
-                "Tell me about the company overview",
-                "What's the main content of this page?",
-                "Extract important achievements"
            ]
 
            for suggestion in suggestions:
                if st.button(suggestion, key=f"suggest_{suggestion}", use_container_width=True):
                    st.info(f"💡 Try asking: '{suggestion}'")
        else:
-            st.info("📊 Analytics will appear here after data extraction")
 
 if __name__ == "__main__":
     main()

 )
 
 def get_embeddings():
+    """Initialize embeddings with multiple fallback options"""
     try:
+        # Try multiple embedding models
+        model_options = [
+            "sentence-transformers/all-MiniLM-L6-v2",            # Default
+            "sentence-transformers/paraphrase-albert-small-v2",  # Smaller alternative
+            "sentence-transformers/all-mpnet-base-v2"            # Higher quality
+        ]
+
+        for model_name in model_options:
+            try:
+                embeddings = HuggingFaceEmbeddings(
+                    model_name=model_name,
+                    model_kwargs={'device': 'cpu'},
+                    encode_kwargs={'normalize_embeddings': True}
+                )
+                st.success(f"✅ Loaded embeddings: {model_name.split('/')[-1]}")
+                return embeddings
+            except Exception as e:
+                continue
+
+        st.error("❌ All embedding models failed to load")
+        return None
+
     except Exception as e:
+        st.error(f"❌ Embeddings error: {e}")
         return None
 
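The new loader falls back across three sentence-transformers models instead of failing on the first error. A quick sanity check for whichever model the loop settles on (a hypothetical snippet, not part of the commit; the dimensions assume the listed models):

    emb = get_embeddings()
    if emb is not None:
        vec = emb.embed_query("LinkedIn profile of a data engineer")
        print(len(vec))  # 384 for all-MiniLM-L6-v2, 768 for all-mpnet-base-v2
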
 def get_llm():
+    """Initialize Mistral 7B LLM - Best for analysis"""
     try:
         api_key = os.getenv('HUGGINGFACEHUB_API_TOKEN')
         if not api_key:
             1. Go to Space Settings → Variables and Secrets
             2. Add: HUGGINGFACEHUB_API_TOKEN = "your_hf_token_here"
             3. Restart the Space
+
+            Get free API key: https://huggingface.co/settings/tokens
             """)
             return None
 
+        # Using Mistral 7B - Best balance of quality and accessibility
         llm = HuggingFaceHub(
+            repo_id="mistralai/Mistral-7B-Instruct-v0.1",
             huggingfacehub_api_token=api_key,
             model_kwargs={
                 "temperature": 0.7,
+                "max_length": 2048,
+                "max_new_tokens": 512,
+                "top_p": 0.95,
+                "repetition_penalty": 1.1,
+                "do_sample": True
             }
         )
         return llm
     except Exception as e:
+        st.error(f"❌ AI Model error: {e}")
         return None
 
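Before wiring the model into a retrieval chain, it can be smoke-tested on its own. A hypothetical check, not part of the commit; whether plain call syntax or .invoke() applies depends on the installed LangChain version:

    llm = get_llm()
    if llm is not None:
        # Older LangChain LLM objects are directly callable; newer releases prefer .invoke()
        print(llm("Summarize in one sentence: LinkedIn is a professional networking platform."))
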
 def extract_linkedin_data(url, data_type):
     """Extract data from LinkedIn URLs"""
     try:
         headers = {
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
+            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
+            'Accept-Language': 'en-US,en;q=0.5',
+            'Accept-Encoding': 'gzip, deflate, br',
+            'DNT': '1',
+            'Connection': 'keep-alive',
+            'Upgrade-Insecure-Requests': '1',
         }
 
         st.info(f"🌐 Accessing: {url}")
+        response = requests.get(url, headers=headers, timeout=25)
 
         if response.status_code != 200:
+            return {
+                "error": f"Failed to access page (Status: {response.status_code})",
+                "status": "error"
+            }
 
         soup = BeautifulSoup(response.text, 'html.parser')
 
         # Remove scripts and styles
+        for script in soup(["script", "style", "meta", "link", "nav", "header", "footer"]):
             script.decompose()
 
         # Extract and clean text
         clean_text = ' '.join(chunk for chunk in chunks if chunk)
 
         # Extract meaningful content
+        paragraphs = [p.strip() for p in clean_text.split('.') if len(p.strip()) > 40]
 
         if not paragraphs:
+            return {
+                "error": "No meaningful content found. The page might require login or have restricted access.",
+                "status": "error"
+            }
 
+        # Extract page title
+        title = soup.find('title')
+        page_title = title.text.strip() if title else "LinkedIn Page"
 
+        # Structure the extracted data
+        extracted_data = {
+            "page_info": {
+                "title": page_title,
+                "url": url,
+                "response_code": response.status_code,
+                "content_length": len(clean_text)
+            },
+            "content_blocks": paragraphs,
+            "extraction_time": time.strftime('%Y-%m-%d %H:%M:%S'),
+            "data_type": data_type,
+            "status": "success"
+        }
 
+        return extracted_data
 
     except requests.exceptions.Timeout:
+        return {"error": "Request timed out. Please try again.", "status": "error"}
     except requests.exceptions.ConnectionError:
+        return {"error": "Connection failed. Please check the URL and try again.", "status": "error"}
     except Exception as e:
+        return {"error": f"Extraction error: {str(e)}", "status": "error"}
 
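extract_linkedin_data now returns a structured dict rather than a preformatted string, so callers branch on the "status" key and read "page_info" and "content_blocks" on success. A hypothetical usage sketch, not part of the commit:

    result = extract_linkedin_data("https://www.linkedin.com/company/microsoft/", "company")
    if result.get("status") == "success":
        info = result["page_info"]
        print(info["title"], "-", len(result["content_blocks"]), "content blocks")
    else:
        print("Extraction failed:", result["error"])
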
+def process_extracted_data(extracted_data):
+    """Process extracted data for AI analysis"""
+    if not extracted_data or extracted_data.get("status") != "success":
+        return None, []
+
+    page_info = extracted_data['page_info']
+    content_blocks = extracted_data['content_blocks']
+
+    # Structure the data for AI
+    all_text = f"LINKEDIN DATA ANALYSIS REPORT\n"
+    all_text += "=" * 70 + "\n\n"
+    all_text += f"📄 PAGE INFORMATION:\n"
+    all_text += f"Title: {page_info['title']}\n"
+    all_text += f"URL: {page_info['url']}\n"
+    all_text += f"Type: {extracted_data['data_type'].upper()}\n"
+    all_text += f"Extracted: {extracted_data['extraction_time']}\n"
+    all_text += f"Response Code: {page_info['response_code']}\n"
+    all_text += f"Content Length: {page_info['content_length']} characters\n\n"
+
+    all_text += f"📊 CONTENT ANALYSIS:\n"
+    all_text += f"Total Content Blocks: {len(content_blocks)}\n\n"
+
+    # Add content blocks
+    for i, block in enumerate(content_blocks[:20]):
+        all_text += f"--- CONTENT BLOCK {i+1} ---\n"
+        all_text += f"Words: {len(block.split())} | Characters: {len(block)}\n"
+        all_text += f"Content: {block}\n\n"
+
+    all_text += "=" * 70 + "\n"
+    all_text += "END OF EXTRACTION REPORT"
+
+    # Split into chunks
     splitter = CharacterTextSplitter(
         separator="\n",
+        chunk_size=1000,
+        chunk_overlap=200,
         length_function=len
     )
 
+    chunks = splitter.split_text(all_text)
+    documents = [Document(page_content=chunk) for chunk in chunks]
+
+    # Create vector store
     try:
         embeddings = get_embeddings()
         if embeddings is None:
+            return None, []
         vectorstore = FAISS.from_documents(documents, embeddings)
+        return vectorstore, chunks
     except Exception as e:
+        st.error(f"Vector store creation failed: {e}")
+        return None, []
 
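With chunk_size=1000 and chunk_overlap=200, consecutive chunks share up to 200 characters, so an answer that straddles a chunk boundary still lands intact in at least one chunk. A standalone illustration of these splitter settings (hypothetical, not part of the commit):

    from langchain.text_splitter import CharacterTextSplitter

    splitter = CharacterTextSplitter(separator="\n", chunk_size=1000,
                                     chunk_overlap=200, length_function=len)
    chunks = splitter.split_text("\n".join(f"line {i}" for i in range(500)))
    print(len(chunks), max(len(c) for c in chunks))  # several chunks, each at most 1000 chars
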
+def create_chatbot(vectorstore):
+    """Create conversational chatbot with Mistral"""
     try:
         llm = get_llm()
         if llm is None:
 
         chain = ConversationalRetrievalChain.from_llm(
             llm=llm,
+            retriever=vectorstore.as_retriever(search_kwargs={"k": 4}),
             memory=memory,
             return_source_documents=True,
             output_key="answer"
         )
         return chain
     except Exception as e:
+        st.error(f"Failed to create chatbot: {str(e)}")
         return None
 
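The returned chain is driven elsewhere in the file through .invoke() with a "question" key and read back through the "answer" key; with return_source_documents=True it also exposes the retrieved chunks. A hypothetical standalone sketch, not part of the commit:

    bot = create_chatbot(st.session_state.vectorstore)
    if bot is not None:
        out = bot.invoke({"question": "What does this company do?"})
        print(out["answer"])
        print(len(out["source_documents"]), "chunks retrieved")  # k=4 retriever above
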
 def clear_chat_history():
     """Clear chat history while keeping extracted data"""
     if "vectorstore" in st.session_state and st.session_state.vectorstore:
+        st.session_state.chatbot = create_chatbot(st.session_state.vectorstore)
         st.session_state.chat_history = []
         st.success("🔄 Chat history cleared! Starting fresh conversation.")
 
+def display_metrics(extracted_data):
+    """Display extraction metrics"""
+    if not extracted_data:
+        return
+
+    page_info = extracted_data['page_info']
+    content_blocks = extracted_data['content_blocks']
+
+    col1, col2, col3, col4 = st.columns(4)
+
+    with col1:
+        st.metric("Content Blocks", len(content_blocks))
+
+    with col2:
+        total_words = sum(len(block.split()) for block in content_blocks)
+        st.metric("Total Words", total_words)
+
+    with col3:
+        st.metric("Characters", f"{page_info['content_length']:,}")
+
+    with col4:
+        st.metric("Response Code", page_info['response_code'])
+
 def main():
     st.title("💼 LinkedIn AI Analyzer")
 
     if st.button("← Back to Main Dashboard"):
         st.switch_page("app.py")
 
     # Initialize session state
     if "extracted_data" not in st.session_state:
+        st.session_state.extracted_data = None
     if "vectorstore" not in st.session_state:
         st.session_state.vectorstore = None
+    if "chatbot" not in st.session_state:
+        st.session_state.chatbot = None
+    if "chat_history" not in st.session_state:
+        st.session_state.chat_history = []
+    if "processing" not in st.session_state:
+        st.session_state.processing = False
     if "current_url" not in st.session_state:
         st.session_state.current_url = ""
 
         help="Select the type of LinkedIn content"
     )
 
+    # URL input
     url_placeholder = {
         "profile": "https://www.linkedin.com/in/username/",
         "company": "https://www.linkedin.com/company/companyname/",
     )
 
     # Suggested URLs
+    st.markdown("### 🚀 Quick Test")
     suggested_urls = {
         "Microsoft": "https://www.linkedin.com/company/microsoft/",
         "Google": "https://www.linkedin.com/company/google/",
+        "Apple": "https://www.linkedin.com/company/apple/",
+        "Amazon": "https://www.linkedin.com/company/amazon/"
     }
 
     for name, url in suggested_urls.items():
             st.rerun()
 
     # Extract button
+    if st.button("🚀 Extract & Analyze", type="primary", use_container_width=True):
+        url_to_use = linkedin_url.strip() or st.session_state.current_url
+
+        if not url_to_use:
+            st.warning("⚠️ Please enter a LinkedIn URL")
+        elif not url_to_use.startswith('https://www.linkedin.com/'):
+            st.error("❌ Please enter a valid LinkedIn URL")
+        else:
+            st.session_state.processing = True
+            with st.spinner("🔄 Extracting and analyzing data..."):
+                extracted_data = extract_linkedin_data(url_to_use, data_type)
+
+                if extracted_data.get("status") == "success":
+                    st.session_state.extracted_data = extracted_data
+                    st.session_state.current_url = url_to_use
 
+                    # Process for AI
+                    vectorstore, chunks = process_extracted_data(extracted_data)
+                    if vectorstore:
+                        st.session_state.vectorstore = vectorstore
+                        st.session_state.chatbot = create_chatbot(vectorstore)
+                        st.session_state.chat_history = []
+                        st.success(f"✅ Successfully processed {len(chunks)} content chunks!")
+                        st.balloons()
                    else:
+                        st.error("❌ Failed to process data for AI analysis")
+                else:
+                    error_msg = extracted_data.get("error", "Unknown error occurred")
+                    st.error(f"❌ Extraction failed: {error_msg}")
+
+            st.session_state.processing = False
 
+    # Chat management
+    if st.session_state.chatbot and st.session_state.extracted_data:
        st.markdown("---")
+        st.subheader("💬 Chat Management")
+        if st.button("🗑️ Clear Chat History", type="secondary", use_container_width=True):
+            clear_chat_history()
 
    # Main content area
+    col1, col2 = st.columns([1, 1])
 
    with col1:
+        st.markdown("### 📊 Extraction Results")
 
+        if st.session_state.processing:
+            st.info("🔄 Processing LinkedIn data...")
 
+        elif st.session_state.extracted_data:
+            data = st.session_state.extracted_data
+            page_info = data['page_info']
+            content_blocks = data['content_blocks']
 
+            st.success("✅ Extraction Complete")
+
+            # Display metrics
+            display_metrics(data)
+
+            # Display page info
+            st.markdown("#### 🏷️ Page Information")
+            st.write(f"**Title:** {page_info['title']}")
+            st.write(f"**URL:** {page_info['url']}")
+            st.write(f"**Data Type:** {data['data_type'].title()}")
+            st.write(f"**Content Blocks:** {len(content_blocks)}")
+            st.write(f"**Extraction Time:** {data['extraction_time']}")
+
+            # Display sample content
+            st.markdown("#### 📝 Sample Content")
+            for i, block in enumerate(content_blocks[:3]):
+                with st.expander(f"Content Block {i+1} ({len(block.split())} words)"):
+                    st.write(block)
+
+            if len(content_blocks) > 3:
+                st.info(f"📄 And {len(content_blocks) - 3} more content blocks...")
 
        else:
            st.info("""
            👋 **Welcome to LinkedIn AI Analyzer!**
 
+            **Powered by Mistral 7B AI**
+
            **To get started:**
+            1. Select content type
            2. Enter a LinkedIn URL or click a suggested company
            3. Click "Extract & Analyze"
            4. Chat with AI about the extracted content
 
            **Supported URLs:**
+            - 👤 Public Profiles
+            - 🏢 Company Pages
+            - 📝 Public Posts
 
+            **AI Features:**
+            - Smart content analysis
+            - Conversational chat
+            - Data insights
+            - Content summarization
            """)
 
    with col2:
+        st.markdown("### 💬 AI Chat Analysis")
 
+        if st.session_state.chatbot and st.session_state.extracted_data:
+            # Display chat history
+            for i, chat in enumerate(st.session_state.chat_history):
+                if chat["role"] == "user":
+                    st.markdown(f"**👤 You:** {chat['content']}")
+                elif chat["role"] == "assistant":
+                    st.markdown(f"**🤖 AI:** {chat['content']}")
 
+            # Chat input
+            user_input = st.chat_input("Ask about the LinkedIn data...")
+
+            if user_input:
+                # Add user message
+                st.session_state.chat_history.append({"role": "user", "content": user_input})
+
+                # Generate AI response
+                with st.spinner("🤔 Mistral AI is analyzing..."):
+                    try:
+                        response = st.session_state.chatbot.invoke({"question": user_input})
+                        answer = response.get("answer", "I couldn't generate a response based on the available data.")
+
+                        st.session_state.chat_history.append({"role": "assistant", "content": answer})
+                        st.rerun()
+                    except Exception as e:
+                        error_msg = f"❌ Error generating response: {str(e)}"
+                        st.session_state.chat_history.append({"role": "assistant", "content": error_msg})
+                        st.rerun()
 
            # Suggested questions
            if not st.session_state.chat_history:
+                st.markdown("#### 💡 Suggested Questions")
                suggestions = [
+                    "Summarize the main information from this page",
+                    "What are the key highlights or achievements?",
+                    "Analyze the business or professional focus",
+                    "What insights can you extract from this content?",
+                    "Provide a comprehensive overview"
                ]
 
                for suggestion in suggestions:
                    if st.button(suggestion, key=f"suggest_{suggestion}", use_container_width=True):
                        st.info(f"💡 Try asking: '{suggestion}'")
+
+        elif st.session_state.extracted_data:
+            st.info("💬 Start a conversation with the AI assistant")
        else:
+            st.info("🔍 Extract LinkedIn data to enable AI analysis")
+
+    # Features section
+    st.markdown("---")
+    st.markdown("### 🚀 Powered by Mistral 7B AI")
+
+    feature_cols = st.columns(3)
+
+    with feature_cols[0]:
+        st.markdown("""
+        **🤖 Advanced AI**
+        - Mistral 7B Instruct model
+        - Intelligent text analysis
+        - Contextual understanding
+        """)
+
+    with feature_cols[1]:
+        st.markdown("""
+        **💬 Smart Chat**
+        - Conversational memory
+        - Relevant responses
+        - Data-driven insights
+        """)
+
+    with feature_cols[2]:
+        st.markdown("""
+        **🔍 Deep Analysis**
+        - Content summarization
+        - Pattern recognition
+        - Professional insights
+        """)
 
 if __name__ == "__main__":
     main()