Refat81 committed
Commit c8d262e · verified · 1 Parent(s): 9d82598

Delete let_deploy.py

Files changed (1)
  1. let_deploy.py +0 -453
let_deploy.py DELETED
@@ -1,453 +0,0 @@
- # let_deploy.py
- import streamlit as st
- import time
- from bs4 import BeautifulSoup
- from langchain_text_splitters import CharacterTextSplitter
- from langchain_community.embeddings import HuggingFaceEmbeddings
- from langchain_community.vectorstores import FAISS
- from langchain.memory import ConversationBufferMemory
- from langchain.chains import ConversationalRetrievalChain
- from langchain.schema import Document
- from selenium import webdriver
- from selenium.webdriver.common.by import By
- from selenium.webdriver.support.ui import WebDriverWait
- from selenium.webdriver.support import expected_conditions as EC
- from selenium.webdriver.chrome.options import Options
- from selenium.webdriver.chrome.service import Service
- from webdriver_manager.chrome import ChromeDriverManager
- from langchain_community.llms import HuggingFaceHub
- import re
- import requests
- import os
- from datetime import datetime
- from typing import List
- import logging
-
- logging.basicConfig(level=logging.INFO)
- logger = logging.getLogger(__name__)
-
- st.set_page_config(page_title="Facebook Extractor 2.0", page_icon="📘", layout="wide")
-
- st.markdown("""
- <style>
-     .stApp { background-color: #0e1117; color: white; }
-     .main-header { background: linear-gradient(135deg, #FF6B35, #FF8E53); color: white; padding: 1.5rem; border-radius: 8px; margin-bottom: 1.5rem; text-align: center; }
-     .stButton>button { background-color: #1877F2; color: white; border: none; border-radius: 4px; padding: 8px 16px; width: 100%; }
- </style>
- """, unsafe_allow_html=True)
- def get_embeddings():
-     try:
-         embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
-         return embeddings
-     except Exception as e:
-         st.error(f"❌ Failed to load embeddings: {e}")
-         return None
-
- def get_llm():
-     api_key = st.session_state.get('hf_api_key')
-     if not api_key:
-         st.error("❌ HuggingFace API Key not found")
-         return None
-
-     try:
-         llm = HuggingFaceHub(
-             repo_id="google/flan-t5-large",
-             huggingfacehub_api_token=api_key,
-             model_kwargs={"temperature": 0.7, "max_length": 512}
-         )
-         return llm
-     except Exception as e:
-         st.error(f"❌ HuggingFace error: {e}")
-         return None
-
- class FacebookGroupExtractor:
-     def __init__(self):
-         self.driver = None
-         self.wait = None
-         self.is_logged_in = False
-
-     def setup_driver(self):
-         try:
-             chrome_options = Options()
-             chrome_options.add_argument("--no-sandbox")
-             chrome_options.add_argument("--disable-dev-shm-usage")
-             chrome_options.add_argument("--disable-gpu")
-             chrome_options.add_argument("--start-maximized")
-             chrome_options.add_argument("--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36")
-
-             st.info("🔄 Setting up Chrome browser...")
-             try:
-                 service = Service(ChromeDriverManager().install())
-                 self.driver = webdriver.Chrome(service=service, options=chrome_options)
-             except Exception:  # fall back to Selenium's own driver resolution / a chromedriver on PATH
-                 self.driver = webdriver.Chrome(options=chrome_options)
-
-             self.driver.set_page_load_timeout(30)
-             self.wait = WebDriverWait(self.driver, 25)
-             st.success("✅ Chrome browser setup completed!")
-             return True
-         except Exception as e:
-             st.error(f"❌ Failed to setup Chrome: {str(e)}")
-             return False
-
-     def manual_login(self):
-         try:
-             st.info("🔓 Opening Facebook for manual login...")
-             self.driver.get("https://www.facebook.com")
-             time.sleep(3)
-             self.wait.until(EC.presence_of_element_located((By.TAG_NAME, "body")))
-             st.success("✅ Facebook opened successfully!")
-             st.info("""
-             **📝 Manual Login Instructions:**
-             1. A browser window has opened with Facebook
-             2. Log in to your account manually
-             3. Complete any security checks
-             4. Return here and click 'I'm Logged In'
-             """)
-             return True
-         except Exception as e:
-             st.error(f"❌ Failed to open Facebook: {str(e)}")
-             return False
-
-     def check_login_status(self):
-         try:
-             current_url = self.driver.current_url.lower()
-             login_success_urls = ["facebook.com/home", "facebook.com/groups", "facebook.com/marketplace"]
-             if any(url in current_url for url in login_success_urls):
-                 self.is_logged_in = True
-                 return True
-
-             login_indicators = ["//a[@aria-label='Profile']", "//div[@aria-label='Account']", "//span[contains(text(), 'Menu')]"]
-             for indicator in login_indicators:
-                 try:
-                     elements = self.driver.find_elements(By.XPATH, indicator)
-                     for element in elements:
-                         if element.is_displayed():
-                             self.is_logged_in = True
-                             return True
-                 except Exception:
-                     continue
-             return False
-         except Exception as e:
-             logger.error(f"Login check error: {str(e)}")
-             return False
-
-     def extract_group_data(self, group_url: str, max_scrolls: int = 10) -> dict:
-         try:
-             if not self.is_logged_in:
-                 return {"error": "Not logged in. Please login first.", "status": "error"}
-
-             st.info(f"🌐 Accessing group: {group_url}")
-             self.driver.get(group_url)
-             time.sleep(5)
-
-             # Extract group info
-             group_info = self._extract_group_info()
-             posts_data = self._scroll_and_extract_posts(max_scrolls)
-
-             return {
-                 "group_info": group_info,
-                 "posts": posts_data,
-                 "extraction_time": datetime.now().isoformat(),
-                 "total_posts": len(posts_data),
-                 "status": "success"
-             }
-         except Exception as e:
-             logger.error(f"Extraction error: {str(e)}")
-             return {"error": f"Extraction failed: {str(e)}", "status": "error"}
-
-     def _extract_group_info(self) -> dict:
-         group_info = {}
-         try:
-             name_selectors = ["//h1", "//h2", "//h3", "//title"]
-             for selector in name_selectors:
-                 try:
-                     elements = self.driver.find_elements(By.XPATH, selector)
-                     for element in elements:
-                         name = element.text.strip()
-                         if name and len(name) > 3:
-                             group_info["name"] = name
-                             break
-                     if "name" in group_info:
-                         break
-                 except Exception:
-                     continue
-         except Exception as e:
-             logger.warning(f"Group info extraction failed: {str(e)}")
-         return group_info
-
-     def _scroll_and_extract_posts(self, max_scrolls: int) -> List[dict]:
-         all_posts = []
-         last_height = self.driver.execute_script("return document.body.scrollHeight")
-
-         for scroll_iteration in range(max_scrolls):
-             current_posts = self._extract_posts_from_current_page()
-             for post in current_posts:
-                 if not self._is_duplicate_post(post, all_posts):
-                     all_posts.append(post)
-
-             self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
-             time.sleep(3)
-
-             new_height = self.driver.execute_script("return document.body.scrollHeight")
-             if new_height == last_height:
-                 break
-             last_height = new_height
-
-         return all_posts
-
-     def _extract_posts_from_current_page(self) -> List[dict]:
-         posts = []
-         strategies = [
-             ("//div[@role='article']", "article"),
-             ("//div[contains(@data-pagelet, 'Feed')]//div", "feed"),
-             ("//div[contains(@class, 'userContent')]", "userContent")
-         ]
-
-         for xpath, source in strategies:
-             posts.extend(self._extract_by_xpath(xpath, source))
-
-         return posts
-
-     def _extract_by_xpath(self, xpath: str, source: str) -> List[dict]:
-         posts = []
-         try:
-             elements = self.driver.find_elements(By.XPATH, xpath)
-             for element in elements:
-                 try:
-                     post_text = element.text.strip()
-                     if self._is_valid_post(post_text):
-                         post_data = {
-                             "content": post_text,
-                             "source": source,
-                             "timestamp": datetime.now().isoformat(),
-                             "has_comments": False,
-                             "reactions": 0
-                         }
-                         posts.append(post_data)
-                 except Exception:
-                     continue
-         except Exception:
-             pass
-         return posts
-
-     def _is_valid_post(self, text: str) -> bool:
-         if not text or len(text) < 30:
-             return False
-         excluded_phrases = ['facebook', 'login', 'sign up', 'password', 'menu', 'navigation']
-         text_lower = text.lower()
-         if any(phrase in text_lower for phrase in excluded_phrases):
-             return False
-         words = text.split()
-         return len(words) >= 5
-
-     def _is_duplicate_post(self, new_post: dict, existing_posts: List[dict]) -> bool:
-         new_content = new_post.get("content", "")[:100]
-         for existing_post in existing_posts:
-             existing_content = existing_post.get("content", "")[:100]
-             similarity = self._calculate_similarity(new_content, existing_content)
-             if similarity > 0.7:
-                 return True
-         return False
-
-     def _calculate_similarity(self, text1: str, text2: str) -> float:
-         if not text1 or not text2:
-             return 0.0
-         words1 = set(text1.lower().split())
-         words2 = set(text2.lower().split())
-         if not words1 or not words2:
-             return 0.0
-         intersection = words1.intersection(words2)
-         union = words1.union(words2)
-         return len(intersection) / len(union) if union else 0.0  # Jaccard similarity over word sets
-
-     def close(self):
-         if self.driver:
-             try:
-                 self.driver.quit()
-             except Exception:
-                 pass
-
- def process_group_data(group_data: dict):
-     if not group_data or "posts" not in group_data or not group_data["posts"]:
-         return None, []
-
-     all_text = f"Group: {group_data.get('group_info', {}).get('name', 'Unknown')}\n\n"
-     all_text += f"Total Posts: {len(group_data['posts'])}\n\n"
-
-     for i, post in enumerate(group_data["posts"]):
-         content = post.get("content", "")
-         all_text += f"--- Post {i+1} ---\n"
-         all_text += f"Content: {content}\n\n"
-
-     splitter = CharacterTextSplitter(separator="\n", chunk_size=1000, chunk_overlap=200)
-     chunks = splitter.split_text(all_text)
-     documents = [Document(page_content=chunk) for chunk in chunks]
-
-     try:
-         embeddings = get_embeddings()
-         if embeddings is None:
-             return None, []
-         vectorstore = FAISS.from_documents(documents, embeddings)
-         return vectorstore, chunks
-     except Exception as e:
-         st.error(f"Vector store creation failed: {e}")
-         return None, []
-
- def create_chatbot(vectorstore):
-     try:
-         llm = get_llm()
-         if llm is None:
-             return None
-
-         memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True, output_key="answer")  # output_key is required when return_source_documents=True
-         chain = ConversationalRetrievalChain.from_llm(
-             llm=llm,
-             retriever=vectorstore.as_retriever(search_kwargs={"k": 3}),
-             memory=memory,
-             return_source_documents=True
-         )
-         return chain
-     except Exception as e:
-         st.error(f"Failed to create chatbot: {str(e)}")
-         return None
-
- def main():
-     st.markdown("""
-     <div class="main-header">
-         <h1>🔥 Facebook Group Extractor 2.0</h1>
-         <p>Professional Version - Powered by HuggingFace</p>
-     </div>
-     """, unsafe_allow_html=True)
-
-     if st.button("← Back to Main Dashboard", use_container_width=True):
-         st.info("Return to main dashboard")
-         return
-
-     if not st.session_state.get('hf_api_key'):
-         st.error("❌ API Key not configured. Please go back to main dashboard.")
-         return
-
-     # Initialize session state
-     if "extractor" not in st.session_state:
-         st.session_state.extractor = None
-     if "login_status" not in st.session_state:
-         st.session_state.login_status = "not_started"
-     if "group_data" not in st.session_state:
-         st.session_state.group_data = None
-     if "chatbot" not in st.session_state:
-         st.session_state.chatbot = None
-     if "chat_history" not in st.session_state:
-         st.session_state.chat_history = []
-
-     # Sidebar
-     with st.sidebar:
-         st.success("✅ HuggingFace API Active")
-
-         # Login section
-         st.subheader("🔐 Facebook Login")
-
-         if st.session_state.login_status == "not_started":
-             if st.button("🚪 Start Manual Login", type="primary", use_container_width=True):
-                 with st.spinner("Setting up browser..."):
-                     extractor = FacebookGroupExtractor()
-                     if extractor.setup_driver():
-                         st.session_state.extractor = extractor
-                         if extractor.manual_login():
-                             st.session_state.login_status = "in_progress"
-                             st.rerun()
-
-         elif st.session_state.login_status == "in_progress":
-             st.info("🔄 Login in progress...")
-             col1, col2 = st.columns(2)
-             with col1:
-                 if st.button("✅ I'm Logged In", type="primary"):
-                     if st.session_state.extractor and st.session_state.extractor.check_login_status():
-                         st.session_state.login_status = "completed"
-                         st.success("✅ Login successful!")
-                         st.rerun()
-             with col2:
-                 if st.button("❌ Cancel"):
-                     if st.session_state.extractor:
-                         st.session_state.extractor.close()
-                     st.session_state.login_status = "not_started"
-                     st.rerun()
-
-         elif st.session_state.login_status == "completed":
-             st.success("✅ Logged in to Facebook")
-
-         # Group extraction
-         st.subheader("📝 Group Information")
-         group_url = st.text_input("Facebook Group URL", placeholder="https://www.facebook.com/groups/groupname/")
-         max_scrolls = st.slider("Number of scrolls", 5, 20, 10)
-
-         if st.button("🚀 Extract Group Data", type="primary", use_container_width=True):
-             if st.session_state.login_status != "completed":
-                 st.error("❌ Please login to Facebook first")
-             elif not group_url or "facebook.com/groups/" not in group_url:
-                 st.error("❌ Please enter a valid Facebook group URL")
-             else:
-                 with st.spinner("🌐 Extracting group data..."):
-                     group_data = st.session_state.extractor.extract_group_data(group_url, max_scrolls)
-                     if group_data.get("status") == "success":
-                         st.session_state.group_data = group_data
-                         vectorstore, chunks = process_group_data(group_data)
-                         if vectorstore:
-                             st.session_state.chatbot = create_chatbot(vectorstore)
-                             st.session_state.chat_history = []
-                         st.success(f"✅ Successfully extracted {len(group_data['posts'])} posts!")
-
-     # Main content
-     col1, col2 = st.columns([1, 1])
-
-     with col1:
-         st.header("📊 Status")
-
-         if st.session_state.login_status == "not_started":
-             st.info("🔐 Start manual login to begin")
-         elif st.session_state.login_status == "in_progress":
-             st.warning("🔄 Complete login in the browser")
-         elif st.session_state.login_status == "completed":
-             st.success("✅ Ready to extract group data")
-
-         if st.session_state.group_data:
-             group_info = st.session_state.group_data.get("group_info", {})
-             posts = st.session_state.group_data.get("posts", [])
-
-             st.subheader("🏷️ Group Info")
-             if group_info.get("name"):
-                 st.write(f"**Name:** {group_info['name']}")
-             st.write(f"**Posts Extracted:** {len(posts)}")
-
-     with col2:
-         st.header("💬 Chat")
-
-         if st.session_state.chatbot and st.session_state.group_data:
-             for i, chat in enumerate(st.session_state.chat_history):
-                 with st.chat_message("user"):
-                     st.write(chat["question"])
-                 with st.chat_message("assistant"):
-                     st.write(chat["answer"])
-
-             user_question = st.chat_input("Ask about the group...")
-             if user_question:
-                 with st.chat_message("user"):
-                     st.write(user_question)
-                 with st.chat_message("assistant"):
-                     with st.spinner("🤔 Analyzing..."):
-                         try:
-                             response = st.session_state.chatbot.invoke({"question": user_question})
-                             answer = response.get("answer", "No response generated.")
-                             st.write(answer)
-                             st.session_state.chat_history.append({
-                                 "question": user_question,
-                                 "answer": answer
-                             })
-                         except Exception as e:
-                             st.error(f"Error: {str(e)}")
-         else:
-             st.info("📊 Extract group data first to start chatting")
-
- if __name__ == "__main__":
-     main()