Spaces:

Refat81
/

Social_Media_Data_Extractor_Chatbot

Sleeping

App Files Files Community

Refat81 committed on Oct 21

Commit

2238e58

verified ·

1 Parent(s): 247c0ad

Create facebook_extractor_pro.py

Browse files

Files changed (1) hide show

pages/facebook_extractor_pro.py +581 -0

pages/facebook_extractor_pro.py ADDED Viewed

	@@ -0,0 +1,581 @@

+# pages/facebook_extractor_pro.py
+import streamlit as st
+import time
+from bs4 import BeautifulSoup
+from langchain_text_splitters import CharacterTextSplitter
+from langchain_community.embeddings import HuggingFaceEmbeddings
+from langchain_community.vectorstores import FAISS
+from langchain.memory import ConversationBufferMemory
+from langchain.chains import ConversationalRetrievalChain
+from langchain_core.documents import Document
+from langchain_community.llms import HuggingFaceHub
+import re
+import requests
+import os
+from datetime import datetime
+from typing import List
+import logging
# Logging: INFO-level root configuration plus a module-scoped logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Streamlit page settings (browser tab title, icon and wide layout).
_PAGE_SETTINGS = {
    "page_title": "Facebook Extractor 2.0",
    "page_icon": "📘",
    "layout": "wide",
}
st.set_page_config(**_PAGE_SETTINGS)

# Dark theme plus the CSS classes used by the header, feature cards,
# status pills and chat bubbles rendered further down the page.
_CUSTOM_CSS = """
<style>
    .stApp {
        background-color: #0e1117;
        color: white;
    }
    .main-header {
        background: linear-gradient(135deg, #FF6B35, #FF8E53);
        color: white;
        padding: 2rem;
        border-radius: 10px;
        margin-bottom: 2rem;
        text-align: center;
        box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
    }
    .feature-card {
        background: #1e1e1e;
        padding: 1.5rem;
        border-radius: 10px;
        border-left: 4px solid #FF6B35;
        margin: 1rem 0;
    }
    .status-indicator {
        padding: 0.5rem 1rem;
        border-radius: 20px;
        font-weight: bold;
        text-align: center;
    }
    .status-success {
        background: #10b981;
        color: white;
    }
    .status-warning {
        background: #f59e0b;
        color: white;
    }
    .status-error {
        background: #ef4444;
        color: white;
    }
    .chat-message {
        padding: 1rem;
        border-radius: 10px;
        margin: 0.5rem 0;
    }
    .user-message {
        background: #1e40af;
        color: white;
    }
    .assistant-message {
        background: #374151;
        color: white;
    }
</style>
"""
st.markdown(_CUSTOM_CSS, unsafe_allow_html=True)
def get_embeddings():
    """Build the sentence-transformer embedding model used for vector search.

    Returns:
        A ``HuggingFaceEmbeddings`` instance for
        ``sentence-transformers/all-MiniLM-L6-v2``, or ``None`` when
        construction raises (the error is then shown with ``st.error``).
    """
    model_id = "sentence-transformers/all-MiniLM-L6-v2"
    try:
        return HuggingFaceEmbeddings(model_name=model_id)
    except Exception as exc:
        st.error(f"❌ Failed to load embeddings: {exc}")
    return None
def get_llm():
    """Create the HuggingFace Hub LLM client used by the chatbot.

    The API token is read from the ``HUGGINGFACEHUB_API_TOKEN`` environment
    variable.

    Returns:
        A ``HuggingFaceHub`` client for ``google/flan-t5-large``, or ``None``
        when the token is missing or construction raises. Both failure cases
        are reported to the user with ``st.error``.
    """
    generation_settings = {
        "temperature": 0.7,
        "max_length": 512,
        "top_p": 0.9,
        "top_k": 50,
    }
    try:
        token = os.getenv('HUGGINGFACEHUB_API_TOKEN')
        if token:
            return HuggingFaceHub(
                repo_id="google/flan-t5-large",
                huggingfacehub_api_token=token,
                model_kwargs=generation_settings,
            )
        st.error("❌ HuggingFace API Key not found in environment variables")
        return None
    except Exception as exc:
        st.error(f"❌ HuggingFace error: {exc}")
        return None
class FacebookDataExtractor:
    """Fetch a public Facebook URL with ``requests`` and structure its contents.

    No login or browser automation is involved: a single GET is issued through
    a ``requests.Session`` carrying browser-like headers, and the returned
    HTML is parsed with BeautifulSoup.
    """

    # Tags removed before the visible text is collected.
    _NON_CONTENT_TAGS = ("script", "style", "meta", "link")
    # Only the first N sentence fragments are considered for content blocks.
    _MAX_PARAGRAPHS = 20
    # A fragment must be strictly longer than this many characters to be kept.
    _MIN_BLOCK_CHARS = 30
    # (keywords, label) pairs checked in order by ``analyze_facebook_url``.
    _URL_KINDS = (
        (("groups",), "Facebook Group (Limited access - requires login)"),
        (("pages",), "Facebook Page (Public data accessible)"),
        (("events",), "Facebook Event (Limited access)"),
        (("profile", "user"), "Facebook Profile (Limited access - requires login)"),
    )

    def __init__(self):
        """Create the HTTP session and install the default request headers."""
        self.session = requests.Session()
        self.setup_session()

    def setup_session(self):
        """Install browser-like default headers on the session.

        ``Accept-Encoding`` is deliberately NOT overridden: ``requests``
        already advertises exactly the encodings it can decode. Forcing
        ``br`` without a brotli decoder installed can make servers return a
        body that ``response.text`` cannot decode.
        """
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'DNT': '1',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
        })

    def extract_public_data(self, url: str, data_type: str) -> dict:
        """Download ``url`` and return its metadata and text content.

        Args:
            url: Address of the page to fetch.
            data_type: Caller-supplied label stored unchanged in the result.

        Returns:
            On success, a dict with ``page_info``, ``content_blocks``,
            ``extraction_time``, ``data_type`` and ``status == "success"``.
            On any failure (non-200 response, timeout, connection error or an
            unexpected exception), a dict with ``error`` and
            ``status == "error"``. This method does not raise.
        """
        try:
            st.info(f"🌐 Accessing: {url}")
            response = self.session.get(url, timeout=15)
            if response.status_code != 200:
                return {
                    "error": f"Failed to access page (Status: {response.status_code})",
                    "status": "error"
                }
            soup = BeautifulSoup(response.text, 'html.parser')
            # Metadata must be read BEFORE the visible-text pass: that pass
            # removes every <meta> tag from the tree, after which the
            # description / Open Graph lookups would always come back empty.
            page_info = self._extract_page_info(soup, url)
            clean_text = self._extract_visible_text(soup)
            return {
                "page_info": page_info,
                "content_blocks": self._extract_content_blocks(clean_text),
                "extraction_time": datetime.now().isoformat(),
                "data_type": data_type,
                "status": "success"
            }
        except requests.exceptions.Timeout:
            return {"error": "Request timed out", "status": "error"}
        except requests.exceptions.ConnectionError:
            return {"error": "Connection failed", "status": "error"}
        except Exception as e:
            # Boundary handler: callers expect an error dict, never a raise.
            logger.exception("Extraction error: %s", e)
            return {"error": f"Extraction failed: {str(e)}", "status": "error"}

    @staticmethod
    def _meta_content(soup, attrs: dict) -> str:
        """Return the ``content`` of the first matching <meta> tag, or ``""``."""
        tag = soup.find('meta', attrs=attrs)
        if tag is None:
            return ""
        # A <meta> without a content attribute must not abort the extraction.
        return (tag.get('content') or "").strip()

    def _extract_page_info(self, soup, url: str) -> dict:
        """Collect title, description and Open Graph fields from ``soup``."""
        title = soup.find('title')
        return {
            "title": title.text.strip() if title else "Unknown",
            "description": self._meta_content(soup, {'name': 'description'}),
            "og_title": self._meta_content(soup, {'property': 'og:title'}),
            "og_description": self._meta_content(soup, {'property': 'og:description'}),
            "og_image": self._meta_content(soup, {'property': 'og:image'}),
            "url": url
        }

    def _extract_visible_text(self, soup) -> str:
        """Strip non-content tags from ``soup`` (in place) and return its text.

        Whitespace is normalised: lines are trimmed, split on double spaces,
        and the non-empty pieces are joined with single spaces.
        """
        for tag in soup(list(self._NON_CONTENT_TAGS)):
            tag.decompose()
        lines = (line.strip() for line in soup.get_text().splitlines())
        pieces = (phrase.strip() for line in lines for phrase in line.split("  "))
        return ' '.join(piece for piece in pieces if piece)

    def _extract_content_blocks(self, text: str) -> List[dict]:
        """Split ``text`` on ``.`` and describe each substantial fragment.

        Only the first ``_MAX_PARAGRAPHS`` non-empty fragments are examined,
        and only those longer than ``_MIN_BLOCK_CHARS`` characters are kept.
        ``id`` is the 1-based position among the examined fragments, so ids
        may have gaps where short fragments were skipped.

        Returns:
            A list of dicts with ``id``, ``content``, ``length`` and
            ``word_count``; empty when ``text`` is empty or ``None``.
        """
        if not text:
            return []
        paragraphs = [p.strip() for p in text.split('.') if p.strip()]
        return [
            {
                "id": position,
                "content": paragraph,
                "length": len(paragraph),
                "word_count": len(paragraph.split())
            }
            for position, paragraph in enumerate(paragraphs[:self._MAX_PARAGRAPHS], start=1)
            if len(paragraph) > self._MIN_BLOCK_CHARS
        ]

    def analyze_facebook_url(self, url: str) -> str:
        """Classify ``url`` by case-insensitive keyword match.

        Keywords are matched as substrings of the whole URL, in the order
        groups, pages, events, profile/user; the first hit wins.

        Returns:
            A human-readable label, or ``"Facebook Content (General)"`` when
            no keyword matches.
        """
        url_lower = url.lower()
        for keywords, label in self._URL_KINDS:
            if any(keyword in url_lower for keyword in keywords):
                return label
        return "Facebook Content (General)"
def process_extracted_data(extracted_data: dict):
    """Turn an extraction result into a FAISS index for the chatbot.

    The page metadata and every content block are rendered into one text,
    split into overlapping chunks, and embedded into a FAISS vector store.

    Args:
        extracted_data: Result dict from the extractor; anything falsy or
            without ``status == "success"`` is rejected.

    Returns:
        ``(vectorstore, chunks)`` on success, otherwise ``(None, [])`` (input
        rejected, embeddings unavailable, or index creation failed).
    """
    if not (extracted_data and extracted_data.get("status") == "success"):
        return None, []

    # Render the whole extraction as one plain-text document.
    info = extracted_data['page_info']
    parts = [f"Page Title: {info['title']}\n\n"]
    if info['description']:
        parts.append(f"Description: {info['description']}\n\n")
    if info['og_description']:
        parts.append(f"OpenGraph Description: {info['og_description']}\n\n")
    parts.append(f"Data Type: {extracted_data['data_type']}\n")
    parts.append(f"Extraction Time: {extracted_data['extraction_time']}\n")
    blocks = extracted_data['content_blocks']
    parts.append(f"Content Blocks: {len(blocks)}\n\n")
    for number, block in enumerate(blocks, start=1):
        parts.append(f"--- Content Block {number} ---\n")
        parts.append(f"Words: {block['word_count']} | Characters: {block['length']}\n")
        parts.append(f"Content: {block['content']}\n\n")
    all_text = "".join(parts)

    # Chunk on newlines with overlap so neighbouring chunks share context.
    chunks = CharacterTextSplitter(
        separator="\n",
        chunk_size=800,
        chunk_overlap=150,
        length_function=len,
    ).split_text(all_text)
    documents = [Document(page_content=piece) for piece in chunks]

    # Embed and index; any failure degrades to "no index".
    try:
        embeddings = get_embeddings()
        if embeddings is not None:
            return FAISS.from_documents(documents, embeddings), chunks
        return None, []
    except Exception as exc:
        st.error(f"Vector store creation failed: {exc}")
        return None, []
def create_chatbot(vectorstore):
    """Assemble a conversational retrieval chain over ``vectorstore``.

    The chain uses the LLM from ``get_llm()``, retrieves the top 3 documents
    per question, keeps the conversation in a buffer memory, and returns its
    source documents alongside the answer.

    Returns:
        The chain, or ``None`` when no LLM is available or construction
        raises (the latter is reported with ``st.error``).
    """
    try:
        model = get_llm()
        if model is None:
            return None
        history = ConversationBufferMemory(
            memory_key="chat_history",
            return_messages=True,
            output_key="answer",
        )
        retriever = vectorstore.as_retriever(search_kwargs={"k": 3})
        return ConversationalRetrievalChain.from_llm(
            llm=model,
            retriever=retriever,
            memory=history,
            return_source_documents=True,
            output_key="answer",
        )
    except Exception as exc:
        st.error(f"Failed to create chatbot: {exc}")
        return None
def clear_chat_history():
    """Start a fresh conversation over the already-extracted data.

    A new chatbot (and therefore a new, empty conversation memory) is built
    on the vector store held in ``st.session_state``, and the displayed chat
    history is emptied. Nothing happens when no vector store is present.

    If the new chatbot cannot be created, the current chatbot and history are
    left untouched and no success message is shown; ``create_chatbot`` has
    already reported the failure to the user at that point.
    """
    vectorstore = st.session_state.get("vectorstore")
    if not vectorstore:
        return

    fresh_chatbot = create_chatbot(vectorstore)
    if fresh_chatbot is None:
        # Do not replace a working chatbot with None or claim success.
        return

    st.session_state.chatbot = fresh_chatbot
    st.session_state.chat_history = []
    st.success("🔄 Chat history cleared! Starting fresh conversation.")
def display_status_indicator(status: str, message: str):
    """Render ``message`` as a coloured status pill.

    Args:
        status: ``"success"``, ``"warning"`` or ``"error"``; any other value
            is styled as a warning.
        message: Text placed inside the pill. It is inserted into the HTML
            as-is.
    """
    if status == "success":
        css_class = "status-success"
    elif status == "error":
        css_class = "status-error"
    else:
        css_class = "status-warning"

    markup = (
        "\n"
        f'    <div class="status-indicator {css_class}">\n'
        f"        {message}\n"
        "    </div>\n"
        "    "
    )
    st.markdown(markup, unsafe_allow_html=True)
def _init_session_state():
    """Seed every ``st.session_state`` key the page reads, keeping existing values."""
    if "extractor" not in st.session_state:
        st.session_state.extractor = FacebookDataExtractor()
    defaults = {
        "extracted_data": None,
        "vectorstore": None,
        "chatbot": None,
        "chat_history": [],
        "processing": False,
    }
    for key, value in defaults.items():
        if key not in st.session_state:
            st.session_state[key] = value


def _preview(text: str, limit: int = 200) -> str:
    """Return ``text`` cut to ``limit`` characters, adding ``...`` only if it was cut."""
    if len(text) <= limit:
        return text
    return f"{text[:limit]}..."


def _run_extraction(url: str, data_type: str):
    """Fetch ``url``, index the result for the chatbot and report the outcome in the UI."""
    st.session_state.processing = True
    try:
        with st.spinner("🔄 Extracting data from Facebook..."):
            extracted_data = st.session_state.extractor.extract_public_data(url, data_type)
            if extracted_data.get("status") != "success":
                error_msg = extracted_data.get("error", "Unknown error occurred")
                st.error(f"❌ Extraction failed: {error_msg}")
                return

            st.session_state.extracted_data = extracted_data
            vectorstore, chunks = process_extracted_data(extracted_data)
            # The chat always restarts with the newly extracted page.
            st.session_state.chat_history = []
            if vectorstore:
                st.session_state.vectorstore = vectorstore
                st.session_state.chatbot = create_chatbot(vectorstore)
                st.success(f"✅ Successfully processed {len(chunks)} content chunks!")
            else:
                # Drop the previous index and chatbot: they describe the OLD
                # page and must not answer questions about the new one.
                st.session_state.vectorstore = None
                st.session_state.chatbot = None
                st.error("❌ Failed to process extracted data")
    finally:
        # Always reset, so an unexpected exception cannot leave the page
        # stuck on "Processing...".
        st.session_state.processing = False


def _render_sidebar():
    """Draw the sidebar: URL and type inputs, the extract button and chat management."""
    with st.sidebar:
        st.markdown("### ⚙️ Configuration")
        # URL input
        st.subheader("🔗 Facebook URL")
        facebook_url = st.text_input(
            "Enter Facebook URL",
            placeholder="https://www.facebook.com/username/...",
            help="Enter public Facebook URL (pages work best)"
        )
        # Data type selection
        data_type = st.selectbox(
            "Content Type",
            ["page", "group", "profile", "event", "post"],
            help="Select the type of Facebook content"
        )
        # Extraction settings
        st.subheader("🔧 Settings")
        # NOTE: the selected depth is displayed but not consumed anywhere in
        # this function; the widget is kept so the UI is unchanged.
        st.select_slider(
            "Analysis Depth",
            options=["Basic", "Standard", "Detailed"],
            value="Standard"
        )
        # Extract button
        if st.button("🚀 Extract & Analyze", type="primary", use_container_width=True):
            # Validate the stripped value so pasted whitespace does not
            # reject an otherwise valid URL.
            url = facebook_url.strip()
            if not url:
                st.warning("⚠️ Please enter a Facebook URL")
            elif not url.startswith('https://www.facebook.com/'):
                st.error("❌ Please enter a valid Facebook URL")
            else:
                _run_extraction(url, data_type)
        # Chat management
        if st.session_state.chatbot and st.session_state.extracted_data:
            st.markdown("---")
            st.subheader("💬 Chat Management")
            if st.button("🗑️ Clear Chat History", type="secondary", use_container_width=True):
                clear_chat_history()


def _render_extraction_results():
    """Draw the results column: status pill, page metadata and sample content blocks."""
    st.markdown("### 📊 Extraction Results")
    if st.session_state.processing:
        display_status_indicator("warning", "🔄 Processing...")
        st.info("Extracting data from Facebook. This may take a few seconds.")
    elif st.session_state.extracted_data:
        data = st.session_state.extracted_data
        page_info = data['page_info']
        content_blocks = data['content_blocks']
        display_status_indicator("success", "✅ Extraction Complete")
        # Display page info
        st.markdown("#### 🏷️ Page Information")
        st.write(f"**Title:** {page_info['title']}")
        if page_info['description']:
            st.write(f"**Description:** {_preview(page_info['description'])}")
        if page_info['og_description']:
            st.write(f"**OG Description:** {_preview(page_info['og_description'])}")
        st.write(f"**URL:** {page_info['url']}")
        st.write(f"**Data Type:** {data['data_type'].title()}")
        st.write(f"**Content Blocks:** {len(content_blocks)}")
        # First 19 characters of the ISO timestamp: date and time to the second.
        st.write(f"**Extraction Time:** {data['extraction_time'][:19]}")
        # Display sample content
        st.markdown("#### 📝 Sample Content")
        for number, block in enumerate(content_blocks[:3], start=1):
            with st.expander(f"Content Block {number} ({block['word_count']} words)"):
                st.write(block['content'])
        if len(content_blocks) > 3:
            st.info(f"📄 And {len(content_blocks) - 3} more content blocks...")
    else:
        display_status_indicator("warning", "⏳ Ready for Extraction")
        st.info("""
            **To get started:**
            1. Enter a Facebook URL in the sidebar
            2. Select content type
            3. Click "Extract & Analyze"
            **Supported URLs:**
            - 🏢 Facebook Pages (best results)
            - 📘 Public Groups (limited)
            - 👤 Public Profiles (limited)
            - 🎉 Events (limited)
            - 📝 Posts (limited)
            **Note:** This version extracts public data only.
            Private content requires manual login (available in local deployment).
            """)


def _render_chat_panel():
    """Draw the chat column: history, question input and suggested questions."""
    import html  # local import: only this helper needs it

    st.markdown("### 💬 AI Analysis")
    if st.session_state.chatbot and st.session_state.extracted_data:
        # Display chat history. Content is escaped because it is typed by the
        # user or generated from scraped page text, and it is rendered with
        # unsafe_allow_html=True.
        for chat in st.session_state.chat_history:
            safe_content = html.escape(str(chat["content"]))
            if chat["role"] == "user":
                st.markdown(f'<div class="chat-message user-message"><strong>👤 You:</strong> {safe_content}</div>',
                            unsafe_allow_html=True)
            elif chat["role"] == "assistant":
                st.markdown(f'<div class="chat-message assistant-message"><strong>🤖 Assistant:</strong> {safe_content}</div>',
                            unsafe_allow_html=True)
        # Chat input
        user_input = st.chat_input("Ask about the Facebook data...")
        if user_input:
            st.session_state.chat_history.append({"role": "user", "content": user_input})
            with st.spinner("🤔 Analyzing..."):
                try:
                    response = st.session_state.chatbot.invoke({"question": user_input})
                    answer = response.get("answer", "I couldn't generate a response based on the available data.")
                except Exception as e:
                    logger.exception("Chat response generation failed")
                    answer = f"❌ Error generating response: {str(e)}"
            st.session_state.chat_history.append({"role": "assistant", "content": answer})
            # Rerun outside the try block so the new messages are drawn by
            # the history loop above.
            st.rerun()
        # Suggested questions
        if not st.session_state.chat_history:
            st.markdown("#### 💡 Suggested Questions")
            suggestions = [
                "Summarize the main content of this page",
                "What is this page primarily about?",
                "Extract key information from the content",
                "What are the main topics discussed?",
                "Provide an overview of this Facebook content"
            ]
            for suggestion in suggestions:
                if st.button(suggestion, key=f"suggest_{suggestion}", use_container_width=True):
                    st.info(f"💡 Try asking: '{suggestion}'")
    elif st.session_state.extracted_data:
        # Data exists but there is no chatbot: indexing or LLM setup failed.
        st.warning("⚠️ Data was extracted, but the AI chatbot could not be initialised. "
                   "Check the error messages in the sidebar and try extracting again.")
    else:
        st.info("🔍 Extract Facebook data to enable AI analysis")


def _render_feature_cards():
    """Draw the three static feature cards at the bottom of the page."""
    st.markdown("---")
    st.markdown("### 🚀 Enhanced Features")
    feature_cols = st.columns(3)
    with feature_cols[0]:
        st.markdown("""
        <div class="feature-card">
            <h4>🔍 Smart Extraction</h4>
            <p>Advanced algorithms for better content recognition and structure analysis</p>
        </div>
        """, unsafe_allow_html=True)
    with feature_cols[1]:
        st.markdown("""
        <div class="feature-card">
            <h4>🤖 AI-Powered Analysis</h4>
            <p>HuggingFace integration for intelligent content understanding and Q&A</p>
        </div>
        """, unsafe_allow_html=True)
    with feature_cols[2]:
        st.markdown("""
        <div class="feature-card">
            <h4>💬 Contextual Memory</h4>
            <p>Maintains conversation context for more meaningful interactions</p>
        </div>
        """, unsafe_allow_html=True)


def main():
    """Render the Facebook Extractor page.

    Layout, top to bottom: header banner, "Back to Main" navigation, sidebar
    controls, a results column next to a chat column, and the feature cards.
    Rendering stops after the navigation with an explanatory error when
    ``HUGGINGFACEHUB_API_TOKEN`` is not set in the environment.
    """
    # Header
    st.markdown("""
    <div class="main-header">
        <h1 style="margin:0; font-size: 2.5rem;">🔥 Facebook Extractor 2.0</h1>
        <p style="margin:0; opacity: 0.9; font-size: 1.2rem;">Enhanced Version - AI-Powered Analysis</p>
    </div>
    """, unsafe_allow_html=True)
    # Navigation (the wide second column is only a spacer)
    nav_col, _ = st.columns([1, 4])
    with nav_col:
        if st.button("← Back to Main", use_container_width=True):
            st.switch_page("app.py")
    # Check API key
    if not os.getenv('HUGGINGFACEHUB_API_TOKEN'):
        st.error("""
        ❌ HuggingFace API Key not configured!
        Please add your API key to Hugging Face Space settings:
        1. Go to your Space Settings
        2. Click "Repository Secrets"
        3. Add: `HUGGINGFACEHUB_API_TOKEN = "your_token_here"`
        4. Restart the Space
        """)
        return

    _init_session_state()
    _render_sidebar()

    # Main content area
    results_col, chat_col = st.columns([1, 1])
    with results_col:
        _render_extraction_results()
    with chat_col:
        _render_chat_panel()

    _render_feature_cards()


if __name__ == "__main__":
    main()