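"""LinkedIn AI Analyzer (Streamlit page).

Scrapes the visible text of a public LinkedIn URL, splits it into chunks,
indexes them in FAISS, and answers questions about the content through a
ConversationalRetrievalChain backed by a HuggingFace Hub model.
"""
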
import streamlit as st
import requests
from bs4 import BeautifulSoup
from langchain_text_splitters import CharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from langchain_core.documents import Document
from langchain_community.llms import HuggingFaceHub
import os

st.set_page_config(
    page_title="LinkedIn AI Analyzer",
    page_icon="πŸ’Ό",
    layout="wide"
)

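# Embeddings run locally via sentence-transformers; the model is downloaded
# on first use. Returns None (with a UI error) if loading fails.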
def get_embeddings():
    try:
        embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
        return embeddings
    except Exception as e:
        st.error(f"❌ Failed to load embeddings: {e}")
        return None

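# The LLM is hosted on the HuggingFace Hub and requires HUGGINGFACEHUB_API_TOKEN
# to be set in the environment. Returns None (with a UI error) on failure.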
def get_llm():
    try:
        api_key = os.getenv('HUGGINGFACEHUB_API_TOKEN')
        if not api_key:
            st.error("❌ HuggingFace API Key not found in environment variables")
            return None
        
        llm = HuggingFaceHub(
            repo_id="google/flan-t5-large",
            huggingfacehub_api_token=api_key,
            model_kwargs={"temperature": 0.7, "max_length": 500}
        )
        return llm
    except Exception as e:
        st.error(f"❌ HuggingFace error: {e}")
        return None

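# Fetch a LinkedIn page and reduce its visible text to numbered content
# blocks. The data_type argument is currently unused; it is kept so
# type-specific parsing can be added later.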
def extract_linkedin_data(url, data_type):
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }
        
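        # LinkedIn heavily rate-limits anonymous requests; public pages often
        # come back as a login wall or a non-200 status (commonly 999).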
        response = requests.get(url, headers=headers, timeout=15)
        if response.status_code != 200:
            return f"❌ Failed to access page (Status: {response.status_code})"
        
        soup = BeautifulSoup(response.text, 'html.parser')
        for script in soup(["script", "style"]):
            script.decompose()
        
        text = soup.get_text()
        lines = (line.strip() for line in text.splitlines())
        chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
        text = ' '.join(chunk for chunk in chunks if chunk)
        
        paragraphs = text.split('.')
        meaningful_content = [p.strip() for p in paragraphs if len(p.strip()) > 50]
        
        if not meaningful_content:
            return "❌ No meaningful content found."
        
        result = f"πŸ”— URL: {url}\n"
        result += "="*50 + "\n\n"
        
        for i, content in enumerate(meaningful_content[:10], 1):
            result += f"{i}. {content}\n\n"
        
        result += "="*50 + "\n"
        result += f"βœ… Extracted {len(meaningful_content)} content blocks\n"
        
        return result
        
    except Exception as e:
        return f"❌ Error: {str(e)}"

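# Split extracted text into overlapping ~1000-character chunks for retrieval.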
def get_text_chunks(text):
    if not text.strip():
        return []
    splitter = CharacterTextSplitter(separator="\n", chunk_size=1000, chunk_overlap=200)
    return splitter.split_text(text)

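# Wrap chunks as Documents and index them in an in-memory FAISS store.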
def get_vectorstore(text_chunks):
    if not text_chunks:
        return None
    documents = [Document(page_content=chunk) for chunk in text_chunks]
    embeddings = get_embeddings()
    if embeddings is None:
        return None
    vectorstore = FAISS.from_documents(documents, embeddings)
    return vectorstore

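# Wire the LLM, retriever, and chat memory into a retrieval-augmented
# conversation chain that answers from the top-3 matching chunks.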
def get_conversation_chain(vectorstore):
    if vectorstore is None:
        return None
    try:
        llm = get_llm()
        if llm is None:
            return None
        
        # With return_source_documents=True the chain returns two output keys
        # ("answer" and "source_documents"); the memory needs output_key set
        # so it knows which one to store, otherwise saving raises a ValueError.
        memory = ConversationBufferMemory(
            memory_key="chat_history",
            return_messages=True,
            output_key="answer",
        )
        chain = ConversationalRetrievalChain.from_llm(
            llm=llm,
            retriever=vectorstore.as_retriever(search_kwargs={"k": 3}),
            memory=memory,
            return_source_documents=True
        )
        return chain
    except Exception as e:
        st.error(f"❌ Error: {e}")
        return None

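# Page layout: the sidebar drives extraction and indexing; the main area
# shows the chat on the left and extraction metrics on the right.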
def main():
    st.title("πŸ’Ό LinkedIn AI Analyzer")
    
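    # Back-navigation assumes this file lives under pages/ of a multipage
    # app whose entry point is app.py.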
    if st.button("← Back to Main Dashboard"):
        st.switch_page("app.py")
    
    # Initialize session state
    if "conversation" not in st.session_state:
        st.session_state.conversation = None
    if "chat_history" not in st.session_state:
        st.session_state.chat_history = []
    if "processed" not in st.session_state:
        st.session_state.processed = False
    if "extracted_data" not in st.session_state:
        st.session_state.extracted_data = ""
    
    # Sidebar
    with st.sidebar:
        data_type = st.selectbox("πŸ“Š Content Type", ["profile", "company", "post"])
        
        url_placeholder = {
            "profile": "https://www.linkedin.com/in/username/",
            "company": "https://www.linkedin.com/company/companyname/", 
            "post": "https://www.linkedin.com/posts/username_postid/"
        }
        
        linkedin_url = st.text_input("🌐 LinkedIn URL", placeholder=url_placeholder[data_type])
        
        if st.button("πŸš€ Extract & Analyze", type="primary"):
            if not linkedin_url.strip():
                st.warning("Please enter a LinkedIn URL")
            else:
                with st.spinner("πŸ”„ Extracting data..."):
                    extracted_data = extract_linkedin_data(linkedin_url, data_type)
                    
                    if extracted_data and not extracted_data.startswith("❌"):
                        chunks = get_text_chunks(extracted_data)
                        if chunks:
                            vectorstore = get_vectorstore(chunks)
                            conversation = get_conversation_chain(vectorstore)
                            if conversation:
                                st.session_state.conversation = conversation
                                st.session_state.processed = True
                                st.session_state.extracted_data = extracted_data
                                st.session_state.chat_history = []
                                st.success(f"βœ… Ready to analyze {len(chunks)} content chunks!")
                            else:
                                st.error("❌ Failed to initialize AI")
                        else:
                            st.error("❌ No content extracted")
                    else:
                        st.error(extracted_data)
    
    # Main content
    col1, col2 = st.columns([2, 1])
    
    with col1:
        st.markdown("### πŸ’¬ Chat")
        
        for i, chat in enumerate(st.session_state.chat_history):
            if chat["role"] == "user":
                st.markdown(f"**πŸ‘€ You:** {chat['content']}")
            elif chat["role"] == "assistant":
                if chat["content"]:
                    st.markdown(f"**πŸ€– Assistant:** {chat['content']}")
        
        if st.session_state.processed:
            user_input = st.chat_input("Ask about the LinkedIn data...")
            if user_input:
                st.session_state.chat_history.append({"role": "user", "content": user_input})
                with st.spinner("πŸ€” Analyzing..."):
                    try:
                        if st.session_state.conversation:
                            response = st.session_state.conversation.invoke({"question": user_input})
                            answer = response.get("answer", "No response generated.")
                            st.session_state.chat_history.append({"role": "assistant", "content": answer})
                            st.rerun()
                    except Exception as e:
                        st.session_state.chat_history.append({"role": "assistant", "content": f"❌ Error: {str(e)}"})
                        st.rerun()
        else:
            st.info("πŸ‘‹ Enter a LinkedIn URL and click 'Extract & Analyze' to start")
    
    with col2:
        if st.session_state.processed:
            st.markdown("### πŸ“Š Overview")
            data = st.session_state.extracted_data
            chunks = get_text_chunks(data)
            
            st.metric("Content Type", data_type.title())
            st.metric("Text Chunks", len(chunks))
            st.metric("Characters", f"{len(data):,}")

if __name__ == "__main__":
    main()