Spaces:

Refat81
/

Social_Media_Data_Extractor_Chatbot

Sleeping

App Files Files Community

Refat81 commited on Oct 21

Commit

89d85c0

verified ·

1 Parent(s): b1d820a

Update pages/linkedin_extractor.py

Browse files

Files changed (1) hide show

pages/linkedin_extractor.py +0 -216

pages/linkedin_extractor.py CHANGED Viewed

@@ -1,216 +0,0 @@
-# pages/linkedin_extractor.py
-import streamlit as st
-import requests
-from bs4 import BeautifulSoup
-from langchain_text_splitters import CharacterTextSplitter
-from langchain_community.embeddings import HuggingFaceEmbeddings
-from langchain_community.vectorstores import FAISS
-from langchain.memory import ConversationBufferMemory
-from langchain.chains import ConversationalRetrievalChain
-from langchain_core.documents import Document
-from langchain_community.llms import HuggingFaceHub
-import re
-import time
-import os
-st.set_page_config(
-    page_title="LinkedIn AI Analyzer",
-    page_icon="💼",
-    layout="wide"
-)
-def get_embeddings():
-    try:
-        embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
-        return embeddings
-    except Exception as e:
-        st.error(f"❌ Failed to load embeddings: {e}")
-        return None
-def get_llm():
-    try:
-        api_key = os.getenv('HUGGINGFACEHUB_API_TOKEN')
-        if not api_key:
-            st.error("❌ HuggingFace API Key not found in environment variables")
-            return None
-        llm = HuggingFaceHub(
-            repo_id="google/flan-t5-large",
-            huggingfacehub_api_token=api_key,
-            model_kwargs={"temperature": 0.7, "max_length": 500}
-        )
-        return llm
-    except Exception as e:
-        st.error(f"❌ HuggingFace error: {e}")
-        return None
-def extract_linkedin_data(url, data_type):
-    try:
-        headers = {
-            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
-        }
-        response = requests.get(url, headers=headers, timeout=15)
-        if response.status_code != 200:
-            return f"❌ Failed to access page (Status: {response.status_code})"
-        soup = BeautifulSoup(response.text, 'html.parser')
-        for script in soup(["script", "style"]):
-            script.decompose()
-        text = soup.get_text()
-        lines = (line.strip() for line in text.splitlines())
-        chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
-        text = ' '.join(chunk for chunk in chunks if chunk)
-        paragraphs = text.split('.')
-        meaningful_content = [p.strip() for p in paragraphs if len(p.strip()) > 50]
-        if not meaningful_content:
-            return "❌ No meaningful content found."
-        result = f"🔗 URL: {url}\n"
-        result += "="*50 + "\n\n"
-        for i, content in enumerate(meaningful_content[:10], 1):
-            result += f"{i}. {content}\n\n"
-        result += "="*50 + "\n"
-        result += f"✅ Extracted {len(meaningful_content)} content blocks\n"
-        return result
-    except Exception as e:
-        return f"❌ Error: {str(e)}"
-def get_text_chunks(text):
-    if not text.strip():
-        return []
-    splitter = CharacterTextSplitter(separator="\n", chunk_size=1000, chunk_overlap=200)
-    return splitter.split_text(text)
-def get_vectorstore(text_chunks):
-    if not text_chunks:
-        return None
-    documents = [Document(page_content=chunk) for chunk in text_chunks]
-    embeddings = get_embeddings()
-    if embeddings is None:
-        return None
-    vectorstore = FAISS.from_documents(documents, embeddings)
-    return vectorstore
-def get_conversation_chain(vectorstore):
-    if vectorstore is None:
-        return None
-    try:
-        llm = get_llm()
-        if llm is None:
-            return None
-        memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
-        chain = ConversationalRetrievalChain.from_llm(
-            llm=llm,
-            retriever=vectorstore.as_retriever(search_kwargs={"k": 3}),
-            memory=memory,
-            return_source_documents=True
-        )
-        return chain
-    except Exception as e:
-        st.error(f"❌ Error: {e}")
-        return None
-def main():
-    st.title("💼 LinkedIn AI Analyzer")
-    if st.button("← Back to Main Dashboard"):
-        st.switch_page("app.py")
-    # Initialize session state
-    if "conversation" not in st.session_state:
-        st.session_state.conversation = None
-    if "chat_history" not in st.session_state:
-        st.session_state.chat_history = []
-    if "processed" not in st.session_state:
-        st.session_state.processed = False
-    if "extracted_data" not in st.session_state:
-        st.session_state.extracted_data = ""
-    # Sidebar
-    with st.sidebar:
-        data_type = st.selectbox("📊 Content Type", ["profile", "company", "post"])
-        url_placeholder = {
-            "profile": "https://www.linkedin.com/in/username/",
-            "company": "https://www.linkedin.com/company/companyname/",
-            "post": "https://www.linkedin.com/posts/username_postid/"
-        }
-        linkedin_url = st.text_input("🌐 LinkedIn URL", placeholder=url_placeholder[data_type])
-        if st.button("🚀 Extract & Analyze", type="primary"):
-            if not linkedin_url.strip():
-                st.warning("Please enter a LinkedIn URL")
-            else:
-                with st.spinner("🔄 Extracting data..."):
-                    extracted_data = extract_linkedin_data(linkedin_url, data_type)
-                    if extracted_data and not extracted_data.startswith("❌"):
-                        chunks = get_text_chunks(extracted_data)
-                        if chunks:
-                            vectorstore = get_vectorstore(chunks)
-                            conversation = get_conversation_chain(vectorstore)
-                            if conversation:
-                                st.session_state.conversation = conversation
-                                st.session_state.processed = True
-                                st.session_state.extracted_data = extracted_data
-                                st.session_state.chat_history = []
-                                st.success(f"✅ Ready to analyze {len(chunks)} content chunks!")
-                            else:
-                                st.error("❌ Failed to initialize AI")
-                        else:
-                            st.error("❌ No content extracted")
-                    else:
-                        st.error(extracted_data)
-    # Main content
-    col1, col2 = st.columns([2, 1])
-    with col1:
-        st.markdown("### 💬 Chat")
-        for i, chat in enumerate(st.session_state.chat_history):
-            if chat["role"] == "user":
-                st.markdown(f"**👤 You:** {chat['content']}")
-            elif chat["role"] == "assistant":
-                if chat["content"]:
-                    st.markdown(f"**🤖 Assistant:** {chat['content']}")
-        if st.session_state.processed:
-            user_input = st.chat_input("Ask about the LinkedIn data...")
-            if user_input:
-                st.session_state.chat_history.append({"role": "user", "content": user_input})
-                with st.spinner("🤔 Analyzing..."):
-                    try:
-                        if st.session_state.conversation:
-                            response = st.session_state.conversation.invoke({"question": user_input})
-                            answer = response.get("answer", "No response generated.")
-                            st.session_state.chat_history.append({"role": "assistant", "content": answer})
-                            st.rerun()
-                    except Exception as e:
-                        st.session_state.chat_history.append({"role": "assistant", "content": f"❌ Error: {str(e)}"})
-                        st.rerun()
-        else:
-            st.info("👋 Enter a LinkedIn URL and click 'Extract & Analyze' to start")
-    with col2:
-        if st.session_state.processed:
-            st.markdown("### 📊 Overview")
-            data = st.session_state.extracted_data
-            chunks = get_text_chunks(data)
-            st.metric("Content Type", data_type.title())
-            st.metric("Text Chunks", len(chunks))
-            st.metric("Characters", f"{len(data):,}")
-if __name__ == "__main__":
-    main()