Spaces:

Refat81
/

Social_Media_Data_Extractor_Chatbot

Sleeping

App Files Files Community

Refat81 committed on Oct 21

Commit

f3a65d7

verified ·

1 Parent(s): 89d85c0

Update pages/linkedin_extractor.py

Browse files

Files changed (1) hide show

pages/linkedin_extractor.py +216 -0

pages/linkedin_extractor.py CHANGED Viewed

	@@ -0,0 +1,216 @@

+# pages/linkedin_extractor.py
+import streamlit as st
+import requests
+from bs4 import BeautifulSoup
+from langchain_text_splitters import CharacterTextSplitter
+from langchain_community.embeddings import HuggingFaceEmbeddings
+from langchain_community.vectorstores import FAISS
+from langchain.memory import ConversationBufferMemory
+from langchain.chains import ConversationalRetrievalChain
+from langchain_core.documents import Document
+from langchain_community.llms import HuggingFaceHub
+import re
+import time
+import os
+st.set_page_config(
+    page_title="LinkedIn AI Analyzer",
+    page_icon="💼",
+    layout="wide"
+)
+def get_embeddings():
+    try:
+        embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
+        return embeddings
+    except Exception as e:
+        st.error(f"❌ Failed to load embeddings: {e}")
+        return None
+def get_llm():
+    try:
+        api_key = os.getenv('HUGGINGFACEHUB_API_TOKEN')
+        if not api_key:
+            st.error("❌ HuggingFace API Key not found in environment variables")
+            return None
+        llm = HuggingFaceHub(
+            repo_id="google/flan-t5-large",
+            huggingfacehub_api_token=api_key,
+            model_kwargs={"temperature": 0.7, "max_length": 500}
+        )
+        return llm
+    except Exception as e:
+        st.error(f"❌ HuggingFace error: {e}")
+        return None
+def extract_linkedin_data(url, data_type):
+    try:
+        headers = {
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
+        }
+        response = requests.get(url, headers=headers, timeout=15)
+        if response.status_code != 200:
+            return f"❌ Failed to access page (Status: {response.status_code})"
+        soup = BeautifulSoup(response.text, 'html.parser')
+        for script in soup(["script", "style"]):
+            script.decompose()
+        text = soup.get_text()
+        lines = (line.strip() for line in text.splitlines())
+        chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
+        text = ' '.join(chunk for chunk in chunks if chunk)
+        paragraphs = text.split('.')
+        meaningful_content = [p.strip() for p in paragraphs if len(p.strip()) > 50]
+        if not meaningful_content:
+            return "❌ No meaningful content found."
+        result = f"🔗 URL: {url}\n"
+        result += "="*50 + "\n\n"
+        for i, content in enumerate(meaningful_content[:10], 1):
+            result += f"{i}. {content}\n\n"
+        result += "="*50 + "\n"
+        result += f"✅ Extracted {len(meaningful_content)} content blocks\n"
+        return result
+    except Exception as e:
+        return f"❌ Error: {str(e)}"
+def get_text_chunks(text):
+    if not text.strip():
+        return []
+    splitter = CharacterTextSplitter(separator="\n", chunk_size=1000, chunk_overlap=200)
+    return splitter.split_text(text)
+def get_vectorstore(text_chunks):
+    if not text_chunks:
+        return None
+    documents = [Document(page_content=chunk) for chunk in text_chunks]
+    embeddings = get_embeddings()
+    if embeddings is None:
+        return None
+    vectorstore = FAISS.from_documents(documents, embeddings)
+    return vectorstore
+def get_conversation_chain(vectorstore):
+    if vectorstore is None:
+        return None
+    try:
+        llm = get_llm()
+        if llm is None:
+            return None
+        memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
+        chain = ConversationalRetrievalChain.from_llm(
+            llm=llm,
+            retriever=vectorstore.as_retriever(search_kwargs={"k": 3}),
+            memory=memory,
+            return_source_documents=True
+        )
+        return chain
+    except Exception as e:
+        st.error(f"❌ Error: {e}")
+        return None
+def main():
+    st.title("💼 LinkedIn AI Analyzer")
+    if st.button("← Back to Main Dashboard"):
+        st.switch_page("app.py")
+    # Initialize session state
+    if "conversation" not in st.session_state:
+        st.session_state.conversation = None
+    if "chat_history" not in st.session_state:
+        st.session_state.chat_history = []
+    if "processed" not in st.session_state:
+        st.session_state.processed = False
+    if "extracted_data" not in st.session_state:
+        st.session_state.extracted_data = ""
+    # Sidebar
+    with st.sidebar:
+        data_type = st.selectbox("📊 Content Type", ["profile", "company", "post"])
+        url_placeholder = {
+            "profile": "https://www.linkedin.com/in/username/",
+            "company": "https://www.linkedin.com/company/companyname/",
+            "post": "https://www.linkedin.com/posts/username_postid/"
+        }
+        linkedin_url = st.text_input("🌐 LinkedIn URL", placeholder=url_placeholder[data_type])
+        if st.button("🚀 Extract & Analyze", type="primary"):
+            if not linkedin_url.strip():
+                st.warning("Please enter a LinkedIn URL")
+            else:
+                with st.spinner("🔄 Extracting data..."):
+                    extracted_data = extract_linkedin_data(linkedin_url, data_type)
+                    if extracted_data and not extracted_data.startswith("❌"):
+                        chunks = get_text_chunks(extracted_data)
+                        if chunks:
+                            vectorstore = get_vectorstore(chunks)
+                            conversation = get_conversation_chain(vectorstore)
+                            if conversation:
+                                st.session_state.conversation = conversation
+                                st.session_state.processed = True
+                                st.session_state.extracted_data = extracted_data
+                                st.session_state.chat_history = []
+                                st.success(f"✅ Ready to analyze {len(chunks)} content chunks!")
+                            else:
+                                st.error("❌ Failed to initialize AI")
+                        else:
+                            st.error("❌ No content extracted")
+                    else:
+                        st.error(extracted_data)
+    # Main content
+    col1, col2 = st.columns([2, 1])
+    with col1:
+        st.markdown("### 💬 Chat")
+        for i, chat in enumerate(st.session_state.chat_history):
+            if chat["role"] == "user":
+                st.markdown(f"**👤 You:** {chat['content']}")
+            elif chat["role"] == "assistant":
+                if chat["content"]:
+                    st.markdown(f"**🤖 Assistant:** {chat['content']}")
+        if st.session_state.processed:
+            user_input = st.chat_input("Ask about the LinkedIn data...")
+            if user_input:
+                st.session_state.chat_history.append({"role": "user", "content": user_input})
+                with st.spinner("🤔 Analyzing..."):
+                    try:
+                        if st.session_state.conversation:
+                            response = st.session_state.conversation.invoke({"question": user_input})
+                            answer = response.get("answer", "No response generated.")
+                            st.session_state.chat_history.append({"role": "assistant", "content": answer})
+                            st.rerun()
+                    except Exception as e:
+                        st.session_state.chat_history.append({"role": "assistant", "content": f"❌ Error: {str(e)}"})
+                        st.rerun()
+        else:
+            st.info("👋 Enter a LinkedIn URL and click 'Extract & Analyze' to start")
+    with col2:
+        if st.session_state.processed:
+            st.markdown("### 📊 Overview")
+            data = st.session_state.extracted_data
+            chunks = get_text_chunks(data)
+            st.metric("Content Type", data_type.title())
+            st.metric("Text Chunks", len(chunks))
+            st.metric("Characters", f"{len(data):,}")
+# Render the page when this file is executed as the main script.
+if __name__ == "__main__":
+    main()