# Refat81's picture
# Update pages/linkedin_extractor.py
# 6af20e8 verified
# raw
# history blame
# 8.19 kB
import streamlit as st
import requests
from bs4 import BeautifulSoup
from langchain_text_splitters import CharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from langchain_core.documents import Document
from langchain_community.llms import HuggingFaceHub
import re
import time
import os
# Page-level Streamlit settings: browser-tab title and icon, wide layout.
st.set_page_config(
    layout="wide",
    page_icon="πŸ’Ό",
    page_title="LinkedIn AI Analyzer",
)
def get_embeddings():
    """Build the sentence-embedding model used to index extracted text.

    Returns:
        A ``HuggingFaceEmbeddings`` instance configured with the
        ``sentence-transformers/all-MiniLM-L6-v2`` model, or ``None`` if
        constructing it raised. Failures are reported with ``st.error``.
    """
    model_name = "sentence-transformers/all-MiniLM-L6-v2"
    try:
        return HuggingFaceEmbeddings(model_name=model_name)
    except Exception as exc:
        st.error(f"❌ Failed to load embeddings: {exc}")
    return None
def get_llm():
    """Create the HuggingFace Hub LLM client used to answer questions.

    The API token is read from the ``HUGGINGFACEHUB_API_TOKEN`` environment
    variable.

    Returns:
        A ``HuggingFaceHub`` instance for ``google/flan-t5-large``, or ``None``
        when the token is missing/empty or when anything in the setup raises.
        Both failure cases are reported with ``st.error``.
    """
    try:
        token = os.getenv('HUGGINGFACEHUB_API_TOKEN')
        if token:
            generation_options = {"temperature": 0.7, "max_length": 500}
            return HuggingFaceHub(
                repo_id="google/flan-t5-large",
                huggingfacehub_api_token=token,
                model_kwargs=generation_options,
            )
        st.error("❌ HuggingFace API Key not found in environment variables")
        return None
    except Exception as exc:
        st.error(f"❌ HuggingFace error: {exc}")
        return None
def extract_linkedin_data(url, data_type):
    """Fetch a LinkedIn page and return its visible text as a numbered report.

    Args:
        url: Address of the page to fetch. It must use ``http`` or ``https``
            and its host must be ``linkedin.com`` or one of its subdomains;
            surrounding whitespace is ignored.
        data_type: Content kind selected by the caller. It is not used by the
            extraction itself and is kept only for interface compatibility.

    Returns:
        On success, a report string: a header with the URL, up to ten
        numbered text fragments (each longer than 50 characters), and a
        footer giving the total number of fragments found. On failure, a
        message string carrying the same leading error marker as the other
        error messages in this function, which callers use to detect errors.
        Exceptions raised while fetching or parsing are caught and returned
        as such a message.
    """
    # Function-scope import so this block does not depend on the file's
    # top-level import list.
    from urllib.parse import urlparse

    try:
        url = url.strip()

        # The URL is user-supplied and fetched from the server side, so only
        # LinkedIn web addresses are accepted; anything else (other hosts,
        # non-web schemes, missing scheme) is rejected before any request.
        # NOTE(review): the check applies to the URL as entered; redirects
        # followed by the HTTP client are not re-validated.
        parsed = urlparse(url)
        host = (parsed.hostname or "").lower()
        is_linkedin_host = host == "linkedin.com" or host.endswith(".linkedin.com")
        if parsed.scheme not in ("http", "https") or not is_linkedin_host:
            return "❌ Error: URL must be an http(s) link on linkedin.com"

        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }
        response = requests.get(url, headers=headers, timeout=15)
        if response.status_code != 200:
            return f"❌ Failed to access page (Status: {response.status_code})"

        soup = BeautifulSoup(response.text, 'html.parser')
        # Drop non-visible content before extracting text.
        for tag in soup(["script", "style"]):
            tag.decompose()

        # Collapse the page text into single-space-separated words.
        lines = (line.strip() for line in soup.get_text().splitlines())
        words = (phrase.strip() for line in lines for phrase in line.split(" "))
        text = ' '.join(word for word in words if word)

        # Treat '.'-separated fragments longer than 50 characters as content.
        fragments = [p.strip() for p in text.split('.') if len(p.strip()) > 50]
        if not fragments:
            return "❌ No meaningful content found."

        separator = "=" * 50
        parts = [f"πŸ”— URL: {url}\n", separator + "\n\n"]
        # Only the first ten fragments are listed; the footer reports the
        # total number found.
        parts.extend(
            f"{i}. {content}\n\n" for i, content in enumerate(fragments[:10], 1)
        )
        parts.append(separator + "\n")
        parts.append(f"βœ… Extracted {len(fragments)} content blocks\n")
        return "".join(parts)
    except Exception as e:
        return f"❌ Error: {str(e)}"
def get_text_chunks(text):
    """Split text into overlapping chunks for embedding.

    Args:
        text: The text to split.

    Returns:
        An empty list when ``text`` is empty or whitespace-only; otherwise the
        result of ``CharacterTextSplitter.split_text`` using a newline
        separator, a chunk size of 1000 and an overlap of 200.
    """
    if text.strip():
        chunker = CharacterTextSplitter(
            chunk_overlap=200,
            chunk_size=1000,
            separator="\n",
        )
        return chunker.split_text(text)
    return []
def get_vectorstore(text_chunks):
    """Index text chunks in a FAISS vector store.

    Args:
        text_chunks: Strings to index; each becomes one ``Document``.

    Returns:
        The store built by ``FAISS.from_documents``, or ``None`` when there
        are no chunks or ``get_embeddings()`` returned ``None``.
    """
    if text_chunks:
        docs = [Document(page_content=piece) for piece in text_chunks]
        embedder = get_embeddings()
        if embedder is not None:
            return FAISS.from_documents(docs, embedder)
    return None
def get_conversation_chain(vectorstore):
    """Build the conversational retrieval chain over a vector store.

    Args:
        vectorstore: Store to retrieve context from; ``None`` is accepted.

    Returns:
        A ``ConversationalRetrievalChain`` that retrieves the top 3 matches
        per question and keeps the chat history in memory, or ``None`` when
        ``vectorstore`` is ``None``, when ``get_llm()`` returned ``None``, or
        when building the chain raised (reported with ``st.error``).
    """
    if vectorstore is None:
        return None
    try:
        llm = get_llm()
        if llm is None:
            return None
        # With return_source_documents=True the chain outputs both "answer"
        # and "source_documents". The memory can only store one output, so it
        # must be told which key to keep; without output_key it raises when
        # saving the turn and every question ends in an error.
        memory = ConversationBufferMemory(
            memory_key="chat_history",
            return_messages=True,
            output_key="answer",
        )
        chain = ConversationalRetrievalChain.from_llm(
            llm=llm,
            retriever=vectorstore.as_retriever(search_kwargs={"k": 3}),
            memory=memory,
            return_source_documents=True,
        )
        return chain
    except Exception as e:
        st.error(f"❌ Error: {e}")
        return None
def _init_session_state():
    """Create the session-state keys this page uses, if they are missing."""
    defaults = {
        "conversation": None,
        "chat_history": [],
        "processed": False,
        "extracted_data": "",
        # Content type of the data currently loaded (set on extraction).
        "data_type": None,
    }
    for key, value in defaults.items():
        if key not in st.session_state:
            st.session_state[key] = value


def _run_extraction(linkedin_url, data_type):
    """Extract the page, build the chat chain and store both in session state."""
    with st.spinner("πŸ”„ Extracting data..."):
        extracted_data = extract_linkedin_data(linkedin_url, data_type)
        # Extraction failures come back as strings carrying the error marker.
        if not extracted_data or extracted_data.startswith("❌"):
            st.error(extracted_data)
            return

        chunks = get_text_chunks(extracted_data)
        if not chunks:
            st.error("❌ No content extracted")
            return

        vectorstore = get_vectorstore(chunks)
        conversation = get_conversation_chain(vectorstore)
        if not conversation:
            st.error("❌ Failed to initialize AI")
            return

        st.session_state.conversation = conversation
        st.session_state.processed = True
        st.session_state.extracted_data = extracted_data
        # Remember which content type was extracted so the overview does not
        # follow later changes of the selectbox.
        st.session_state.data_type = data_type
        st.session_state.chat_history = []
        st.success(f"βœ… Ready to analyze {len(chunks)} content chunks!")


def _render_sidebar():
    """Draw the sidebar inputs, run extraction on click; return the selected type."""
    data_type = st.selectbox("πŸ“Š Content Type", ["profile", "company", "post"])
    url_placeholder = {
        "profile": "https://www.linkedin.com/in/username/",
        "company": "https://www.linkedin.com/company/companyname/",
        "post": "https://www.linkedin.com/posts/username_postid/"
    }
    linkedin_url = st.text_input("🌐 LinkedIn URL", placeholder=url_placeholder[data_type])
    if st.button("πŸš€ Extract & Analyze", type="primary"):
        if not linkedin_url.strip():
            st.warning("Please enter a LinkedIn URL")
        else:
            _run_extraction(linkedin_url, data_type)
    return data_type


def _render_chat():
    """Draw the chat history and handle a newly submitted question."""
    st.markdown("### πŸ’¬ Chat")
    for chat in st.session_state.chat_history:
        if chat["role"] == "user":
            st.markdown(f"**πŸ‘€ You:** {chat['content']}")
        elif chat["role"] == "assistant":
            if chat["content"]:
                st.markdown(f"**πŸ€– Assistant:** {chat['content']}")

    if not st.session_state.processed:
        st.info("πŸ‘‹ Enter a LinkedIn URL and click 'Extract & Analyze' to start")
        return

    user_input = st.chat_input("Ask about the LinkedIn data...")
    if not user_input:
        return

    st.session_state.chat_history.append({"role": "user", "content": user_input})
    with st.spinner("πŸ€” Analyzing..."):
        try:
            if st.session_state.conversation:
                response = st.session_state.conversation.invoke({"question": user_input})
                answer = response.get("answer", "No response generated.")
                st.session_state.chat_history.append({"role": "assistant", "content": answer})
        except Exception as e:
            st.session_state.chat_history.append({"role": "assistant", "content": f"❌ Error: {str(e)}"})
    # The history was drawn before this turn was appended; rerun so the new
    # messages are displayed.
    st.rerun()


def _render_overview(selected_type):
    """Draw summary metrics for the extracted data, if any has been processed."""
    if not st.session_state.processed:
        return
    st.markdown("### πŸ“Š Overview")
    data = st.session_state.extracted_data
    chunks = get_text_chunks(data)
    # Prefer the type recorded at extraction time; fall back to the current
    # selection if none was recorded.
    shown_type = st.session_state.data_type or selected_type
    st.metric("Content Type", shown_type.title())
    st.metric("Text Chunks", len(chunks))
    st.metric("Characters", f"{len(data):,}")


def main():
    """Render the LinkedIn analyzer page.

    Draws the title and a button that switches to ``app.py``, initializes
    session state, then renders the sidebar (URL input and extraction), the
    chat column and the overview column.
    """
    st.title("πŸ’Ό LinkedIn AI Analyzer")
    if st.button("← Back to Main Dashboard"):
        st.switch_page("app.py")

    # Initialize session state
    _init_session_state()

    # Sidebar
    with st.sidebar:
        data_type = _render_sidebar()

    # Main content
    col1, col2 = st.columns([2, 1])
    with col1:
        _render_chat()
    with col2:
        _render_overview(data_type)
# Render the page only when this file is run as a script, not when imported.
if __name__ == "__main__":
    main()