import streamlit as st
import requests
from bs4 import BeautifulSoup
from langchain_text_splitters import CharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from langchain_core.documents import Document
from langchain_community.llms import HuggingFaceHub
import os

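# Page-level Streamlit configuration; st.set_page_config must be the first st.* call.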
st.set_page_config(
    page_title="LinkedIn AI Analyzer",
    page_icon="💼",
    layout="wide"
)

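# Embedding loader: all-MiniLM-L6-v2 is a compact sentence-transformers model.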
def get_embeddings():
    try:
        embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
        return embeddings
    except Exception as e:
        st.error(f"❌ Failed to load embeddings: {e}")
        return None

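# LLM loader: flan-t5-large served through the HuggingFace Hub inference API,
# authenticated with the HUGGINGFACEHUB_API_TOKEN environment variable.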
def get_llm():
    try:
        api_key = os.getenv('HUGGINGFACEHUB_API_TOKEN')
        if not api_key:
            st.error("❌ HuggingFace API Key not found in environment variables")
            return None

        llm = HuggingFaceHub(
            repo_id="google/flan-t5-large",
            huggingfacehub_api_token=api_key,
            model_kwargs={"temperature": 0.7, "max_length": 500}
        )
        return llm
    except Exception as e:
        st.error(f"❌ HuggingFace error: {e}")
        return None

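# Scrape a public LinkedIn URL and reduce it to readable text. LinkedIn blocks
# most unauthenticated requests, so this only works for pages that render
# without a login; data_type is accepted for UI parity but unused here.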
def extract_linkedin_data(url, data_type):
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }

        response = requests.get(url, headers=headers, timeout=15)
        if response.status_code != 200:
            return f"❌ Failed to access page (Status: {response.status_code})"

        soup = BeautifulSoup(response.text, 'html.parser')
        for script in soup(["script", "style"]):
            script.decompose()

        text = soup.get_text()
        lines = (line.strip() for line in text.splitlines())
        chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
        text = ' '.join(chunk for chunk in chunks if chunk)

        paragraphs = text.split('.')
        meaningful_content = [p.strip() for p in paragraphs if len(p.strip()) > 50]

        if not meaningful_content:
            return "❌ No meaningful content found."

        result = f"🔗 URL: {url}\n"
        result += "=" * 50 + "\n\n"

        for i, content in enumerate(meaningful_content[:10], 1):
            result += f"{i}. {content}\n\n"

        result += "=" * 50 + "\n"
        result += f"✅ Extracted {len(meaningful_content)} content blocks\n"

        return result

    except Exception as e:
        return f"❌ Error: {str(e)}"

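# Split text into 1,000-character chunks with 200-character overlap for retrieval.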
def get_text_chunks(text):
    if not text.strip():
        return []
    splitter = CharacterTextSplitter(separator="\n", chunk_size=1000, chunk_overlap=200)
    return splitter.split_text(text)

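# Embed the chunks and index them in an in-memory FAISS vector store.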
def get_vectorstore(text_chunks):
    if not text_chunks:
        return None
    documents = [Document(page_content=chunk) for chunk in text_chunks]
    embeddings = get_embeddings()
    if embeddings is None:
        return None
    vectorstore = FAISS.from_documents(documents, embeddings)
    return vectorstore

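# Wire the LLM, retriever, and chat memory into a ConversationalRetrievalChain.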
def get_conversation_chain(vectorstore):
    if vectorstore is None:
        return None
    try:
        llm = get_llm()
        if llm is None:
            return None

        # output_key is required here: with return_source_documents=True the chain
        # returns two keys, and the memory must know which one to store.
        memory = ConversationBufferMemory(
            memory_key="chat_history",
            return_messages=True,
            output_key="answer"
        )
        chain = ConversationalRetrievalChain.from_llm(
            llm=llm,
            retriever=vectorstore.as_retriever(search_kwargs={"k": 3}),
            memory=memory,
            return_source_documents=True
        )
        return chain
    except Exception as e:
        st.error(f"❌ Error: {e}")
        return None

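# Streamlit entry point: the sidebar drives extraction, the main area hosts the chat.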
def main():
    st.title("💼 LinkedIn AI Analyzer")

    if st.button("⬅️ Back to Main Dashboard"):
        st.switch_page("app.py")

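    # Initialize per-session state so chat history survives Streamlit reruns.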
    if "conversation" not in st.session_state:
        st.session_state.conversation = None
    if "chat_history" not in st.session_state:
        st.session_state.chat_history = []
    if "processed" not in st.session_state:
        st.session_state.processed = False
    if "extracted_data" not in st.session_state:
        st.session_state.extracted_data = ""

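    # Sidebar: choose the content type and URL, then trigger extraction.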
    with st.sidebar:
        data_type = st.selectbox("📋 Content Type", ["profile", "company", "post"])

        url_placeholder = {
            "profile": "https://www.linkedin.com/in/username/",
            "company": "https://www.linkedin.com/company/companyname/",
            "post": "https://www.linkedin.com/posts/username_postid/"
        }

        linkedin_url = st.text_input("🔗 LinkedIn URL", placeholder=url_placeholder[data_type])

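        # On click: scrape, chunk, embed, and build the conversation chain.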
        if st.button("🚀 Extract & Analyze", type="primary"):
            if not linkedin_url.strip():
                st.warning("Please enter a LinkedIn URL")
            else:
                with st.spinner("🔍 Extracting data..."):
                    extracted_data = extract_linkedin_data(linkedin_url, data_type)

                if extracted_data and not extracted_data.startswith("❌"):
                    chunks = get_text_chunks(extracted_data)
                    if chunks:
                        vectorstore = get_vectorstore(chunks)
                        conversation = get_conversation_chain(vectorstore)
                        if conversation:
                            st.session_state.conversation = conversation
                            st.session_state.processed = True
                            st.session_state.extracted_data = extracted_data
                            st.session_state.chat_history = []
                            st.success(f"✅ Ready to analyze {len(chunks)} content chunks!")
                        else:
                            st.error("❌ Failed to initialize AI")
                    else:
                        st.error("❌ No content extracted")
                else:
                    st.error(extracted_data)

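    # Two-column layout: chat on the left, extraction stats on the right.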
    col1, col2 = st.columns([2, 1])

    with col1:
        st.markdown("### 💬 Chat")

        for chat in st.session_state.chat_history:
            if chat["role"] == "user":
                st.markdown(f"**👤 You:** {chat['content']}")
            elif chat["role"] == "assistant":
                if chat["content"]:
                    st.markdown(f"**🤖 Assistant:** {chat['content']}")

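    # Route each question through the retrieval chain, then rerun to render the reply.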
    if st.session_state.processed:
        user_input = st.chat_input("Ask about the LinkedIn data...")
        if user_input:
            st.session_state.chat_history.append({"role": "user", "content": user_input})
            with st.spinner("🤖 Analyzing..."):
                try:
                    if st.session_state.conversation:
                        response = st.session_state.conversation.invoke({"question": user_input})
                        answer = response.get("answer", "No response generated.")
                        st.session_state.chat_history.append({"role": "assistant", "content": answer})
                        st.rerun()
                except Exception as e:
                    st.session_state.chat_history.append({"role": "assistant", "content": f"❌ Error: {str(e)}"})
                    st.rerun()
    else:
        st.info("👆 Enter a LinkedIn URL and click 'Extract & Analyze' to start")

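    # Right column: quick stats about the extracted text once processing succeeds.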
    with col2:
        if st.session_state.processed:
            st.markdown("### 📊 Overview")
            data = st.session_state.extracted_data
            chunks = get_text_chunks(data)

            st.metric("Content Type", data_type.title())
            st.metric("Text Chunks", len(chunks))
            st.metric("Characters", f"{len(data):,}")

if __name__ == "__main__":
    main()