|
|
|
|
|
import streamlit as st |
|
|
import requests |
|
|
from bs4 import BeautifulSoup |
|
|
from langchain_text_splitters import CharacterTextSplitter |
|
|
from langchain_community.embeddings import HuggingFaceEmbeddings |
|
|
from langchain_community.vectorstores import FAISS |
|
|
from langchain.memory import ConversationBufferMemory |
|
|
from langchain.chains import ConversationalRetrievalChain |
|
|
from langchain_core.documents import Document |
|
|
from langchain_community.llms import HuggingFaceHub |
|
|
import re |
|
|
import time |
|
|
import os |
|
|
|
|
|
# Page chrome: st.set_page_config must be the first Streamlit command executed.
st.set_page_config(
    page_title="LinkedIn AI Analyzer",
    page_icon="πΌ",  # NOTE(review): mojibake — likely an emoji originally; confirm file encoding
    layout="wide"
)
|
|
|
|
|
def get_embeddings():
    """Initialize sentence-transformer embeddings, trying candidate models in order.

    Returns:
        A HuggingFaceEmbeddings instance for the first model that loads
        successfully, or None if every candidate fails (an error message
        is shown in the Streamlit UI in that case).
    """
    # Ordered by preference: smallest/fastest first, larger fallback last.
    model_options = [
        "sentence-transformers/all-MiniLM-L6-v2",
        "sentence-transformers/paraphrase-albert-small-v2",
        "sentence-transformers/all-mpnet-base-v2",
    ]

    for model_name in model_options:
        try:
            embeddings = HuggingFaceEmbeddings(
                model_name=model_name,
                model_kwargs={'device': 'cpu'},
                # Normalized vectors make inner-product search behave like
                # cosine similarity in the FAISS index built downstream.
                encode_kwargs={'normalize_embeddings': True}
            )
            # FIX: this f-string was split across two physical lines in the
            # original (a SyntaxError); rejoined onto one line.
            st.success(f"β Loaded embeddings: {model_name.split('/')[-1]}")
            return embeddings
        except Exception:
            # Expected failure mode (model not cached, no network, etc.) —
            # fall through to the next candidate.
            continue

    st.error("β All embedding models failed to load")
    return None
|
|
|
|
|
def get_llm():
    """Initialize the Mistral-7B-Instruct LLM via the HuggingFace Hub.

    Reads the API token from the HUGGINGFACEHUB_API_TOKEN environment
    variable. Returns a configured HuggingFaceHub LLM, or None when the
    token is missing or initialization fails (errors are shown in the UI).
    """
    try:
        api_key = os.getenv('HUGGINGFACEHUB_API_TOKEN')
        if not api_key:
            # No token available: show setup instructions instead of
            # failing later with an opaque authentication error.
            st.error("""
            β HuggingFace API Key not found!

            Please add your API key:
            1. Go to Space Settings β Variables and Secrets
            2. Add: HUGGINGFACEHUB_API_TOKEN = "your_hf_token_here"
            3. Restart the Space

            Get free API key: https://huggingface.co/settings/tokens
            """)
            return None

        # NOTE(review): HuggingFaceHub is deprecated in recent langchain
        # releases in favor of HuggingFaceEndpoint — confirm the pinned
        # langchain-community version still ships it.
        llm = HuggingFaceHub(
            repo_id="mistralai/Mistral-7B-Instruct-v0.1",
            huggingfacehub_api_token=api_key,
            model_kwargs={
                "temperature": 0.7,        # sampling temperature
                "max_length": 2048,        # total sequence length cap
                "max_new_tokens": 512,     # generated tokens cap
                "top_p": 0.95,             # nucleus sampling threshold
                "repetition_penalty": 1.1,
                "do_sample": True
            }
        )
        return llm
    except Exception as e:
        st.error(f"β AI Model error: {e}")
        return None
|
|
|
|
|
def extract_linkedin_data(url, data_type):
    """Fetch a public LinkedIn page and extract its visible text content.

    Args:
        url: Full LinkedIn URL (profile, company page, or post).
        data_type: Caller-supplied label ("profile" | "company" | "post"),
            stored verbatim in the result for downstream display.

    Returns:
        On success: dict with status "success", page_info metadata,
        content_blocks (list of text fragments), extraction_time and
        data_type. On failure: dict with status "error" and an "error"
        message.
    """
    try:
        # Browser-like headers reduce the chance of a bot-blocking page.
        # FIX: the original advertised 'Accept-Encoding: gzip, deflate, br';
        # if the optional brotli package is absent, requests cannot decode a
        # br-compressed body and response.text becomes garbage. Omitting the
        # header lets requests advertise only encodings it can decode.
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'DNT': '1',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
        }

        st.info(f"π Accessing: {url}")
        response = requests.get(url, headers=headers, timeout=25)

        if response.status_code != 200:
            return {
                "error": f"Failed to access page (Status: {response.status_code})",
                "status": "error"
            }

        soup = BeautifulSoup(response.text, 'html.parser')

        # Remove non-content elements before extracting text.
        for tag in soup(["script", "style", "meta", "link", "nav", "header", "footer"]):
            tag.decompose()

        # Collapse whitespace: strip each line, split fragments, and rejoin
        # non-empty pieces with single spaces.
        text = soup.get_text()
        lines = (line.strip() for line in text.splitlines())
        chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
        clean_text = ' '.join(chunk for chunk in chunks if chunk)

        # Treat sentence-like fragments longer than 40 chars as content
        # blocks; shorter pieces are usually navigation noise.
        paragraphs = [p.strip() for p in clean_text.split('.') if len(p.strip()) > 40]

        if not paragraphs:
            return {
                "error": "No meaningful content found. The page might require login or have restricted access.",
                "status": "error"
            }

        title = soup.find('title')
        page_title = title.text.strip() if title else "LinkedIn Page"

        extracted_data = {
            "page_info": {
                "title": page_title,
                "url": url,
                "response_code": response.status_code,
                "content_length": len(clean_text)
            },
            "content_blocks": paragraphs,
            "extraction_time": time.strftime('%Y-%m-%d %H:%M:%S'),
            "data_type": data_type,
            "status": "success"
        }

        return extracted_data

    except requests.exceptions.Timeout:
        return {"error": "Request timed out. Please try again.", "status": "error"}
    except requests.exceptions.ConnectionError:
        return {"error": "Connection failed. Please check the URL and try again.", "status": "error"}
    except Exception as e:
        return {"error": f"Extraction error: {str(e)}", "status": "error"}
|
|
|
|
|
def process_extracted_data(extracted_data):
    """Turn an extraction result into a FAISS vector store plus its raw chunks.

    Returns:
        (vectorstore, chunks) on success, or (None, []) when the input is
        missing/failed or when embedding / index construction fails.
    """
    if not extracted_data or extracted_data.get("status") != "success":
        return None, []

    info = extracted_data['page_info']
    blocks = extracted_data['content_blocks']
    divider = "=" * 70

    # Assemble a plain-text report that the text splitter will chunk.
    parts = [
        "LINKEDIN DATA ANALYSIS REPORT\n",
        divider + "\n\n",
        "π PAGE INFORMATION:\n",
        f"Title: {info['title']}\n",
        f"URL: {info['url']}\n",
        f"Type: {extracted_data['data_type'].upper()}\n",
        f"Extracted: {extracted_data['extraction_time']}\n",
        f"Response Code: {info['response_code']}\n",
        f"Content Length: {info['content_length']} characters\n\n",
        "π CONTENT ANALYSIS:\n",
        f"Total Content Blocks: {len(blocks)}\n\n",
    ]

    # Include at most the first 20 blocks to keep the report bounded.
    for idx, block in enumerate(blocks[:20], start=1):
        parts.append(f"--- CONTENT BLOCK {idx} ---\n")
        parts.append(f"Words: {len(block.split())} | Characters: {len(block)}\n")
        parts.append(f"Content: {block}\n\n")

    parts.append(divider + "\n")
    parts.append("END OF EXTRACTION REPORT")
    report = ''.join(parts)

    splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len
    )
    chunks = splitter.split_text(report)
    documents = [Document(page_content=chunk) for chunk in chunks]

    try:
        embeddings = get_embeddings()
        if embeddings is None:
            return None, []
        vectorstore = FAISS.from_documents(documents, embeddings)
        return vectorstore, chunks
    except Exception as e:
        st.error(f"Vector store creation failed: {e}")
        return None, []
|
|
|
|
|
def create_chatbot(vectorstore):
    """Build a ConversationalRetrievalChain over the given vector store.

    Returns:
        The chain, or None when the LLM cannot be initialized or chain
        construction fails (an error is shown in the UI).
    """
    try:
        llm = get_llm()
        if llm is None:
            return None

        # Buffer memory so follow-up questions keep conversational context.
        chat_memory = ConversationBufferMemory(
            memory_key="chat_history",
            return_messages=True,
            output_key="answer"
        )

        # Retrieve the 4 most similar chunks for each question.
        retriever = vectorstore.as_retriever(search_kwargs={"k": 4})

        return ConversationalRetrievalChain.from_llm(
            llm=llm,
            retriever=retriever,
            memory=chat_memory,
            return_source_documents=True,
            output_key="answer"
        )
    except Exception as e:
        st.error(f"Failed to create chatbot: {str(e)}")
        return None
|
|
|
|
|
def clear_chat_history():
    """Reset the conversation while keeping the extracted data.

    Rebuilds the chatbot (so its buffer memory starts empty) when a vector
    store exists, then clears the displayed chat transcript.
    """
    store = st.session_state.vectorstore if "vectorstore" in st.session_state else None
    if store:
        # A fresh chain gets fresh (empty) conversation memory.
        st.session_state.chatbot = create_chatbot(store)
    st.session_state.chat_history = []
    st.success("π Chat history cleared! Starting fresh conversation.")
|
|
|
|
|
def display_metrics(extracted_data):
    """Render a four-column summary: blocks, words, characters, HTTP status."""
    if not extracted_data:
        return

    info = extracted_data['page_info']
    blocks = extracted_data['content_blocks']
    word_count = sum(len(b.split()) for b in blocks)

    columns = st.columns(4)
    with columns[0]:
        st.metric("Content Blocks", len(blocks))
    with columns[1]:
        st.metric("Total Words", word_count)
    with columns[2]:
        st.metric("Characters", f"{info['content_length']:,}")
    with columns[3]:
        st.metric("Response Code", info['response_code'])
|
|
|
|
|
def main():
    """Streamlit page: extract a LinkedIn URL and chat with an AI about it."""
    st.title("πΌ LinkedIn AI Analyzer")

    # Navigation back to the multi-page app entry point.
    if st.button("β Back to Main Dashboard"):
        st.switch_page("app.py")

    # --- Session-state defaults (persist across Streamlit reruns) ---
    if "extracted_data" not in st.session_state:
        st.session_state.extracted_data = None
    if "vectorstore" not in st.session_state:
        st.session_state.vectorstore = None
    if "chatbot" not in st.session_state:
        st.session_state.chatbot = None
    if "chat_history" not in st.session_state:
        st.session_state.chat_history = []
    if "processing" not in st.session_state:
        st.session_state.processing = False
    if "current_url" not in st.session_state:
        st.session_state.current_url = ""

    # --- Sidebar: configuration and extraction controls ---
    with st.sidebar:
        st.markdown("### βοΈ Configuration")

        data_type = st.selectbox(
            "π Content Type",
            ["profile", "company", "post"],
            help="Select the type of LinkedIn content"
        )

        # Example URL shown as the input placeholder, per content type.
        url_placeholder = {
            "profile": "https://www.linkedin.com/in/username/",
            "company": "https://www.linkedin.com/company/companyname/",
            "post": "https://www.linkedin.com/posts/username_postid/"
        }

        linkedin_url = st.text_input(
            "π LinkedIn URL",
            placeholder=url_placeholder[data_type],
            help="Enter a public LinkedIn URL"
        )

        # One-click sample company pages for quick testing.
        st.markdown("### π Quick Test")
        suggested_urls = {
            "Microsoft": "https://www.linkedin.com/company/microsoft/",
            "Google": "https://www.linkedin.com/company/google/",
            "Apple": "https://www.linkedin.com/company/apple/",
            "Amazon": "https://www.linkedin.com/company/amazon/"
        }
        for name, url in suggested_urls.items():
            if st.button(f"π’ {name}", key=name, use_container_width=True):
                st.session_state.current_url = url
                st.rerun()

        if st.button("π Extract & Analyze", type="primary", use_container_width=True):
            # Prefer the typed URL; fall back to a previously clicked sample.
            url_to_use = linkedin_url.strip() or st.session_state.current_url

            if not url_to_use:
                st.warning("β οΈ Please enter a LinkedIn URL")
            elif not url_to_use.startswith('https://www.linkedin.com/'):
                st.error("β Please enter a valid LinkedIn URL")
            else:
                st.session_state.processing = True
                with st.spinner("π Extracting and analyzing data..."):
                    extracted_data = extract_linkedin_data(url_to_use, data_type)

                    if extracted_data.get("status") == "success":
                        st.session_state.extracted_data = extracted_data
                        st.session_state.current_url = url_to_use

                        # Build the vector store and attach a fresh chatbot.
                        vectorstore, chunks = process_extracted_data(extracted_data)
                        if vectorstore:
                            st.session_state.vectorstore = vectorstore
                            st.session_state.chatbot = create_chatbot(vectorstore)
                            st.session_state.chat_history = []
                            # FIX: this f-string was split across two physical
                            # lines in the original (a SyntaxError); rejoined.
                            st.success(f"β Successfully processed {len(chunks)} content chunks!")
                            st.balloons()
                        else:
                            st.error("β Failed to process data for AI analysis")
                    else:
                        error_msg = extracted_data.get("error", "Unknown error occurred")
                        st.error(f"β Extraction failed: {error_msg}")

                st.session_state.processing = False

        # Chat management only appears once a chatbot exists.
        if st.session_state.chatbot and st.session_state.extracted_data:
            st.markdown("---")
            st.subheader("π¬ Chat Management")
            if st.button("ποΈ Clear Chat History", type="secondary", use_container_width=True):
                clear_chat_history()

    # --- Main area: extraction results (left) and AI chat (right) ---
    col1, col2 = st.columns([1, 1])

    with col1:
        st.markdown("### π Extraction Results")

        if st.session_state.processing:
            st.info("π Processing LinkedIn data...")
        elif st.session_state.extracted_data:
            data = st.session_state.extracted_data
            page_info = data['page_info']
            content_blocks = data['content_blocks']

            # FIX: this string literal was split across two physical lines
            # in the original (a SyntaxError); rejoined.
            st.success("β Extraction Complete")

            display_metrics(data)

            st.markdown("#### π·οΈ Page Information")
            st.write(f"**Title:** {page_info['title']}")
            st.write(f"**URL:** {page_info['url']}")
            st.write(f"**Data Type:** {data['data_type'].title()}")
            st.write(f"**Content Blocks:** {len(content_blocks)}")
            st.write(f"**Extraction Time:** {data['extraction_time']}")

            # Preview only the first three blocks to keep the page compact.
            st.markdown("#### π Sample Content")
            for i, block in enumerate(content_blocks[:3]):
                with st.expander(f"Content Block {i+1} ({len(block.split())} words)"):
                    st.write(block)

            if len(content_blocks) > 3:
                st.info(f"π And {len(content_blocks) - 3} more content blocks...")
        else:
            st.info("""
            π **Welcome to LinkedIn AI Analyzer!**

            **Powered by Mistral 7B AI**

            **To get started:**
            1. Select content type
            2. Enter a LinkedIn URL or click a suggested company
            3. Click "Extract & Analyze"
            4. Chat with AI about the extracted content

            **Supported URLs:**
            - π€ Public Profiles
            - π’ Company Pages
            - π Public Posts

            **AI Features:**
            - Smart content analysis
            - Conversational chat
            - Data insights
            - Content summarization
            """)

    with col2:
        st.markdown("### π¬ AI Chat Analysis")

        if st.session_state.chatbot and st.session_state.extracted_data:
            # Replay the conversation so far.
            for chat in st.session_state.chat_history:
                if chat["role"] == "user":
                    st.markdown(f"**π€ You:** {chat['content']}")
                elif chat["role"] == "assistant":
                    st.markdown(f"**π€ AI:** {chat['content']}")

            user_input = st.chat_input("Ask about the LinkedIn data...")

            if user_input:
                st.session_state.chat_history.append({"role": "user", "content": user_input})

                with st.spinner("π€ Mistral AI is analyzing..."):
                    try:
                        response = st.session_state.chatbot.invoke({"question": user_input})
                        answer = response.get("answer", "I couldn't generate a response based on the available data.")
                        st.session_state.chat_history.append({"role": "assistant", "content": answer})
                        st.rerun()
                    except Exception as e:
                        # Surface the failure inside the transcript so the
                        # user sees it after the rerun.
                        error_msg = f"β Error generating response: {str(e)}"
                        st.session_state.chat_history.append({"role": "assistant", "content": error_msg})
                        st.rerun()

            # Offer starter questions until the first exchange happens.
            if not st.session_state.chat_history:
                st.markdown("#### π‘ Suggested Questions")
                suggestions = [
                    "Summarize the main information from this page",
                    "What are the key highlights or achievements?",
                    "Analyze the business or professional focus",
                    "What insights can you extract from this content?",
                    "Provide a comprehensive overview"
                ]
                for suggestion in suggestions:
                    if st.button(suggestion, key=f"suggest_{suggestion}", use_container_width=True):
                        st.info(f"π‘ Try asking: '{suggestion}'")
        elif st.session_state.extracted_data:
            st.info("π¬ Start a conversation with the AI assistant")
        else:
            st.info("π Extract LinkedIn data to enable AI analysis")

    # --- Footer: feature highlights ---
    st.markdown("---")
    st.markdown("### π Powered by Mistral 7B AI")

    feature_cols = st.columns(3)
    with feature_cols[0]:
        st.markdown("""
        **π€ Advanced AI**
        - Mistral 7B Instruct model
        - Intelligent text analysis
        - Contextual understanding
        """)
    with feature_cols[1]:
        st.markdown("""
        **π¬ Smart Chat**
        - Conversational memory
        - Relevant responses
        - Data-driven insights
        """)
    with feature_cols[2]:
        st.markdown("""
        **π Deep Analysis**
        - Content summarization
        - Pattern recognition
        - Professional insights
        """)
|
|
|
|
|
# Script entry point: run the Streamlit page when executed directly.
if __name__ == "__main__":
    main()