# pages/linkedin_extractor.py import streamlit as st import requests from bs4 import BeautifulSoup from langchain_text_splitters import CharacterTextSplitter from langchain_community.embeddings import HuggingFaceEmbeddings from langchain_community.vectorstores import FAISS from langchain.memory import ConversationBufferMemory from langchain.chains import ConversationalRetrievalChain from langchain_core.documents import Document from langchain_community.llms import HuggingFaceHub import re import time import os st.set_page_config( page_title="LinkedIn AI Analyzer", page_icon="💼", layout="wide" ) def get_embeddings(): """Initialize embeddings with multiple fallback options""" try: # Try multiple embedding models model_options = [ "sentence-transformers/all-MiniLM-L6-v2", # Default "sentence-transformers/paraphrase-albert-small-v2", # Smaller alternative "sentence-transformers/all-mpnet-base-v2" # Higher quality ] for model_name in model_options: try: embeddings = HuggingFaceEmbeddings( model_name=model_name, model_kwargs={'device': 'cpu'}, encode_kwargs={'normalize_embeddings': True} ) st.success(f"✅ Loaded embeddings: {model_name.split('/')[-1]}") return embeddings except Exception as e: continue st.error("❌ All embedding models failed to load") return None except Exception as e: st.error(f"❌ Embeddings error: {e}") return None def get_llm(): """Initialize Mistral 7B LLM - Best for analysis""" try: api_key = os.getenv('HUGGINGFACEHUB_API_TOKEN') if not api_key: st.error(""" ❌ HuggingFace API Key not found! Please add your API key: 1. Go to Space Settings → Variables and Secrets 2. Add: HUGGINGFACEHUB_API_TOKEN = "your_hf_token_here" 3. Restart the Space Get free API key: https://huggingface.co/settings/tokens """) return None # Using Mistral 7B - Best balance of quality and accessibility llm = HuggingFaceHub( repo_id="mistralai/Mistral-7B-Instruct-v0.1", huggingfacehub_api_token=api_key, model_kwargs={ "temperature": 0.7, "max_length": 2048, "max_new_tokens": 512, "top_p": 0.95, "repetition_penalty": 1.1, "do_sample": True } ) return llm except Exception as e: st.error(f"❌ AI Model error: {e}") return None def extract_linkedin_data(url, data_type): """Extract data from LinkedIn URLs""" try: headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', 'Accept-Language': 'en-US,en;q=0.5', 'Accept-Encoding': 'gzip, deflate, br', 'DNT': '1', 'Connection': 'keep-alive', 'Upgrade-Insecure-Requests': '1', } st.info(f"🌐 Accessing: {url}") response = requests.get(url, headers=headers, timeout=25) if response.status_code != 200: return { "error": f"Failed to access page (Status: {response.status_code})", "status": "error" } soup = BeautifulSoup(response.text, 'html.parser') # Remove scripts and styles for script in soup(["script", "style", "meta", "link", "nav", "header", "footer"]): script.decompose() # Extract and clean text text = soup.get_text() lines = (line.strip() for line in text.splitlines()) chunks = (phrase.strip() for line in lines for phrase in line.split(" ")) clean_text = ' '.join(chunk for chunk in chunks if chunk) # Extract meaningful content paragraphs = [p.strip() for p in clean_text.split('.') if len(p.strip()) > 40] if not paragraphs: return { "error": "No meaningful content found. The page might require login or have restricted access.", "status": "error" } # Extract page title title = soup.find('title') page_title = title.text.strip() if title else "LinkedIn Page" # Structure the extracted data extracted_data = { "page_info": { "title": page_title, "url": url, "response_code": response.status_code, "content_length": len(clean_text) }, "content_blocks": paragraphs, "extraction_time": time.strftime('%Y-%m-%d %H:%M:%S'), "data_type": data_type, "status": "success" } return extracted_data except requests.exceptions.Timeout: return {"error": "Request timed out. Please try again.", "status": "error"} except requests.exceptions.ConnectionError: return {"error": "Connection failed. Please check the URL and try again.", "status": "error"} except Exception as e: return {"error": f"Extraction error: {str(e)}", "status": "error"} def process_extracted_data(extracted_data): """Process extracted data for AI analysis""" if not extracted_data or extracted_data.get("status") != "success": return None, [] page_info = extracted_data['page_info'] content_blocks = extracted_data['content_blocks'] # Structure the data for AI all_text = f"LINKEDIN DATA ANALYSIS REPORT\n" all_text += "=" * 70 + "\n\n" all_text += f"📄 PAGE INFORMATION:\n" all_text += f"Title: {page_info['title']}\n" all_text += f"URL: {page_info['url']}\n" all_text += f"Type: {extracted_data['data_type'].upper()}\n" all_text += f"Extracted: {extracted_data['extraction_time']}\n" all_text += f"Response Code: {page_info['response_code']}\n" all_text += f"Content Length: {page_info['content_length']} characters\n\n" all_text += f"📊 CONTENT ANALYSIS:\n" all_text += f"Total Content Blocks: {len(content_blocks)}\n\n" # Add content blocks for i, block in enumerate(content_blocks[:20]): all_text += f"--- CONTENT BLOCK {i+1} ---\n" all_text += f"Words: {len(block.split())} | Characters: {len(block)}\n" all_text += f"Content: {block}\n\n" all_text += "=" * 70 + "\n" all_text += "END OF EXTRACTION REPORT" # Split into chunks splitter = CharacterTextSplitter( separator="\n", chunk_size=1000, chunk_overlap=200, length_function=len ) chunks = splitter.split_text(all_text) documents = [Document(page_content=chunk) for chunk in chunks] # Create vector store try: embeddings = get_embeddings() if embeddings is None: return None, [] vectorstore = FAISS.from_documents(documents, embeddings) return vectorstore, chunks except Exception as e: st.error(f"Vector store creation failed: {e}") return None, [] def create_chatbot(vectorstore): """Create conversational chatbot with Mistral""" try: llm = get_llm() if llm is None: return None memory = ConversationBufferMemory( memory_key="chat_history", return_messages=True, output_key="answer" ) chain = ConversationalRetrievalChain.from_llm( llm=llm, retriever=vectorstore.as_retriever(search_kwargs={"k": 4}), memory=memory, return_source_documents=True, output_key="answer" ) return chain except Exception as e: st.error(f"Failed to create chatbot: {str(e)}") return None def clear_chat_history(): """Clear chat history while keeping extracted data""" if "vectorstore" in st.session_state and st.session_state.vectorstore: st.session_state.chatbot = create_chatbot(st.session_state.vectorstore) st.session_state.chat_history = [] st.success("🔄 Chat history cleared! Starting fresh conversation.") def display_metrics(extracted_data): """Display extraction metrics""" if not extracted_data: return page_info = extracted_data['page_info'] content_blocks = extracted_data['content_blocks'] col1, col2, col3, col4 = st.columns(4) with col1: st.metric("Content Blocks", len(content_blocks)) with col2: total_words = sum(len(block.split()) for block in content_blocks) st.metric("Total Words", total_words) with col3: st.metric("Characters", f"{page_info['content_length']:,}") with col4: st.metric("Response Code", page_info['response_code']) def main(): st.title("💼 LinkedIn AI Analyzer") if st.button("← Back to Main Dashboard"): st.switch_page("app.py") # Initialize session state if "extracted_data" not in st.session_state: st.session_state.extracted_data = None if "vectorstore" not in st.session_state: st.session_state.vectorstore = None if "chatbot" not in st.session_state: st.session_state.chatbot = None if "chat_history" not in st.session_state: st.session_state.chat_history = [] if "processing" not in st.session_state: st.session_state.processing = False if "current_url" not in st.session_state: st.session_state.current_url = "" # Sidebar with st.sidebar: st.markdown("### ⚙️ Configuration") # Data type selection data_type = st.selectbox( "📊 Content Type", ["profile", "company", "post"], help="Select the type of LinkedIn content" ) # URL input url_placeholder = { "profile": "https://www.linkedin.com/in/username/", "company": "https://www.linkedin.com/company/companyname/", "post": "https://www.linkedin.com/posts/username_postid/" } linkedin_url = st.text_input( "🌐 LinkedIn URL", placeholder=url_placeholder[data_type], help="Enter a public LinkedIn URL" ) # Suggested URLs st.markdown("### 🚀 Quick Test") suggested_urls = { "Microsoft": "https://www.linkedin.com/company/microsoft/", "Google": "https://www.linkedin.com/company/google/", "Apple": "https://www.linkedin.com/company/apple/", "Amazon": "https://www.linkedin.com/company/amazon/" } for name, url in suggested_urls.items(): if st.button(f"🏢 {name}", key=name, use_container_width=True): st.session_state.current_url = url st.rerun() # Extract button if st.button("🚀 Extract & Analyze", type="primary", use_container_width=True): url_to_use = linkedin_url.strip() or st.session_state.current_url if not url_to_use: st.warning("⚠️ Please enter a LinkedIn URL") elif not url_to_use.startswith('https://www.linkedin.com/'): st.error("❌ Please enter a valid LinkedIn URL") else: st.session_state.processing = True with st.spinner("🔄 Extracting and analyzing data..."): extracted_data = extract_linkedin_data(url_to_use, data_type) if extracted_data.get("status") == "success": st.session_state.extracted_data = extracted_data st.session_state.current_url = url_to_use # Process for AI vectorstore, chunks = process_extracted_data(extracted_data) if vectorstore: st.session_state.vectorstore = vectorstore st.session_state.chatbot = create_chatbot(vectorstore) st.session_state.chat_history = [] st.success(f"✅ Successfully processed {len(chunks)} content chunks!") st.balloons() else: st.error("❌ Failed to process data for AI analysis") else: error_msg = extracted_data.get("error", "Unknown error occurred") st.error(f"❌ Extraction failed: {error_msg}") st.session_state.processing = False # Chat management if st.session_state.chatbot and st.session_state.extracted_data: st.markdown("---") st.subheader("💬 Chat Management") if st.button("🗑️ Clear Chat History", type="secondary", use_container_width=True): clear_chat_history() # Main content area col1, col2 = st.columns([1, 1]) with col1: st.markdown("### 📊 Extraction Results") if st.session_state.processing: st.info("🔄 Processing LinkedIn data...") elif st.session_state.extracted_data: data = st.session_state.extracted_data page_info = data['page_info'] content_blocks = data['content_blocks'] st.success("✅ Extraction Complete") # Display metrics display_metrics(data) # Display page info st.markdown("#### 🏷️ Page Information") st.write(f"**Title:** {page_info['title']}") st.write(f"**URL:** {page_info['url']}") st.write(f"**Data Type:** {data['data_type'].title()}") st.write(f"**Content Blocks:** {len(content_blocks)}") st.write(f"**Extraction Time:** {data['extraction_time']}") # Display sample content st.markdown("#### 📝 Sample Content") for i, block in enumerate(content_blocks[:3]): with st.expander(f"Content Block {i+1} ({len(block.split())} words)"): st.write(block) if len(content_blocks) > 3: st.info(f"📄 And {len(content_blocks) - 3} more content blocks...") else: st.info(""" 👋 **Welcome to LinkedIn AI Analyzer!** **Powered by Mistral 7B AI** **To get started:** 1. Select content type 2. Enter a LinkedIn URL or click a suggested company 3. Click "Extract & Analyze" 4. Chat with AI about the extracted content **Supported URLs:** - 👤 Public Profiles - 🏢 Company Pages - 📝 Public Posts **AI Features:** - Smart content analysis - Conversational chat - Data insights - Content summarization """) with col2: st.markdown("### 💬 AI Chat Analysis") if st.session_state.chatbot and st.session_state.extracted_data: # Display chat history for i, chat in enumerate(st.session_state.chat_history): if chat["role"] == "user": st.markdown(f"**👤 You:** {chat['content']}") elif chat["role"] == "assistant": st.markdown(f"**🤖 AI:** {chat['content']}") # Chat input user_input = st.chat_input("Ask about the LinkedIn data...") if user_input: # Add user message st.session_state.chat_history.append({"role": "user", "content": user_input}) # Generate AI response with st.spinner("🤔 Mistral AI is analyzing..."): try: response = st.session_state.chatbot.invoke({"question": user_input}) answer = response.get("answer", "I couldn't generate a response based on the available data.") st.session_state.chat_history.append({"role": "assistant", "content": answer}) st.rerun() except Exception as e: error_msg = f"❌ Error generating response: {str(e)}" st.session_state.chat_history.append({"role": "assistant", "content": error_msg}) st.rerun() # Suggested questions if not st.session_state.chat_history: st.markdown("#### 💡 Suggested Questions") suggestions = [ "Summarize the main information from this page", "What are the key highlights or achievements?", "Analyze the business or professional focus", "What insights can you extract from this content?", "Provide a comprehensive overview" ] for suggestion in suggestions: if st.button(suggestion, key=f"suggest_{suggestion}", use_container_width=True): st.info(f"💡 Try asking: '{suggestion}'") elif st.session_state.extracted_data: st.info("💬 Start a conversation with the AI assistant") else: st.info("🔍 Extract LinkedIn data to enable AI analysis") # Features section st.markdown("---") st.markdown("### 🚀 Powered by Mistral 7B AI") feature_cols = st.columns(3) with feature_cols[0]: st.markdown(""" **🤖 Advanced AI** - Mistral 7B Instruct model - Intelligent text analysis - Contextual understanding """) with feature_cols[1]: st.markdown(""" **💬 Smart Chat** - Conversational memory - Relevant responses - Data-driven insights """) with feature_cols[2]: st.markdown(""" **🔍 Deep Analysis** - Content summarization - Pattern recognition - Professional insights """) if __name__ == "__main__": main()