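"""LinkedIn AI Analyzer (Streamlit page).

Scrapes the visible text of a public LinkedIn URL, splits it into chunks,
indexes them in FAISS, and answers questions about the content through a
ConversationalRetrievalChain backed by a HuggingFace Hub model.
"""
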
import streamlit as st
import requests
from bs4 import BeautifulSoup
from langchain_text_splitters import CharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from langchain_core.documents import Document
from langchain_community.llms import HuggingFaceHub
import os

st.set_page_config(
    page_title="LinkedIn AI Analyzer",
    page_icon="πŸ’Ό",
    layout="wide"
)

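# Embeddings run locally via sentence-transformers; the model is downloaded
# on first use. Returns None (with a UI error) if loading fails.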
def get_embeddings():
    try:
        embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
        return embeddings
    except Exception as e:
        st.error(f"❌ Failed to load embeddings: {e}")
        return None

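# The LLM is hosted on the HuggingFace Hub and requires HUGGINGFACEHUB_API_TOKEN
# to be set in the environment. Returns None (with a UI error) on failure.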
def get_llm():
    try:
        api_key = os.getenv('HUGGINGFACEHUB_API_TOKEN')
        if not api_key:
            st.error("❌ HuggingFace API Key not found in environment variables")
            return None
        
        llm = HuggingFaceHub(
            repo_id="google/flan-t5-large",
            huggingfacehub_api_token=api_key,
            model_kwargs={"temperature": 0.7, "max_length": 500}
        )
        return llm
    except Exception as e:
        st.error(f"❌ HuggingFace error: {e}")
        return None

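# Fetch a LinkedIn page and reduce its visible text to numbered content
# blocks. The data_type argument is currently unused; it is kept so
# type-specific parsing can be added later.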
def extract_linkedin_data(url, data_type):
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }
        
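        # LinkedIn heavily rate-limits anonymous requests; public pages often
        # come back as a login wall or a non-200 status (commonly 999).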
        response = requests.get(url, headers=headers, timeout=15)
        if response.status_code != 200:
            return f"❌ Failed to access page (Status: {response.status_code})"
        
        soup = BeautifulSoup(response.text, 'html.parser')
        for script in soup(["script", "style"]):
            script.decompose()
        
        text = soup.get_text()
        lines = (line.strip() for line in text.splitlines())
        chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
        text = ' '.join(chunk for chunk in chunks if chunk)
        
        paragraphs = text.split('.')
        meaningful_content = [p.strip() for p in paragraphs if len(p.strip()) > 50]
        
        if not meaningful_content:
            return "❌ No meaningful content found."
        
        result = f"πŸ”— URL: {url}\n"
        result += "="*50 + "\n\n"
        
        for i, content in enumerate(meaningful_content[:10], 1):
            result += f"{i}. {content}\n\n"
        
        result += "="*50 + "\n"
        result += f"βœ… Extracted {len(meaningful_content)} content blocks\n"
        
        return result
        
    except Exception as e:
        return f"❌ Error: {str(e)}"

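# Split extracted text into overlapping ~1000-character chunks for retrieval.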
def get_text_chunks(text):
    if not text.strip():
        return []
    splitter = CharacterTextSplitter(separator="\n", chunk_size=1000, chunk_overlap=200)
    return splitter.split_text(text)

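# Wrap chunks as Documents and index them in an in-memory FAISS store.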
def get_vectorstore(text_chunks):
    if not text_chunks:
        return None
    documents = [Document(page_content=chunk) for chunk in text_chunks]
    embeddings = get_embeddings()
    if embeddings is None:
        return None
    vectorstore = FAISS.from_documents(documents, embeddings)
    return vectorstore

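# Wire the LLM, retriever, and chat memory into a retrieval-augmented
# conversation chain that answers from the top-3 matching chunks.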
def get_conversation_chain(vectorstore):
    if vectorstore is None:
        return None
    try:
        llm = get_llm()
        if llm is None:
            return None
        
        # With return_source_documents=True the chain returns two output keys
        # ("answer" and "source_documents"); the memory needs output_key set
        # so it knows which one to store, otherwise saving raises a ValueError.
        memory = ConversationBufferMemory(
            memory_key="chat_history",
            return_messages=True,
            output_key="answer",
        )
        chain = ConversationalRetrievalChain.from_llm(
            llm=llm,
            retriever=vectorstore.as_retriever(search_kwargs={"k": 3}),
            memory=memory,
            return_source_documents=True
        )
        return chain
    except Exception as e:
        st.error(f"❌ Error: {e}")
        return None

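# Page layout: the sidebar drives extraction and indexing; the main area
# shows the chat on the left and extraction metrics on the right.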
def main():
    st.title("πŸ’Ό LinkedIn AI Analyzer")
    
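    # Back-navigation assumes this file lives under pages/ of a multipage
    # app whose entry point is app.py.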
    if st.button("← Back to Main Dashboard"):
        st.switch_page("app.py")
    
    # Initialize session state
    if "conversation" not in st.session_state:
        st.session_state.conversation = None
    if "chat_history" not in st.session_state:
        st.session_state.chat_history = []
    if "processed" not in st.session_state:
        st.session_state.processed = False
    if "extracted_data" not in st.session_state:
        st.session_state.extracted_data = ""
    
    # Sidebar
    with st.sidebar:
        data_type = st.selectbox("πŸ“Š Content Type", ["profile", "company", "post"])
        
        url_placeholder = {
            "profile": "https://www.linkedin.com/in/username/",
            "company": "https://www.linkedin.com/company/companyname/", 
            "post": "https://www.linkedin.com/posts/username_postid/"
        }
        
        linkedin_url = st.text_input("🌐 LinkedIn URL", placeholder=url_placeholder[data_type])
        
        if st.button("πŸš€ Extract & Analyze", type="primary"):
            if not linkedin_url.strip():
                st.warning("Please enter a LinkedIn URL")
            else:
                with st.spinner("πŸ”„ Extracting data..."):
                    extracted_data = extract_linkedin_data(linkedin_url, data_type)
                    
                    if extracted_data and not extracted_data.startswith("❌"):
                        chunks = get_text_chunks(extracted_data)
                        if chunks:
                            vectorstore = get_vectorstore(chunks)
                            conversation = get_conversation_chain(vectorstore)
                            if conversation:
                                st.session_state.conversation = conversation
                                st.session_state.processed = True
                                st.session_state.extracted_data = extracted_data
                                st.session_state.chat_history = []
                                st.success(f"βœ… Ready to analyze {len(chunks)} content chunks!")
                            else:
                                st.error("❌ Failed to initialize AI")
                        else:
                            st.error("❌ No content extracted")
                    else:
                        st.error(extracted_data)
    
    # Main content
    col1, col2 = st.columns([2, 1])
    
    with col1:
        st.markdown("### πŸ’¬ Chat")
        
        for i, chat in enumerate(st.session_state.chat_history):
            if chat["role"] == "user":
                st.markdown(f"**πŸ‘€ You:** {chat['content']}")
            elif chat["role"] == "assistant":
                if chat["content"]:
                    st.markdown(f"**πŸ€– Assistant:** {chat['content']}")
        
        if st.session_state.processed:
            user_input = st.chat_input("Ask about the LinkedIn data...")
            if user_input:
                st.session_state.chat_history.append({"role": "user", "content": user_input})
                with st.spinner("πŸ€” Analyzing..."):
                    try:
                        if st.session_state.conversation:
                            response = st.session_state.conversation.invoke({"question": user_input})
                            answer = response.get("answer", "No response generated.")
                            st.session_state.chat_history.append({"role": "assistant", "content": answer})
                            st.rerun()
                    except Exception as e:
                        st.session_state.chat_history.append({"role": "assistant", "content": f"❌ Error: {str(e)}"})
                        st.rerun()
        else:
            st.info("πŸ‘‹ Enter a LinkedIn URL and click 'Extract & Analyze' to start")
    
    with col2:
        if st.session_state.processed:
            st.markdown("### πŸ“Š Overview")
            data = st.session_state.extracted_data
            chunks = get_text_chunks(data)
            
            st.metric("Content Type", data_type.title())
            st.metric("Text Chunks", len(chunks))
            st.metric("Characters", f"{len(data):,}")

if __name__ == "__main__":
    main()