Refat81 committed
Commit f3a65d7 · verified
1 Parent(s): 89d85c0

Update pages/linkedin_extractor.py

Files changed (1)
  1. pages/linkedin_extractor.py +216 -0
pages/linkedin_extractor.py CHANGED
@@ -0,0 +1,216 @@
+ # pages/linkedin_extractor.py
+ import streamlit as st
+ import requests
+ from bs4 import BeautifulSoup
+ from langchain_text_splitters import CharacterTextSplitter
+ from langchain_community.embeddings import HuggingFaceEmbeddings
+ from langchain_community.vectorstores import FAISS
+ from langchain.memory import ConversationBufferMemory
+ from langchain.chains import ConversationalRetrievalChain
+ from langchain_core.documents import Document
+ from langchain_community.llms import HuggingFaceHub
+ import os
+
+ st.set_page_config(
+     page_title="LinkedIn AI Analyzer",
+     page_icon="💼",
+     layout="wide"
+ )
+
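+ # Embedding model: a small local sentence-transformers model,
+ # downloaded and cached on first use.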
+ def get_embeddings():
+     try:
+         embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
+         return embeddings
+     except Exception as e:
+         st.error(f"❌ Failed to load embeddings: {e}")
+         return None
+
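+ # LLM client: expects HUGGINGFACEHUB_API_TOKEN in the environment
+ # (e.g. set as a secret on the hosting platform).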
+ def get_llm():
+     try:
+         api_key = os.getenv('HUGGINGFACEHUB_API_TOKEN')
+         if not api_key:
+             st.error("❌ HuggingFace API Key not found in environment variables")
+             return None
+
+         llm = HuggingFaceHub(
+             repo_id="google/flan-t5-large",
+             huggingfacehub_api_token=api_key,
+             model_kwargs={"temperature": 0.7, "max_length": 500}
+         )
+         return llm
+     except Exception as e:
+         st.error(f"❌ HuggingFace error: {e}")
+         return None
+
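+ # Fetch a page and reduce it to readable text blocks. Note that LinkedIn
+ # gates most pages behind a login, so anonymous requests may come back
+ # non-200 and be reported as errors below.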
+ def extract_linkedin_data(url, data_type):
+     try:
+         headers = {
+             'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
+         }
+
+         response = requests.get(url, headers=headers, timeout=15)
+         if response.status_code != 200:
+             return f"❌ Failed to access page (Status: {response.status_code})"
+
+         soup = BeautifulSoup(response.text, 'html.parser')
+         for script in soup(["script", "style"]):
+             script.decompose()
+
+         text = soup.get_text()
+         lines = (line.strip() for line in text.splitlines())
+         chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
+         text = ' '.join(chunk for chunk in chunks if chunk)
+
+         paragraphs = text.split('.')
+         meaningful_content = [p.strip() for p in paragraphs if len(p.strip()) > 50]
+
+         if not meaningful_content:
+             return "❌ No meaningful content found."
+
+         result = f"🔗 URL: {url}\n"
+         result += "="*50 + "\n\n"
+
+         for i, content in enumerate(meaningful_content[:10], 1):
+             result += f"{i}. {content}\n\n"
+
+         result += "="*50 + "\n"
+         result += f"✅ Extracted {len(meaningful_content)} content blocks\n"
+
+         return result
+
+     except Exception as e:
+         return f"❌ Error: {str(e)}"
+
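+ # Split extracted text into overlapping chunks sized for retrieval.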
+ def get_text_chunks(text):
+     if not text.strip():
+         return []
+     splitter = CharacterTextSplitter(separator="\n", chunk_size=1000, chunk_overlap=200)
+     return splitter.split_text(text)
+
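+ # Build an in-memory FAISS index over the chunks (nothing is persisted).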
+ def get_vectorstore(text_chunks):
+     if not text_chunks:
+         return None
+     documents = [Document(page_content=chunk) for chunk in text_chunks]
+     embeddings = get_embeddings()
+     if embeddings is None:
+         return None
+     vectorstore = FAISS.from_documents(documents, embeddings)
+     return vectorstore
+
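+ # Wire LLM, retriever, and chat memory into a ConversationalRetrievalChain.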
+ def get_conversation_chain(vectorstore):
+     if vectorstore is None:
+         return None
+     try:
+         llm = get_llm()
+         if llm is None:
+             return None
+
+         # output_key is required here: with return_source_documents=True the
+         # chain returns multiple keys, and memory must know which one to save.
+         memory = ConversationBufferMemory(
+             memory_key="chat_history",
+             return_messages=True,
+             output_key="answer"
+         )
+         chain = ConversationalRetrievalChain.from_llm(
+             llm=llm,
+             retriever=vectorstore.as_retriever(search_kwargs={"k": 3}),
+             memory=memory,
+             return_source_documents=True
+         )
+         return chain
+     except Exception as e:
+         st.error(f"❌ Error: {e}")
+         return None
+
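+ # Page entry point: sidebar drives extraction, col1 hosts the chat,
+ # col2 shows summary metrics.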
+ def main():
+     st.title("💼 LinkedIn AI Analyzer")
+
+     if st.button("← Back to Main Dashboard"):
+         st.switch_page("app.py")
+
+     # Initialize session state
+     if "conversation" not in st.session_state:
+         st.session_state.conversation = None
+     if "chat_history" not in st.session_state:
+         st.session_state.chat_history = []
+     if "processed" not in st.session_state:
+         st.session_state.processed = False
+     if "extracted_data" not in st.session_state:
+         st.session_state.extracted_data = ""
+
+     # Sidebar
+     with st.sidebar:
+         data_type = st.selectbox("📊 Content Type", ["profile", "company", "post"])
+
+         url_placeholder = {
+             "profile": "https://www.linkedin.com/in/username/",
+             "company": "https://www.linkedin.com/company/companyname/",
+             "post": "https://www.linkedin.com/posts/username_postid/"
+         }
+
+         linkedin_url = st.text_input("🌐 LinkedIn URL", placeholder=url_placeholder[data_type])
+
+         if st.button("🚀 Extract & Analyze", type="primary"):
+             if not linkedin_url.strip():
+                 st.warning("Please enter a LinkedIn URL")
+             else:
+                 with st.spinner("🔄 Extracting data..."):
+                     extracted_data = extract_linkedin_data(linkedin_url, data_type)
+
+                     if extracted_data and not extracted_data.startswith("❌"):
+                         chunks = get_text_chunks(extracted_data)
+                         if chunks:
+                             vectorstore = get_vectorstore(chunks)
+                             conversation = get_conversation_chain(vectorstore)
+                             if conversation:
+                                 st.session_state.conversation = conversation
+                                 st.session_state.processed = True
+                                 st.session_state.extracted_data = extracted_data
+                                 st.session_state.chat_history = []
+                                 st.success(f"✅ Ready to analyze {len(chunks)} content chunks!")
+                             else:
+                                 st.error("❌ Failed to initialize AI")
+                         else:
+                             st.error("❌ No content extracted")
+                     else:
+                         st.error(extracted_data)
+
+     # Main content
+     col1, col2 = st.columns([2, 1])
+
+     with col1:
+         st.markdown("### 💬 Chat")
+
+         for chat in st.session_state.chat_history:
+             if chat["role"] == "user":
+                 st.markdown(f"**👤 You:** {chat['content']}")
+             elif chat["role"] == "assistant":
+                 if chat["content"]:
+                     st.markdown(f"**🤖 Assistant:** {chat['content']}")
+
+         if st.session_state.processed:
+             user_input = st.chat_input("Ask about the LinkedIn data...")
+             if user_input:
+                 st.session_state.chat_history.append({"role": "user", "content": user_input})
+                 with st.spinner("🤔 Analyzing..."):
+                     try:
+                         if st.session_state.conversation:
+                             response = st.session_state.conversation.invoke({"question": user_input})
+                             answer = response.get("answer", "No response generated.")
+                             st.session_state.chat_history.append({"role": "assistant", "content": answer})
+                             st.rerun()
+                     except Exception as e:
+                         st.session_state.chat_history.append({"role": "assistant", "content": f"❌ Error: {str(e)}"})
+                         st.rerun()
+         else:
+             st.info("👋 Enter a LinkedIn URL and click 'Extract & Analyze' to start")
+
+     with col2:
+         if st.session_state.processed:
+             st.markdown("### 📊 Overview")
+             data = st.session_state.extracted_data
+             chunks = get_text_chunks(data)
+
+             st.metric("Content Type", data_type.title())
+             st.metric("Text Chunks", len(chunks))
+             st.metric("Characters", f"{len(data):,}")
+
+ if __name__ == "__main__":
+     main()
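
For quick local testing, here is a minimal sketch that drives the same pipeline outside Streamlit. Assumptions (not part of this commit): it runs from the repo root, the dependencies above are installed, HUGGINGFACEHUB_API_TOKEN is exported, and a hard-coded string stands in for real extract_linkedin_data() output.

# sketch.py -- hypothetical test driver, not part of this commit
# Importing the page module triggers st.set_page_config, which only
# warns when executed outside `streamlit run`.
from pages.linkedin_extractor import get_text_chunks, get_vectorstore, get_conversation_chain

sample = ("Jane Doe leads data engineering at Example Corp and has a "
          "decade of experience with streaming pipelines. ") * 5
chunks = get_text_chunks(sample)             # one or more ~1000-char chunks
vectorstore = get_vectorstore(chunks)        # embeds chunks, builds a FAISS index
chain = get_conversation_chain(vectorstore)  # None if the API token is missing
if chain:
    result = chain.invoke({"question": "What does this person do?"})
    print(result["answer"])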