Refat81 committed
Commit 89d85c0 · verified · Parent: b1d820a

Update pages/linkedin_extractor.py

Files changed (1)
  1. pages/linkedin_extractor.py +0 -216
pages/linkedin_extractor.py CHANGED
@@ -1,216 +0,0 @@
- # pages/linkedin_extractor.py
- import streamlit as st
- import requests
- from bs4 import BeautifulSoup
- from langchain_text_splitters import CharacterTextSplitter
- from langchain_community.embeddings import HuggingFaceEmbeddings
- from langchain_community.vectorstores import FAISS
- from langchain.memory import ConversationBufferMemory
- from langchain.chains import ConversationalRetrievalChain
- from langchain_core.documents import Document
- from langchain_community.llms import HuggingFaceHub
- import re
- import time
- import os
-
- st.set_page_config(
-     page_title="LinkedIn AI Analyzer",
-     page_icon="💼",
-     layout="wide"
- )
-
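- # NOTE: the MiniLM model below is downloaded from the Hugging Face Hub on
- # first use and served from the local cache afterwards.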
- def get_embeddings():
-     try:
-         embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
-         return embeddings
-     except Exception as e:
-         st.error(f"❌ Failed to load embeddings: {e}")
-         return None
-
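- # NOTE: langchain_community's HuggingFaceHub wrapper is deprecated upstream in
- # favor of HuggingFaceEndpoint, but it still runs with a valid API token.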
- def get_llm():
-     try:
-         api_key = os.getenv('HUGGINGFACEHUB_API_TOKEN')
-         if not api_key:
-             st.error("❌ HuggingFace API Key not found in environment variables")
-             return None
-
-         llm = HuggingFaceHub(
-             repo_id="google/flan-t5-large",
-             huggingfacehub_api_token=api_key,
-             model_kwargs={"temperature": 0.7, "max_length": 500}
-         )
-         return llm
-     except Exception as e:
-         st.error(f"❌ HuggingFace error: {e}")
-         return None
-
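- # NOTE: LinkedIn serves most pages behind a login wall, so an anonymous
- # requests.get() often receives a redirect, a 999 status, or a stripped-down
- # page; the status-code check below surfaces that case to the user.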
- def extract_linkedin_data(url, data_type):
-     # data_type is accepted for symmetry with the UI, but the same generic
-     # extraction currently runs for profiles, companies, and posts alike.
-     try:
-         headers = {
-             'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
-         }
-
-         response = requests.get(url, headers=headers, timeout=15)
-         if response.status_code != 200:
-             return f"❌ Failed to access page (Status: {response.status_code})"
-
-         soup = BeautifulSoup(response.text, 'html.parser')
-         for script in soup(["script", "style"]):
-             script.decompose()
-
-         # Collapse the visible text into single-spaced prose.
-         text = soup.get_text()
-         lines = (line.strip() for line in text.splitlines())
-         chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
-         text = ' '.join(chunk for chunk in chunks if chunk)
-
-         # Keep only sentence-sized fragments (> 50 chars) as meaningful content.
-         paragraphs = text.split('.')
-         meaningful_content = [p.strip() for p in paragraphs if len(p.strip()) > 50]
-
-         if not meaningful_content:
-             return "❌ No meaningful content found."
-
-         result = f"🔗 URL: {url}\n"
-         result += "="*50 + "\n\n"
-
-         for i, content in enumerate(meaningful_content[:10], 1):
-             result += f"{i}. {content}\n\n"
-
-         result += "="*50 + "\n"
-         result += f"✅ Extracted {len(meaningful_content)} content blocks\n"
-
-         return result
-
-     except Exception as e:
-         return f"❌ Error: {str(e)}"
-
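- # Chunks overlap by 200 characters so text that straddles a chunk boundary
- # still appears intact in at least one chunk.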
- def get_text_chunks(text):
-     if not text.strip():
-         return []
-     splitter = CharacterTextSplitter(separator="\n", chunk_size=1000, chunk_overlap=200)
-     return splitter.split_text(text)
-
- def get_vectorstore(text_chunks):
-     if not text_chunks:
-         return None
-     documents = [Document(page_content=chunk) for chunk in text_chunks]
-     embeddings = get_embeddings()
-     if embeddings is None:
-         return None
-     vectorstore = FAISS.from_documents(documents, embeddings)
-     return vectorstore
-
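- # The chain condenses each follow-up question using the stored chat history,
- # then answers from the top-3 retrieved chunks.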
- def get_conversation_chain(vectorstore):
-     if vectorstore is None:
-         return None
-     try:
-         llm = get_llm()
-         if llm is None:
-             return None
-
-         # output_key tells the memory which of the chain's outputs to store;
-         # without it, return_source_documents=True makes the chain emit two
-         # output keys and saving to memory raises a ValueError.
-         memory = ConversationBufferMemory(
-             memory_key="chat_history",
-             return_messages=True,
-             output_key="answer"
-         )
-         chain = ConversationalRetrievalChain.from_llm(
-             llm=llm,
-             retriever=vectorstore.as_retriever(search_kwargs={"k": 3}),
-             memory=memory,
-             return_source_documents=True
-         )
-         return chain
-     except Exception as e:
-         st.error(f"❌ Error: {e}")
-         return None
-
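- # Page flow: extract raw text from the URL, split it into chunks, embed the
- # chunks into a FAISS index, then chat over that index with the chain above.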
- def main():
-     st.title("💼 LinkedIn AI Analyzer")
-
-     if st.button("← Back to Main Dashboard"):
-         st.switch_page("app.py")
-
-     # Initialize session state
-     if "conversation" not in st.session_state:
-         st.session_state.conversation = None
-     if "chat_history" not in st.session_state:
-         st.session_state.chat_history = []
-     if "processed" not in st.session_state:
-         st.session_state.processed = False
-     if "extracted_data" not in st.session_state:
-         st.session_state.extracted_data = ""
-
-     # Sidebar
-     with st.sidebar:
-         data_type = st.selectbox("📊 Content Type", ["profile", "company", "post"])
-
-         url_placeholder = {
-             "profile": "https://www.linkedin.com/in/username/",
-             "company": "https://www.linkedin.com/company/companyname/",
-             "post": "https://www.linkedin.com/posts/username_postid/"
-         }
-
-         linkedin_url = st.text_input("🌐 LinkedIn URL", placeholder=url_placeholder[data_type])
-
-         if st.button("🚀 Extract & Analyze", type="primary"):
-             if not linkedin_url.strip():
-                 st.warning("Please enter a LinkedIn URL")
-             else:
-                 with st.spinner("🔄 Extracting data..."):
-                     extracted_data = extract_linkedin_data(linkedin_url, data_type)
-
-                 if extracted_data and not extracted_data.startswith("❌"):
-                     chunks = get_text_chunks(extracted_data)
-                     if chunks:
-                         vectorstore = get_vectorstore(chunks)
-                         conversation = get_conversation_chain(vectorstore)
-                         if conversation:
-                             st.session_state.conversation = conversation
-                             st.session_state.processed = True
-                             st.session_state.extracted_data = extracted_data
-                             st.session_state.chat_history = []
-                             st.success(f"✅ Ready to analyze {len(chunks)} content chunks!")
-                         else:
-                             st.error("❌ Failed to initialize AI")
-                     else:
-                         st.error("❌ No content extracted")
-                 else:
-                     st.error(extracted_data)
-
-     # Main content
-     col1, col2 = st.columns([2, 1])
-
-     with col1:
-         st.markdown("### 💬 Chat")
-
-         for chat in st.session_state.chat_history:
-             if chat["role"] == "user":
-                 st.markdown(f"**👤 You:** {chat['content']}")
-             elif chat["role"] == "assistant":
-                 if chat["content"]:
-                     st.markdown(f"**🤖 Assistant:** {chat['content']}")
-
-         if st.session_state.processed:
-             user_input = st.chat_input("Ask about the LinkedIn data...")
-             if user_input:
-                 st.session_state.chat_history.append({"role": "user", "content": user_input})
-                 with st.spinner("🤔 Analyzing..."):
-                     try:
-                         if st.session_state.conversation:
-                             response = st.session_state.conversation.invoke({"question": user_input})
-                             answer = response.get("answer", "No response generated.")
-                             st.session_state.chat_history.append({"role": "assistant", "content": answer})
-                     except Exception as e:
-                         st.session_state.chat_history.append({"role": "assistant", "content": f"❌ Error: {str(e)}"})
-                 # st.rerun() raises an internal control-flow exception, so it must
-                 # sit outside the try/except above or the rerun itself would be
-                 # caught and reported as an error.
-                 st.rerun()
-         else:
-             st.info("👋 Enter a LinkedIn URL and click 'Extract & Analyze' to start")
-
-     with col2:
-         if st.session_state.processed:
-             st.markdown("### 📊 Overview")
-             data = st.session_state.extracted_data
-             chunks = get_text_chunks(data)
-
-             st.metric("Content Type", data_type.title())
-             st.metric("Text Chunks", len(chunks))
-             st.metric("Characters", f"{len(data):,}")
-
- if __name__ == "__main__":
-     main()