Daksh0505 committed on
Commit 5765f2e · verified · 1 Parent(s): deca23d

Update app.py

Files changed (1):
  1. app.py +280 -86
app.py CHANGED
@@ -1,116 +1,310 @@
--- app.py (before change)
  # ---------------- Imports ----------------
- import streamlit as st
  import torch
  import numpy as np
  import tensorflow as tf
  import tensorflow_hub as hub
  from transformers import BertTokenizer, BertModel
  import pandas as pd
  import io

- # ---------------- Models ----------------
  model_options = {
      "BERT Large Uncased": "bert-large-uncased",
      "BERT Large Cased": "bert-large-cased",
      "BERT Base Uncased": "bert-base-uncased",
-     "BERT Base Cased": "bert-base-cased",
-     "ELMo": "elmo"
  }

- # Load ELMo once
  elmo = hub.KerasLayer("https://tfhub.dev/google/elmo/3", trainable=False)

- # Globals
  current_bert_model = None
  current_tokenizer = None

- # ---------------- Functions ----------------
  def load_bert_model(model_name):
-     """Load HuggingFace BERT model."""
      global current_bert_model, current_tokenizer
-     current_tokenizer = BertTokenizer.from_pretrained(model_name)
-     current_bert_model = BertModel.from_pretrained(model_name)
-     current_bert_model.eval()
-     return f"✅ Loaded {model_name}"
-
- def get_embeddings(sentences, model_choice):
-     embeddings = []
-     if model_choice == "ELMo":
-         # ELMo embeddings
-         inputs = tf.convert_to_tensor(sentences, dtype=tf.string)
-         elmo_out = elmo(inputs)
-         elmo_emb = elmo_out['default'] if isinstance(elmo_out, dict) else elmo_out
-         embeddings = elmo_emb.numpy()
      else:
-         # BERT embeddings
-         if current_bert_model is None or current_tokenizer is None:
-             load_bert_model(model_options[model_choice])
-         inputs = current_tokenizer(sentences, return_tensors="pt", padding=True, truncation=True)
-         with torch.no_grad():
-             bert_out = current_bert_model(**inputs)
-         token_embeddings = bert_out.last_hidden_state
-         attention_mask = inputs['attention_mask'].unsqueeze(-1)
-         masked_embeddings = token_embeddings * attention_mask
-         bert_emb = (masked_embeddings.sum(1) / attention_mask.sum(1)).numpy()
-         embeddings = bert_emb
-     return embeddings
-
- def export_embeddings(sentences, embeddings):
-     """Return CSV file with sentences and embeddings"""
-     df = pd.DataFrame(embeddings)
-     df.insert(0, 'sentence', sentences)
-     csv_bytes = df.to_csv(index=False).encode('utf-8')
-     return io.BytesIO(csv_bytes)
-
- # ---------------- Streamlit App ----------------
- st.title("🤖 Sentence Embeddings Generator (BERT + ELMo)")
-
- # Upload CSV or add sentences
- uploaded_file = st.file_uploader("Upload CSV with 'sentence' column (optional)", type=["csv"])
- manual_input = st.text_area("Or enter sentences manually (one per line)")
-
- sentences = []
- if uploaded_file:
-     df = pd.read_csv(uploaded_file)
-     if 'sentence' not in df.columns:
-         st.error("CSV must have a column named 'sentence'")
      else:
-         sentences = df['sentence'].dropna().tolist()

- if manual_input:
-     manual_sentences = [s.strip() for s in manual_input.splitlines() if s.strip()]
-     sentences.extend(manual_sentences)

- if sentences:
-     st.success(f"✅ {len(sentences)} sentences ready for embedding generation")

-     # Model selection
-     model_choice = st.selectbox("Select model", list(model_options.keys()))

-     # Generate embeddings button
-     if st.button("Generate Embeddings"):
-         if not sentences:
-             st.warning("Please add sentences or upload CSV first.")
-         else:
-             with st.spinner("Generating embeddings..."):
-                 embeddings = get_embeddings(sentences, model_choice)
-             st.success(f"✅ Generated embeddings for {len(sentences)} sentences")
-
-             # Show shape of first few embeddings
-             st.write(f"Embedding shape: {embeddings.shape}")
-             st.write("First 2 sentences and their embeddings (truncated):")
-             for i, sent in enumerate(sentences[:2]):
-                 st.write(f"{i+1}: {sent}")
-                 st.write(embeddings[i][:10], "...")  # show first 10 dims only
-
-             # Prepare CSV for download
-             csv_file = export_embeddings(sentences, embeddings)
-             st.download_button(
-                 label="📥 Download CSV with Embeddings",
-                 data=csv_file,
-                 file_name=f"{model_choice.replace(' ','_')}_embeddings.csv"
              )

-     # Clear button
-     if st.button("Clear"):
-         st.experimental_rerun()
+++ app.py (after change)
  # ---------------- Imports ----------------
  import torch
  import numpy as np
  import tensorflow as tf
  import tensorflow_hub as hub
  from transformers import BertTokenizer, BertModel
+ import gradio as gr
+ from sklearn.metrics.pairwise import cosine_similarity
  import pandas as pd
+ import json
  import io

+ # ---------------- Load models once ----------------
  model_options = {
      "BERT Large Uncased": "bert-large-uncased",
      "BERT Large Cased": "bert-large-cased",
      "BERT Base Uncased": "bert-base-uncased",
+     "BERT Base Cased": "bert-base-cased"
  }

+ # Default model
+ current_model_name = "bert-large-uncased"
+
+ # Load ELMo (TF Hub)
  elmo = hub.KerasLayer("https://tfhub.dev/google/elmo/3", trainable=False)

+ # Load BERT (HuggingFace Transformers) - will be reloaded when model changes
+ tokenizer = BertTokenizer.from_pretrained(current_model_name)
+ bert_model = BertModel.from_pretrained(current_model_name)
+ bert_model.eval()  # disable training mode
+
+ # Global variables to store embeddings as matrices
+ bert_embeddings_matrix = None
+ elmo_embeddings_matrix = None
+ sentences_storage = []
  current_bert_model = None
  current_tokenizer = None

  def load_bert_model(model_name):
+     """Load BERT model and tokenizer"""
      global current_bert_model, current_tokenizer
+     try:
+         current_tokenizer = BertTokenizer.from_pretrained(model_name)
+         current_bert_model = BertModel.from_pretrained(model_name)
+         current_bert_model.eval()
+         return f"✅ Loaded {model_name}"
+     except Exception as e:
+         return f"❌ Error loading {model_name}: {str(e)}"
+
+ # Initialize with default model
+ load_bert_model(current_model_name)
+
+ # ---------------- Single sentence embedding function ----------------
+ def get_single_embedding(sentence):
+     """Get BERT and ELMo embeddings for a single sentence"""
+     global current_bert_model, current_tokenizer
+
+     # ------------ BERT ------------ #
+     input_bert = current_tokenizer([sentence], return_tensors="pt", padding=True, truncation=True)
+     with torch.no_grad():
+         bert_output = current_bert_model(**input_bert)  # [1, seq_len, hidden_size]
+     token_embeddings = bert_output.last_hidden_state  # tensor: (1, seq_len, 1024 for large)
+
+     attention_mask = input_bert['attention_mask'].unsqueeze(-1)  # (1, seq_len, 1)
+     masked_embeddings = token_embeddings * attention_mask
+     bert_embedding = masked_embeddings.sum(1) / attention_mask.sum(1)  # mean pooling → (1, hidden_size)
+     bert_embedding = bert_embedding.squeeze(0).numpy()  # Remove batch dimension and convert to numpy
+
+     # ------------ ELMo ------------ #
+     input_elmo = tf.convert_to_tensor([sentence], dtype=tf.string)
+     elmo_emb = elmo(input_elmo)  # Default output is sentence-level embedding
+
+     # ELMo typically returns a dictionary with different outputs, get the default embedding
+     if isinstance(elmo_emb, dict):
+         elmo_embedding = elmo_emb['default']  # or try 'elmo' key
      else:
+         elmo_embedding = elmo_emb
+
+     elmo_embedding = elmo_embedding.numpy().squeeze()  # Convert to numpy and remove extra dimensions
+
+     return bert_embedding, elmo_embedding
+
+ def change_bert_model(model_choice):
+     """Change BERT model and clear existing embeddings"""
+     global bert_embeddings_matrix, elmo_embeddings_matrix, sentences_storage
+
+     model_name = model_options[model_choice]
+     status = load_bert_model(model_name)
+
+     # Clear existing embeddings since we changed the model
+     bert_embeddings_matrix = None
+     elmo_embeddings_matrix = None
+     sentences_storage = []
+
+     clear_status = "🔄 Model changed! Previous embeddings cleared. Please add sentences again."
+     return status, clear_status, "📝 No sentences added yet. Please add at least 2 sentences."
+
+ # ---------------- Add sentence function ----------------
+ def add_sentence(sentence):
+     """Add a sentence and compute its embeddings"""
+     global bert_embeddings_matrix, elmo_embeddings_matrix, sentences_storage
+
+     if not sentence.strip():
+         return "Please enter a valid sentence.", get_current_status()
+
+     sentence = sentence.strip()
+
+     try:
+         # Get embeddings for this sentence
+         bert_emb, elmo_emb = get_single_embedding(sentence)
+
+         # Add to matrices row by row
+         if bert_embeddings_matrix is None:
+             # First sentence - initialize matrices
+             bert_embeddings_matrix = bert_emb.reshape(1, -1)  # Make it 2D [1, features]
+             elmo_embeddings_matrix = elmo_emb.reshape(1, -1)  # Make it 2D [1, features]
+         else:
+             # Add as new row using vstack
+             bert_embeddings_matrix = np.vstack([bert_embeddings_matrix, bert_emb.reshape(1, -1)])
+             elmo_embeddings_matrix = np.vstack([elmo_embeddings_matrix, elmo_emb.reshape(1, -1)])
+
+         # Store sentence
+         sentences_storage.append(sentence)
+
+         return f"✓ Added sentence {len(sentences_storage)}: '{sentence}'", get_current_status()
+
+     except Exception as e:
+         return f"❌ Error processing sentence: {str(e)}", get_current_status()
+
+ # ---------------- Get current status ----------------
+ def get_current_status():
+     """Return current status of stored sentences"""
+     if len(sentences_storage) == 0:
+         return "📝 No sentences added yet. Please add at least 2 sentences."
+     elif len(sentences_storage) == 1:
+         return f"📝 Current sentences ({len(sentences_storage)}/2 minimum):\n1: {sentences_storage[0]}\n\n➕ Add at least 1 more sentence to compute similarity."
      else:
+         status = f"📝 Current sentences ({len(sentences_storage)}):\n"
+         for i, sent in enumerate(sentences_storage):
+             status += f"{i+1}: {sent}\n"
+         status += f"\n✅ Ready to compute similarity!"
+         return status

+ # ---------------- Compute similarity ----------------
+ def compute_similarity():
+     """Compute similarity matrices for stored embeddings"""
+     global bert_embeddings_matrix, elmo_embeddings_matrix, sentences_storage
+
+     if len(sentences_storage) < 2:
+         return "⚠️ Please add at least 2 sentences before computing similarity."
+
+     try:
+         # Convert to torch tensors for torch.cosine_similarity
+         bert_tensor = torch.tensor(bert_embeddings_matrix, dtype=torch.float32)
+         elmo_tensor = torch.tensor(elmo_embeddings_matrix, dtype=torch.float32)
+
+         # Compute pairwise cosine similarity using torch
+         def torch_pairwise_cosine_similarity(X):
+             # Normalize vectors
+             X_norm = torch.nn.functional.normalize(X, p=2, dim=1)
+             # Compute similarity matrix
+             return torch.mm(X_norm, X_norm.t())
+
+         bert_sim_torch = torch_pairwise_cosine_similarity(bert_tensor)
+         elmo_sim_torch = torch_pairwise_cosine_similarity(elmo_tensor)
+
+         # Convert back to numpy for display
+         bert_sim = bert_sim_torch.numpy()
+         elmo_sim = elmo_sim_torch.numpy()
+
+         # Alternative: Use sklearn for comparison
+         bert_sim_sklearn = cosine_similarity(bert_embeddings_matrix)
+         elmo_sim_sklearn = cosine_similarity(elmo_embeddings_matrix)
+
+         # Format output
+         result = f"🔍 Similarity Analysis for {len(sentences_storage)} sentences:\n\n"
+
+         result += "🤖 BERT Similarity Matrix (PyTorch):\n"
+         result += f"{np.round(bert_sim, 3)}\n\n"
+
+         result += "🧠 ELMo Similarity Matrix (PyTorch):\n"
+         result += f"{np.round(elmo_sim, 3)}\n\n"
+
+         # Show comparison with sklearn (optional)
+         result += "📊 Comparison Check:\n"
+         result += f"BERT torch vs sklearn max diff: {np.max(np.abs(bert_sim - bert_sim_sklearn)):.6f}\n"
+         result += f"ELMo torch vs sklearn max diff: {np.max(np.abs(elmo_sim - elmo_sim_sklearn)):.6f}\n\n"
+
+         result += "📄 Sentences Reference:\n"
+         for i, sentence in enumerate(sentences_storage):
+             result += f"{i+1}: {sentence}\n"
+
+         # Add matrix shapes info
+         result += f"\n📊 Matrix Details:\n"
+         result += f"BERT embeddings shape: {bert_embeddings_matrix.shape}\n"
+         result += f"ELMo embeddings shape: {elmo_embeddings_matrix.shape}\n"
+         result += f"Similarity matrices shape: {bert_sim.shape}"
+
+         return result
+
+     except Exception as e:
+         return f"❌ Error computing similarity: {str(e)}"


+ def clear_all():
+     """Clear all stored sentences and embeddings"""
+     global bert_embeddings_matrix, elmo_embeddings_matrix, sentences_storage
+     bert_embeddings_matrix = None
+     elmo_embeddings_matrix = None
+     sentences_storage = []
+     return "🗑️ All sentences cleared.", "📝 No sentences added yet. Please add at least 2 sentences."

+ # ---------------- Gradio Interface ----------------
+ with gr.Blocks(title="BERT + ELMo Sentence Similarity", theme=gr.themes.Soft()) as iface:
+     gr.Markdown("# 🤖 BERT + ELMo Sentence Similarity Analyzer")
+     gr.Markdown("Add sentences one by one (minimum 2) and compute pairwise similarity using BERT and ELMo embeddings.")
+
+     # Model selection section
+     with gr.Row():
+         with gr.Column(scale=1):
+             model_dropdown = gr.Dropdown(
+                 choices=list(model_options.keys()),
+                 value="BERT Large Uncased",
+                 label="🔧 Select BERT Model",
+                 info="Choose between cased/uncased and base/large variants"
+             )
+             model_status = gr.Textbox(
+                 label="📋 Model Status",
+                 value="✅ Loaded bert-large-uncased",
+                 lines=1,
+                 interactive=False
+             )
+
+     with gr.Row():
+         with gr.Column(scale=2):
+             sentence_input = gr.Textbox(
+                 label="Enter a sentence",
+                 placeholder="Type your sentence here... (e.g., 'I love machine learning')",
+                 lines=2
+             )
+             with gr.Row():
+                 add_btn = gr.Button("➕ Add Sentence", variant="primary", size="lg")
+                 compute_btn = gr.Button("🔍 Compute Similarity", variant="secondary", size="lg")
+                 clear_btn = gr.Button("🗑️ Clear All", variant="stop", size="lg")
+
+         with gr.Column(scale=1):
+             status_output = gr.Textbox(
+                 label="📋 Current Status",
+                 value="📝 No sentences added yet. Please add at least 2 sentences.",
+                 lines=8,
+                 interactive=False
+             )
+
+     with gr.Row():
+         result_output = gr.Textbox(
+             label="📊 Similarity Results",
+             lines=20,
+             interactive=False,
+             show_copy_button=True
+         )
+
+
              )
+
+     gr.Markdown("""
+     ### 📖 How to use:
+     1. **Choose Model**: Select your preferred BERT variant (uncased recommended for similarity)
+     2. **Add sentences**: Type a sentence and click "Add Sentence"
+     3. **Repeat**: Add at least 2 sentences (you can add more!)
+     4. **Compute**: Click "Compute Similarity" to see the results
+     5. **Export**: Download embeddings and similarity matrices for further analysis
+     6. **Interpret**: Values closer to 1.0 indicate higher similarity
+
+     ### 🔬 Models:
+     - **BERT Large Uncased**: Best for semantic similarity (recommended) - 1024 dimensions
+     - **BERT Large Cased**: Preserves capitalization, good for proper nouns - 1024 dimensions
+     - **BERT Base Uncased**: Faster, smaller model - 768 dimensions
+     - **BERT Base Cased**: Cased version of base model - 768 dimensions
+     - **ELMo**: Contextual word representations using LSTM - 1024 dimensions
+     """)
+
+     # Event handlers
+     model_dropdown.change(
+         fn=change_bert_model,
+         inputs=[model_dropdown],
+         outputs=[model_status, result_output, status_output]
+     )
+
+     add_btn.click(
+         fn=add_sentence,
+         inputs=[sentence_input],
+         outputs=[result_output, status_output]
+     ).then(
+         lambda: "",  # Clear input after adding
+         outputs=[sentence_input]
+     )
+
+     compute_btn.click(
+         fn=compute_similarity,
+         outputs=[result_output]
+     )
+
+     clear_btn.click(
+         fn=clear_all,
+         outputs=[result_output, status_output]
+     )
+

+ if __name__ == "__main__":
+     iface.launch(share=True)