Daksh0505 committed on
Commit 5765f2e · verified · 1 Parent(s): deca23d

Update app.py

Files changed (1):
  1. app.py +280 -86
app.py CHANGED
@@ -1,116 +1,310 @@
--- app.py (before change)
  # ---------------- Imports ----------------
- import streamlit as st
  import torch
  import numpy as np
  import tensorflow as tf
  import tensorflow_hub as hub
  from transformers import BertTokenizer, BertModel
  import pandas as pd
  import io

- # ---------------- Models ----------------
  model_options = {
      "BERT Large Uncased": "bert-large-uncased",
      "BERT Large Cased": "bert-large-cased",
      "BERT Base Uncased": "bert-base-uncased",
-     "BERT Base Cased": "bert-base-cased",
-     "ELMo": "elmo"
  }

- # Load ELMo once
  elmo = hub.KerasLayer("https://tfhub.dev/google/elmo/3", trainable=False)

- # Globals
  current_bert_model = None
  current_tokenizer = None

- # ---------------- Functions ----------------
  def load_bert_model(model_name):
-     """Load HuggingFace BERT model."""
      global current_bert_model, current_tokenizer
-     current_tokenizer = BertTokenizer.from_pretrained(model_name)
-     current_bert_model = BertModel.from_pretrained(model_name)
-     current_bert_model.eval()
-     return f"✅ Loaded {model_name}"
-
- def get_embeddings(sentences, model_choice):
-     embeddings = []
-     if model_choice == "ELMo":
-         # ELMo embeddings
-         inputs = tf.convert_to_tensor(sentences, dtype=tf.string)
-         elmo_out = elmo(inputs)
-         elmo_emb = elmo_out['default'] if isinstance(elmo_out, dict) else elmo_out
-         embeddings = elmo_emb.numpy()
      else:
-         # BERT embeddings
-         if current_bert_model is None or current_tokenizer is None:
-             load_bert_model(model_options[model_choice])
-         inputs = current_tokenizer(sentences, return_tensors="pt", padding=True, truncation=True)
-         with torch.no_grad():
-             bert_out = current_bert_model(**inputs)
-         token_embeddings = bert_out.last_hidden_state
-         attention_mask = inputs['attention_mask'].unsqueeze(-1)
-         masked_embeddings = token_embeddings * attention_mask
-         bert_emb = (masked_embeddings.sum(1) / attention_mask.sum(1)).numpy()
-         embeddings = bert_emb
-     return embeddings
-
- def export_embeddings(sentences, embeddings):
-     """Return CSV file with sentences and embeddings"""
-     df = pd.DataFrame(embeddings)
-     df.insert(0, 'sentence', sentences)
-     csv_bytes = df.to_csv(index=False).encode('utf-8')
-     return io.BytesIO(csv_bytes)
-
- # ---------------- Streamlit App ----------------
- st.title("🤖 Sentence Embeddings Generator (BERT + ELMo)")
-
- # Upload CSV or add sentences
- uploaded_file = st.file_uploader("Upload CSV with 'sentence' column (optional)", type=["csv"])
- manual_input = st.text_area("Or enter sentences manually (one per line)")
-
- sentences = []
- if uploaded_file:
-     df = pd.read_csv(uploaded_file)
-     if 'sentence' not in df.columns:
-         st.error("CSV must have a column named 'sentence'")
      else:
-         sentences = df['sentence'].dropna().tolist()

- if manual_input:
-     manual_sentences = [s.strip() for s in manual_input.splitlines() if s.strip()]
-     sentences.extend(manual_sentences)

- if sentences:
-     st.success(f"✅ {len(sentences)} sentences ready for embedding generation")

-     # Model selection
-     model_choice = st.selectbox("Select model", list(model_options.keys()))

-     # Generate embeddings button
-     if st.button("Generate Embeddings"):
-         if not sentences:
-             st.warning("Please add sentences or upload CSV first.")
-         else:
-             with st.spinner("Generating embeddings..."):
-                 embeddings = get_embeddings(sentences, model_choice)
-             st.success(f"✅ Generated embeddings for {len(sentences)} sentences")
-
-             # Show shape of first few embeddings
-             st.write(f"Embedding shape: {embeddings.shape}")
-             st.write("First 2 sentences and their embeddings (truncated):")
-             for i, sent in enumerate(sentences[:2]):
-                 st.write(f"{i+1}: {sent}")
-                 st.write(embeddings[i][:10], "...")  # show first 10 dims only
-
-             # Prepare CSV for download
-             csv_file = export_embeddings(sentences, embeddings)
-             st.download_button(
-                 label="📥 Download CSV with Embeddings",
-                 data=csv_file,
-                 file_name=f"{model_choice.replace(' ','_')}_embeddings.csv"
              )

-     # Clear button
-     if st.button("Clear"):
-         st.experimental_rerun()
+++ app.py (after change)
  # ---------------- Imports ----------------
  import torch
  import numpy as np
  import tensorflow as tf
  import tensorflow_hub as hub
  from transformers import BertTokenizer, BertModel
+ import gradio as gr
+ from sklearn.metrics.pairwise import cosine_similarity
  import pandas as pd
+ import json
  import io

+ # ---------------- Load models once ----------------
  model_options = {
      "BERT Large Uncased": "bert-large-uncased",
      "BERT Large Cased": "bert-large-cased",
      "BERT Base Uncased": "bert-base-uncased",
+     "BERT Base Cased": "bert-base-cased"
  }

+ # Default model
+ current_model_name = "bert-large-uncased"
+
+ # Load ELMo (TF Hub)
  elmo = hub.KerasLayer("https://tfhub.dev/google/elmo/3", trainable=False)

+ # Load BERT (HuggingFace Transformers) - will be reloaded when model changes
+ tokenizer = BertTokenizer.from_pretrained(current_model_name)
+ bert_model = BertModel.from_pretrained(current_model_name)
+ bert_model.eval()  # disable training mode
+
+ # Global variables to store embeddings as matrices
+ bert_embeddings_matrix = None
+ elmo_embeddings_matrix = None
+ sentences_storage = []
  current_bert_model = None
  current_tokenizer = None

  def load_bert_model(model_name):
+     """Load BERT model and tokenizer"""
      global current_bert_model, current_tokenizer
+     try:
+         current_tokenizer = BertTokenizer.from_pretrained(model_name)
+         current_bert_model = BertModel.from_pretrained(model_name)
+         current_bert_model.eval()
+         return f"✅ Loaded {model_name}"
+     except Exception as e:
+         return f"❌ Error loading {model_name}: {str(e)}"
+
+ # Initialize with default model
+ load_bert_model(current_model_name)
+
+ # ---------------- Single sentence embedding function ----------------
+ def get_single_embedding(sentence):
+     """Get BERT and ELMo embeddings for a single sentence"""
+     global current_bert_model, current_tokenizer
+
+     # ------------ BERT ------------ #
+     input_bert = current_tokenizer([sentence], return_tensors="pt", padding=True, truncation=True)
+     with torch.no_grad():
+         bert_output = current_bert_model(**input_bert)  # [1, seq_len, hidden_size]
+     token_embeddings = bert_output.last_hidden_state  # tensor: (1, seq_len, 1024 for large)
+
+     attention_mask = input_bert['attention_mask'].unsqueeze(-1)  # (1, seq_len, 1)
+     masked_embeddings = token_embeddings * attention_mask
+     bert_embedding = masked_embeddings.sum(1) / attention_mask.sum(1)  # mean pooling → (1, hidden_size)
+     bert_embedding = bert_embedding.squeeze(0).numpy()  # Remove batch dimension and convert to numpy
+
+     # ------------ ELMo ------------ #
+     input_elmo = tf.convert_to_tensor([sentence], dtype=tf.string)
+     elmo_emb = elmo(input_elmo)  # Default output is sentence-level embedding
+
+     # ELMo typically returns a dictionary with different outputs, get the default embedding
+     if isinstance(elmo_emb, dict):
+         elmo_embedding = elmo_emb['default']  # or try 'elmo' key
      else:
+         elmo_embedding = elmo_emb
+
+     elmo_embedding = elmo_embedding.numpy().squeeze()  # Convert to numpy and remove extra dimensions
+
+     return bert_embedding, elmo_embedding
+
+ def change_bert_model(model_choice):
+     """Change BERT model and clear existing embeddings"""
+     global bert_embeddings_matrix, elmo_embeddings_matrix, sentences_storage
+
+     model_name = model_options[model_choice]
+     status = load_bert_model(model_name)
+
+     # Clear existing embeddings since we changed the model
+     bert_embeddings_matrix = None
+     elmo_embeddings_matrix = None
+     sentences_storage = []
+
+     clear_status = "🔄 Model changed! Previous embeddings cleared. Please add sentences again."
+     return status, clear_status, "📝 No sentences added yet. Please add at least 2 sentences."
+
+ # ---------------- Add sentence function ----------------
+ def add_sentence(sentence):
+     """Add a sentence and compute its embeddings"""
+     global bert_embeddings_matrix, elmo_embeddings_matrix, sentences_storage
+
+     if not sentence.strip():
+         return "Please enter a valid sentence.", get_current_status()
+
+     sentence = sentence.strip()
+
+     try:
+         # Get embeddings for this sentence
+         bert_emb, elmo_emb = get_single_embedding(sentence)
+
+         # Add to matrices row by row
+         if bert_embeddings_matrix is None:
+             # First sentence - initialize matrices
+             bert_embeddings_matrix = bert_emb.reshape(1, -1)  # Make it 2D [1, features]
+             elmo_embeddings_matrix = elmo_emb.reshape(1, -1)  # Make it 2D [1, features]
+         else:
+             # Add as new row using vstack
+             bert_embeddings_matrix = np.vstack([bert_embeddings_matrix, bert_emb.reshape(1, -1)])
+             elmo_embeddings_matrix = np.vstack([elmo_embeddings_matrix, elmo_emb.reshape(1, -1)])
+
+         # Store sentence
+         sentences_storage.append(sentence)
+
+         return f"✓ Added sentence {len(sentences_storage)}: '{sentence}'", get_current_status()
+
+     except Exception as e:
+         return f"❌ Error processing sentence: {str(e)}", get_current_status()
+
+ # ---------------- Get current status ----------------
+ def get_current_status():
+     """Return current status of stored sentences"""
+     if len(sentences_storage) == 0:
+         return "📝 No sentences added yet. Please add at least 2 sentences."
+     elif len(sentences_storage) == 1:
+         return f"📝 Current sentences ({len(sentences_storage)}/2 minimum):\n1: {sentences_storage[0]}\n\n➕ Add at least 1 more sentence to compute similarity."
      else:
+         status = f"📝 Current sentences ({len(sentences_storage)}):\n"
+         for i, sent in enumerate(sentences_storage):
+             status += f"{i+1}: {sent}\n"
+         status += f"\n✅ Ready to compute similarity!"
+         return status

+ # ---------------- Compute similarity ----------------
+ def compute_similarity():
+     """Compute similarity matrices for stored embeddings"""
+     global bert_embeddings_matrix, elmo_embeddings_matrix, sentences_storage
+
+     if len(sentences_storage) < 2:
+         return "⚠️ Please add at least 2 sentences before computing similarity."
+
+     try:
+         # Convert to torch tensors for torch.cosine_similarity
+         bert_tensor = torch.tensor(bert_embeddings_matrix, dtype=torch.float32)
+         elmo_tensor = torch.tensor(elmo_embeddings_matrix, dtype=torch.float32)
+
+         # Compute pairwise cosine similarity using torch
+         def torch_pairwise_cosine_similarity(X):
+             # Normalize vectors
+             X_norm = torch.nn.functional.normalize(X, p=2, dim=1)
+             # Compute similarity matrix
+             return torch.mm(X_norm, X_norm.t())
+
+         bert_sim_torch = torch_pairwise_cosine_similarity(bert_tensor)
+         elmo_sim_torch = torch_pairwise_cosine_similarity(elmo_tensor)
+
+         # Convert back to numpy for display
+         bert_sim = bert_sim_torch.numpy()
+         elmo_sim = elmo_sim_torch.numpy()
+
+         # Alternative: Use sklearn for comparison
+         bert_sim_sklearn = cosine_similarity(bert_embeddings_matrix)
+         elmo_sim_sklearn = cosine_similarity(elmo_embeddings_matrix)
+
+         # Format output
+         result = f"🔍 Similarity Analysis for {len(sentences_storage)} sentences:\n\n"
+
+         result += "🤖 BERT Similarity Matrix (PyTorch):\n"
+         result += f"{np.round(bert_sim, 3)}\n\n"
+
+         result += "🧠 ELMo Similarity Matrix (PyTorch):\n"
+         result += f"{np.round(elmo_sim, 3)}\n\n"
+
+         # Show comparison with sklearn (optional)
+         result += "📊 Comparison Check:\n"
+         result += f"BERT torch vs sklearn max diff: {np.max(np.abs(bert_sim - bert_sim_sklearn)):.6f}\n"
+         result += f"ELMo torch vs sklearn max diff: {np.max(np.abs(elmo_sim - elmo_sim_sklearn)):.6f}\n\n"
+
+         result += "📄 Sentences Reference:\n"
+         for i, sentence in enumerate(sentences_storage):
+             result += f"{i+1}: {sentence}\n"
+
+         # Add matrix shapes info
+         result += f"\n📊 Matrix Details:\n"
+         result += f"BERT embeddings shape: {bert_embeddings_matrix.shape}\n"
+         result += f"ELMo embeddings shape: {elmo_embeddings_matrix.shape}\n"
+         result += f"Similarity matrices shape: {bert_sim.shape}"
+
+         return result
+
+     except Exception as e:
+         return f"❌ Error computing similarity: {str(e)}"


+ def clear_all():
+     """Clear all stored sentences and embeddings"""
+     global bert_embeddings_matrix, elmo_embeddings_matrix, sentences_storage
+     bert_embeddings_matrix = None
+     elmo_embeddings_matrix = None
+     sentences_storage = []
+     return "🗑️ All sentences cleared.", "📝 No sentences added yet. Please add at least 2 sentences."

+ # ---------------- Gradio Interface ----------------
+ with gr.Blocks(title="BERT + ELMo Sentence Similarity", theme=gr.themes.Soft()) as iface:
+     gr.Markdown("# 🤖 BERT + ELMo Sentence Similarity Analyzer")
+     gr.Markdown("Add sentences one by one (minimum 2) and compute pairwise similarity using BERT and ELMo embeddings.")
+
+     # Model selection section
+     with gr.Row():
+         with gr.Column(scale=1):
+             model_dropdown = gr.Dropdown(
+                 choices=list(model_options.keys()),
+                 value="BERT Large Uncased",
+                 label="🔧 Select BERT Model",
+                 info="Choose between cased/uncased and base/large variants"
+             )
+             model_status = gr.Textbox(
+                 label="📋 Model Status",
+                 value="✅ Loaded bert-large-uncased",
+                 lines=1,
+                 interactive=False
+             )
+
+     with gr.Row():
+         with gr.Column(scale=2):
+             sentence_input = gr.Textbox(
+                 label="Enter a sentence",
+                 placeholder="Type your sentence here... (e.g., 'I love machine learning')",
+                 lines=2
+             )
+             with gr.Row():
+                 add_btn = gr.Button("➕ Add Sentence", variant="primary", size="lg")
+                 compute_btn = gr.Button("🔍 Compute Similarity", variant="secondary", size="lg")
+                 clear_btn = gr.Button("🗑️ Clear All", variant="stop", size="lg")
+
+         with gr.Column(scale=1):
+             status_output = gr.Textbox(
+                 label="📋 Current Status",
+                 value="📝 No sentences added yet. Please add at least 2 sentences.",
+                 lines=8,
+                 interactive=False
+             )
+
+     with gr.Row():
+         result_output = gr.Textbox(
+             label="📊 Similarity Results",
+             lines=20,
+             interactive=False,
+             show_copy_button=True
+         )
+
+
              )
+
+     gr.Markdown("""
+     ### 📖 How to use:
+     1. **Choose Model**: Select your preferred BERT variant (uncased recommended for similarity)
+     2. **Add sentences**: Type a sentence and click "Add Sentence"
+     3. **Repeat**: Add at least 2 sentences (you can add more!)
+     4. **Compute**: Click "Compute Similarity" to see the results
+     5. **Export**: Download embeddings and similarity matrices for further analysis
+     6. **Interpret**: Values closer to 1.0 indicate higher similarity
+
+     ### 🔬 Models:
+     - **BERT Large Uncased**: Best for semantic similarity (recommended) - 1024 dimensions
+     - **BERT Large Cased**: Preserves capitalization, good for proper nouns - 1024 dimensions
+     - **BERT Base Uncased**: Faster, smaller model - 768 dimensions
+     - **BERT Base Cased**: Cased version of base model - 768 dimensions
+     - **ELMo**: Contextual word representations using LSTM - 1024 dimensions
+     """)
+
+     # Event handlers
+     model_dropdown.change(
+         fn=change_bert_model,
+         inputs=[model_dropdown],
+         outputs=[model_status, result_output, status_output]
+     )
+
+     add_btn.click(
+         fn=add_sentence,
+         inputs=[sentence_input],
+         outputs=[result_output, status_output]
+     ).then(
+         lambda: "",  # Clear input after adding
+         outputs=[sentence_input]
+     )
+
+     compute_btn.click(
+         fn=compute_similarity,
+         outputs=[result_output]
+     )
+
+     clear_btn.click(
+         fn=clear_all,
+         outputs=[result_output, status_output]
+     )
+

+ if __name__ == "__main__":
+     iface.launch(share=True)