Spaces:

jorgemarcc
/

graphcodebert-interpretability

Sleeping

App Files Files Community

jorgemarcc committed on Jul 22

Commit

3fffb69

verified ·

1 Parent(s): 32898cf

Update app.py

Browse files

Files changed (1) hide show

app.py +155 -211

app.py CHANGED Viewed

@@ -1,211 +1,155 @@
-# -*- coding: utf-8 -*-
-"""
-[Martinez-Gil2024] Augmenting the Interpretability of GraphCodeBERT for Code Similarity Tasks, arXiv preprint arXiv:2410.05275, 2024
-@author: Jorge Martinez-Gil
-"""
-import os
-from transformers import RobertaTokenizer, RobertaModel
-from sklearn.decomposition import PCA
-import matplotlib.pyplot as plt
-import numpy as np
-import itertools
-# Initialize GraphCodeBERT
-tokenizer = RobertaTokenizer.from_pretrained("microsoft/graphcodebert-base")
-model = RobertaModel.from_pretrained("microsoft/graphcodebert-base")
-# Define the classical sorting algorithms
-sorting_algorithms = {
-    "Bubble_Sort": """
-def bubble_sort(arr):
-    n = len(arr)
-    for i in range(n):
-        for j in range(0, n-i-1):
-            if arr[j] > arr[j+1]:
-                arr[j], arr[j+1] = arr[j+1], arr[j]
-    return arr
-    """,
-    "Selection_Sort": """
-def selection_sort(arr):
-    for i in range(len(arr)):
-        min_idx = i
-        for j in range(i+1, len(arr)):
-            if arr[j] < arr[min_idx]:
-                min_idx = j
-        arr[i], arr[min_idx] = arr[min_idx], arr[i]
-    return arr
-    """,
-    "Insertion_Sort": """
-def insertion_sort(arr):
-    for i in range(1, len(arr)):
-        key = arr[i]
-        j = i-1
-        while j >=0 and key < arr[j]:
-            arr[j + 1] = arr[j]
-            j -= 1
-        arr[j + 1] = key
-    return arr
-    """,
-    "Merge_Sort": """
-def merge_sort(arr):
-    if len(arr) > 1:
-        mid = len(arr)//2
-        L = arr[:mid]
-        R = arr[mid:]
-        merge_sort(L)
-        merge_sort(R)
-        i = j = k = 0
-        while i < len(L) and j < len(R):
-            if L[i] < R[j]:
-                arr[k] = L[i]
-                i += 1
-            else:
-                arr[k] = R[j]
-                j += 1
-            k += 1
-        while i < len(L):
-            arr[k] = L[i]
-            i += 1
-            k += 1
-        while j < len(R):
-            arr[k] = R[j]
-            j += 1
-            k += 1
-    return arr
-    """,
-    "Quick_Sort": """
-def partition(arr, low, high):
-    i = (low-1)
-    pivot = arr[high]
-    for j in range(low, high):
-        if arr[j] <= pivot:
-            i = i+1
-            arr[i], arr[j] = arr[j], arr[i]
-    arr[i+1], arr[high] = arr[high], arr[i+1]
-    return (i+1)
-def quick_sort(arr, low, high):
-    if low < high:
-        pi = partition(arr, low, high)
-        quick_sort(arr, low, pi-1)
-        quick_sort(arr, pi+1, high)
-    return arr
-    """
-}
-# Function to get token embeddings for a code snippet
-def get_token_embeddings(code):
-    inputs = tokenizer(code, return_tensors="pt", max_length=512, truncation=True, padding=True)
-    outputs = model(**inputs)
-    token_embeddings = outputs.last_hidden_state.squeeze().detach().numpy()
-    tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'].squeeze())
-    return token_embeddings, tokens
-# Directory to save images
-output_dir = "pca_pairwise_comparisons"
-os.makedirs(output_dir, exist_ok=True)
-# Generate all possible pairs of sorting algorithms
-algorithm_pairs = list(itertools.combinations(sorting_algorithms.keys(), 2))
-# Loop over each pair and generate the visualizations
-for (algo1_name, algo2_name) in algorithm_pairs:
-    algo1_code = sorting_algorithms[algo1_name]
-    algo2_code = sorting_algorithms[algo2_name]
-    # Get token embeddings for both algorithms
-    algo1_embeddings, algo1_tokens = get_token_embeddings(algo1_code)
-    algo2_embeddings, algo2_tokens = get_token_embeddings(algo2_code)
-    # Combine embeddings
-    all_embeddings = np.concatenate((algo1_embeddings, algo2_embeddings), axis=0)
-    # Reduce dimensionality to 2D using PCA
-    pca = PCA(n_components=2)
-    embeddings_2d = pca.fit_transform(all_embeddings)
-    # Plotting the token embeddings in 2D
-    plt.figure(figsize=(10, 8), dpi=300)
-    # Scatter plot for the first algorithm tokens
-    plt.scatter(embeddings_2d[:len(algo1_tokens), 0],
-                embeddings_2d[:len(algo1_tokens), 1],
-                color='red', s=50, label=algo1_name, alpha=0.8)
-    # Scatter plot for the second algorithm tokens
-    plt.scatter(embeddings_2d[len(algo1_tokens):, 0],
-                embeddings_2d[len(algo1_tokens):, 1],
-                color='blue', s=50, label=algo2_name, alpha=0.8)
-    # Make the visualization more professional
-    plt.xticks([])
-    plt.yticks([])
-    plt.xlabel('')
-    plt.ylabel('')
-    plt.grid(False)
-    plt.legend()
-    # Save the figure as a high-quality PNG file
-    output_file = os.path.join(output_dir, f"{algo1_name}_vs_{algo2_name}_tokens_2d_pca.png")
-    plt.savefig(output_file, format='png', dpi=300, bbox_inches='tight')
-    # Show the plot
-    plt.close()
-print("All pairwise comparison images have been generated.")
-import gradio as gr
-from io import BytesIO
-from PIL import Image
-def compare_algorithms(algo1_name, algo2_name):
-    algo1_code = sorting_algorithms[algo1_name]
-    algo2_code = sorting_algorithms[algo2_name]
-    # Get token embeddings
-    algo1_embeddings, algo1_tokens = get_token_embeddings(algo1_code)
-    algo2_embeddings, algo2_tokens = get_token_embeddings(algo2_code)
-    # Combine and reduce
-    all_embeddings = np.concatenate((algo1_embeddings, algo2_embeddings), axis=0)
-    pca = PCA(n_components=2)
-    embeddings_2d = pca.fit_transform(all_embeddings)
-    # Plot
-    plt.figure(figsize=(6, 5), dpi=150)
-    plt.scatter(embeddings_2d[:len(algo1_tokens), 0], embeddings_2d[:len(algo1_tokens), 1], color='red', s=20, label=algo1_name)
-    plt.scatter(embeddings_2d[len(algo1_tokens):, 0], embeddings_2d[len(algo1_tokens):, 1], color='blue', s=20, label=algo2_name)
-    plt.xticks([]); plt.yticks([]); plt.grid(False); plt.legend()
-    # Save to BytesIO
-    buf = BytesIO()
-    plt.savefig(buf, format='png', bbox_inches='tight')
-    plt.close()
-    buf.seek(0)
-    return Image.open(buf)
-interface = gr.Interface(
-    fn=compare_algorithms,
-    inputs=[
-        gr.Dropdown(choices=list(sorting_algorithms.keys()), label="Algorithm 1"),
-        gr.Dropdown(choices=list(sorting_algorithms.keys()), label="Algorithm 2")
-    ],
-    outputs=gr.Image(type="pil", label="Token PCA Plot"),
-    title="Code Similarity Visualization with GraphCodeBERT"
-)
-if __name__ == "__main__":
-    interface.launch()

+# -*- coding: utf-8 -*-
+"""
+[Martinez-Gil2024] Augmenting the Interpretability of GraphCodeBERT for Code Similarity Tasks, arXiv preprint arXiv:2410.05275, 2024
+@author: Jorge Martinez-Gil
+"""
+import numpy as np
+import matplotlib.pyplot as plt
+from sklearn.decomposition import PCA
+from transformers import RobertaTokenizer, RobertaModel
+import torch
+import gradio as gr
+from io import BytesIO
+from PIL import Image
+# Load the GraphCodeBERT tokenizer and encoder once, at module import, so that
+# every call to get_token_embeddings() reuses the same objects.
+# NOTE(review): from_pretrained presumably downloads and caches the weights on
+# first run -- confirm this is acceptable for the deployment's startup time.
+tokenizer = RobertaTokenizer.from_pretrained("microsoft/graphcodebert-base")
+model = RobertaModel.from_pretrained("microsoft/graphcodebert-base")
+# Define sorting algorithms as strings
+# Maps a display name to the Python source text of a sorting algorithm.
+# The keys are offered as the dropdown choices and reused as legend labels in
+# compare_algorithms(); the values are passed verbatim to the tokenizer, so
+# any edit to a snippet (including whitespace) changes the embeddings.
+sorting_algorithms = {
+    "Bubble_Sort": """
+def bubble_sort(arr):
+    n = len(arr)
+    for i in range(n):
+        for j in range(0, n-i-1):
+            if arr[j] > arr[j+1]:
+                arr[j], arr[j+1] = arr[j+1], arr[j]
+    return arr
+""",
+    "Selection_Sort": """
+def selection_sort(arr):
+    for i in range(len(arr)):
+        min_idx = i
+        for j in range(i+1, len(arr)):
+            if arr[j] < arr[min_idx]:
+                min_idx = j
+        arr[i], arr[min_idx] = arr[min_idx], arr[i]
+    return arr
+""",
+    "Insertion_Sort": """
+def insertion_sort(arr):
+    for i in range(1, len(arr)):
+        key = arr[i]
+        j = i-1
+        while j >= 0 and key < arr[j]:
+            arr[j + 1] = arr[j]
+            j -= 1
+        arr[j + 1] = key
+    return arr
+""",
+    "Merge_Sort": """
+def merge_sort(arr):
+    if len(arr) > 1:
+        mid = len(arr) // 2
+        L = arr[:mid]
+        R = arr[mid:]
+        merge_sort(L)
+        merge_sort(R)
+        i = j = k = 0
+        while i < len(L) and j < len(R):
+            if L[i] < R[j]:
+                arr[k] = L[i]
+                i += 1
+            else:
+                arr[k] = R[j]
+                j += 1
+            k += 1
+        while i < len(L):
+            arr[k] = L[i]
+            i += 1
+            k += 1
+        while j < len(R):
+            arr[k] = R[j]
+            j += 1
+            k += 1
+    return arr
+""",
+    # This entry holds two functions: the partition helper and the recursive
+    # quick_sort driver that calls it.
+    "Quick_Sort": """
+def partition(arr, low, high):
+    i = (low - 1)
+    pivot = arr[high]
+    for j in range(low, high):
+        if arr[j] <= pivot:
+            i += 1
+            arr[i], arr[j] = arr[j], arr[i]
+    arr[i+1], arr[high] = arr[high], arr[i+1]
+    return (i + 1)
+def quick_sort(arr, low, high):
+    if low < high:
+        pi = partition(arr, low, high)
+        quick_sort(arr, low, pi - 1)
+        quick_sort(arr, pi + 1, high)
+    return arr
+"""
+}
+# Get token embeddings for a code snippet
+def get_token_embeddings(code):
+    """Embed one code snippet with GraphCodeBERT, one vector per token.
+
+    Args:
+        code: Source text of the snippet to embed.
+
+    Returns:
+        A ``(token_embeddings, tokens)`` pair: the model's last hidden state
+        as a NumPy array with one row per token, and the list of token
+        strings in the same order. Input beyond 512 tokens is truncated.
+    """
+    # padding=True has no effect on a single sequence; it is kept so the
+    # tokenizer call stays identical to the previous revision.
+    inputs = tokenizer(code, return_tensors="pt", max_length=512, truncation=True, padding=True)
+    # Inference only: skip autograd bookkeeping.
+    with torch.no_grad():
+        outputs = model(**inputs)
+    # Index the batch axis explicitly for BOTH tensors. A bare .squeeze()
+    # would also drop a length-1 sequence axis, so embeddings and tokens
+    # could disagree in shape; [0] removes exactly the batch dimension.
+    token_embeddings = outputs.last_hidden_state[0].cpu().numpy()
+    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0].tolist())
+    return token_embeddings, tokens
+# Compare two algorithms and return PCA scatter plot
+def compare_algorithms(algo1_name, algo2_name):
+    """Plot the token embeddings of two algorithms in a shared 2-D PCA space.
+
+    Args:
+        algo1_name: Key into ``sorting_algorithms``; its tokens are drawn in red.
+        algo2_name: Key into ``sorting_algorithms``; its tokens are drawn in blue.
+
+    Returns:
+        A PIL image containing the rendered scatter plot.
+
+    Raises:
+        gr.Error: If either name is not a known algorithm, e.g. because a
+            dropdown was left empty and ``None`` was passed in.
+    """
+    # Function-scope import keeps this block self-contained; matplotlib is
+    # already a dependency of this file.
+    from matplotlib.figure import Figure
+    # Fail with a readable UI message instead of a bare KeyError.
+    for name in (algo1_name, algo2_name):
+        if name not in sorting_algorithms:
+            raise gr.Error("Please select two algorithms to compare.")
+    emb1, tokens1 = get_token_embeddings(sorting_algorithms[algo1_name])
+    emb2, _ = get_token_embeddings(sorting_algorithms[algo2_name])
+    # Fit PCA on the union of both token sets so they share the same axes.
+    combined = np.concatenate([emb1, emb2], axis=0)
+    coords = PCA(n_components=2).fit_transform(combined)
+    # Rows before `split` belong to the first algorithm, the rest to the second.
+    split = len(tokens1)
+    # Build the figure without pyplot: pyplot keeps process-global figure
+    # state, which is not thread-safe and leaks the figure if an exception
+    # is raised before plt.close(). A standalone Figure is simply garbage
+    # collected. (Gradio may invoke this handler from worker threads.)
+    fig = Figure(figsize=(6, 5), dpi=150)
+    ax = fig.subplots()
+    ax.scatter(coords[:split, 0], coords[:split, 1], color='red', label=algo1_name, s=20)
+    ax.scatter(coords[split:, 0], coords[split:, 1], color='blue', label=algo2_name, s=20)
+    ax.legend()
+    # PCA axes carry no interpretable units, so hide ticks and grid.
+    ax.set_xticks([])
+    ax.set_yticks([])
+    ax.grid(False)
+    buf = BytesIO()
+    fig.savefig(buf, format='png', bbox_inches='tight')
+    buf.seek(0)
+    return Image.open(buf)
+# Gradio interface
+interface = gr.Interface(
+    fn=compare_algorithms,
+    # One dropdown per comparison slot; both offer every algorithm name.
+    inputs=[
+        gr.Dropdown(choices=list(sorting_algorithms), label=f"Algorithm {slot}")
+        for slot in (1, 2)
+    ],
+    outputs=gr.Image(label="Token Embedding PCA", type="pil"),
+    description="Visual comparison of token-level embeddings from GraphCodeBERT for classical sorting algorithms.",
+    title="GraphCodeBERT Token Embedding Comparison",
+)
+# Start the web app only when this file is executed as a script, not on import.
+if __name__ == "__main__":
+    interface.launch()