import gzip
import json
from collections import Counter
from typing import List, Union

import numpy as np
import pandas as pd
import jax.numpy as jnp
import torch
import tqdm
from sentence_transformers import util
from sklearn.manifold import TSNE

from backend.utils import load_model, filter_questions, load_embeddings
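
# Inference helpers for the semantic-search demo: pairwise similarity scoring,
# top-k retrieval over precomputed Stack Overflow question embeddings, and a
# 3-D t-SNE view of the retrieved posts.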


def cos_sim(a, b):
    # Row-wise cosine similarity: normalise each embedding first, so that a
    # multi-row `b` is not divided by a single matrix-wide norm.
    a = a / jnp.linalg.norm(a, axis=-1, keepdims=True)
    b = b / jnp.linalg.norm(b, axis=-1, keepdims=True)
    return jnp.matmul(a, jnp.transpose(b))
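
# Shape sketch (the embedding width depends on the model; 384 is only an example):
#   cos_sim(jnp.ones((1, 384)), jnp.ones((5, 384)))  # -> (1, 5), all values 1.0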


# Compute the similarity between an anchor sentence and a list of input sentences.
def text_similarity(anchor: str, inputs: List[str], model_name: str, model_dict: dict):
    print(model_name)
    model = load_model(model_name, model_dict)

    # Creating embeddings
    if hasattr(model, 'encode'):
        anchor_emb = model.encode(anchor)[None, :]
        inputs_emb = model.encode(inputs)
    else:
        # A two-part model: the first encoder handles the anchor, the second the inputs.
        assert len(model) == 2
        anchor_emb = model[0].encode(anchor)[None, :]
        inputs_emb = model[1].encode(inputs)

    # Obtaining similarity
    similarity = list(jnp.squeeze(cos_sim(anchor_emb, inputs_emb), axis=0))

    # Returning a pandas DataFrame
    d = {'inputs': inputs,
         'score': [round(float(s), 3) for s in similarity]}
    df = pd.DataFrame(d, columns=['inputs', 'score'])
    return df
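
# Illustrative call (model names and the contents of model_dict come from backend.utils):
#   text_similarity("sort a list in python",
#                   ["How do I sort a list?", "Reverse a string in Python"],
#                   "distilbert_qa", model_dict)
#   # -> DataFrame with columns 'inputs' and 'score'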


# Search
def text_search(anchor: str, n_answers: int, model_name: str, model_dict: dict):
    # Proceeding with the model
    print(model_name)
    assert model_name == "distilbert_qa"
    model = load_model(model_name, model_dict)

    # Creating embeddings
    query_emb = model.encode(anchor, convert_to_tensor=True)[None, :]
    print("loading embeddings")
    corpus_emb = load_embeddings()

    # Getting hits
    hits = util.semantic_search(query_emb, corpus_emb, score_function=util.dot_score, top_k=n_answers)[0]
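    # Each hit is a dict with 'corpus_id' (index into the corpus) and 'score',
    # ordered by decreasing score.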
    filtered_posts = filter_questions("python")
    print(f"{len(filtered_posts)} posts found with tag: python")

    hits_titles = []
    hits_scores = []
    urls = []
    for hit in hits:
        post = filtered_posts[hit['corpus_id']]
        hits_titles.append(post['title'])
        hits_scores.append("{:.3f}".format(hit['score']))
        urls.append(f"https://stackoverflow.com/q/{post['id']}")
    return hits_titles, hits_scores, urls
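
# Illustrative call ("distilbert_qa" is the only model accepted by the assert above):
#   titles, scores, urls = text_search("merge two dictionaries", 5, "distilbert_qa", model_dict)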


def text_cluster(anchor: str, n_answers: int, model_name: str, model_dict: dict):
    # Proceeding with the model
    print(model_name)
    assert model_name == "distilbert_qa"
    model = load_model(model_name, model_dict)

    # Creating embeddings
    query_emb = model.encode(anchor, convert_to_tensor=True)[None, :]
    print("loading embeddings")
    corpus_emb = load_embeddings()

    # Getting hits
    hits = util.semantic_search(query_emb, corpus_emb, score_function=util.dot_score, top_k=n_answers)[0]
    filtered_posts = filter_questions("python")
    hits_dict = [filtered_posts[hit['corpus_id']] for hit in hits]
    # The query itself is appended as an extra "post" so it can be plotted too.
    hits_dict.append(dict(id='1', title=anchor, tags=['']))
    hits_emb = torch.stack([corpus_emb[hit['corpus_id']] for hit in hits])
    hits_emb = torch.cat((hits_emb, query_emb))

    # Dimensionality reduction with t-SNE
    tsne = TSNE(n_components=3, verbose=1, perplexity=15, n_iter=1000)
    tsne_results = tsne.fit_transform(hits_emb.cpu())
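    # tsne_results has shape (n_answers + 1, 3): one row per hit plus the query.
    # Note that scikit-learn requires perplexity < number of samples, so this
    # call assumes n_answers is at least 15.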

    df = pd.DataFrame(hits_dict)

    # Counting how often each tag appears across the retrieved posts
    tags = list(df['tags'])
    counter = Counter(tags[0])
    for i in tags[1:]:
        counter.update(i)
    df_tags = pd.DataFrame(counter.most_common(), columns=['Tag', 'Mentions'])
    # Skip the most frequent tag (typically "python", since the posts are filtered
    # on it) and keep the next four.
    most_common_tags = list(df_tags['Tag'])[1:5]

    # Label each post with the first of the most common tags it carries,
    # falling back to 'others'.
    labels = []
    for tags_list in df['tags']:
        for common_tag in most_common_tags:
            if common_tag in tags_list:
                labels.append(common_tag)
                break
        else:
            labels.append('others')

    df['title'] = [post['title'] for post in hits_dict]
    df['labels'] = labels
    df['tsne_x'] = tsne_results[:, 0]
    df['tsne_y'] = tsne_results[:, 1]
    df['tsne_z'] = tsne_results[:, 2]
    df['size'] = [2 for _ in range(len(df))]

    # Making the query (the last row) bigger than the rest of the observations
    df.loc[len(df) - 1, 'size'] = 10
    df.loc[len(df) - 1, 'labels'] = 'QUERY'

    import plotly.express as px
    fig = px.scatter_3d(df, x='tsne_x', y='tsne_y', z='tsne_z', color='labels', size='size',
                        color_discrete_sequence=px.colors.qualitative.D3, hover_data=['title'])
    return fig
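
# Illustrative call (enough neighbours for the perplexity=15 t-SNE above):
#   fig = text_cluster("merge two dictionaries", 50, "distilbert_qa", model_dict)
#   fig.show()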