# Copyright 2022 Christopher K. Schmitt
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from sentence_transformers import SentenceTransformer
from sklearn.manifold import TSNE
from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_score, calinski_harabasz_score
from pathlib import Path
from bs4 import BeautifulSoup
from argparse import ArgumentParser
import matplotlib.pyplot as plt
import numpy as np
import nltk
# The list of huggingface transformers with tensorflow
# support and compatible tokenizers.
available_models = {
    "bert": "sentence-transformers/multi-qa-distilbert-cos-v1",
    "albert": "sentence-transformers/paraphrase-albert-small-v2",
    "roberta": "sentence-transformers/all-distilroberta-v1",
}

display_titles = {
    "bert": "BERT",
    "albert": "ALBERT",
    "roberta": "RoBERTa",
}
# Define the CLI interface for modeling our data with
# different transformer models. We want to control the
# type of the tokenizer and the transformer we use, as well
# as the input and output directories.
parser = ArgumentParser()
parser.add_argument("-m", "--model", choices=available_models.keys(), required=True)
parser.add_argument("-i", "--input", required=True)
parser.add_argument("-o", "--output", required=True)
args = parser.parse_args()

input_dir = args.input
output_dir = args.output
model_name = available_models[args.model]
display_name = display_titles[args.model]
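# Example invocation (the script filename and paths here are hypothetical):
#   python embed_and_cluster.py --model bert --input ./html_docs --output ./plots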
# To remove random glyphs and other noise, we only
# keep words that appear in the nltk words corpus.
nltk.download("words")
words = set(nltk.corpus.words.words())

def extract_words(document):
    tokens = nltk.wordpunct_tokenize(document)
    return " ".join(word.lower() for word in tokens if word.lower() in words)
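# For example, extract_words("Download complete xJ83 qq") should yield
# "download complete" (assuming both words appear in the corpus); the
# noise tokens are silently dropped.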
# Iterate over all of the files in the provided data
# directory. Parse each file with beautiful soup to pull
# the relevant text out of the markup, then drop any
# documents too short to carry signal.
data = Path(input_dir).iterdir()
data = filter(lambda doc: doc.is_file(), data)
data = map(lambda doc: doc.read_bytes(), data)
data = map(lambda doc: BeautifulSoup(doc, "html.parser"), data)
data = map(lambda doc: doc.get_text(), data)
data = filter(lambda doc: len(doc) > 0, data)
data = map(extract_words, data)
data = filter(lambda doc: len(doc) > 10, data)
data = list(data)
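# Note that map/filter build a lazy pipeline here: no file is opened or
# parsed until list() drains the whole chain in the final step.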
# Initialize the transformer model and compute all of the
# document embeddings as produced by bert and friends.
model = SentenceTransformer(model_name)
embeddings = model.encode(data, show_progress_bar=True)
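# embeddings is a 2D numpy array with one row per document
# (768-dimensional sentence vectors for the three models above).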
# Hyperparameter optimization: sweep DBSCAN's epsilon over a fine
# grid, scoring each clustering by silhouette score, Calinski-Harabasz
# score, and the number of outliers it produces.
eps_grid = np.arange(0.001, 1, 0.001)
silhouettes = []
outliers = []
ch = []

for eps in eps_grid:
    dbscan = DBSCAN(eps, metric="cosine", n_jobs=-1)
    dbscan = dbscan.fit_predict(embeddings)

    # Both scores need at least two distinct labels; fall back to 0
    # when DBSCAN lumps everything into a single cluster.
    if len(np.unique(dbscan)) > 1:
        silhouettes.append(silhouette_score(embeddings, dbscan, metric="cosine"))
        ch.append(calinski_harabasz_score(embeddings, dbscan))
    else:
        silhouettes.append(0)
        ch.append(0)

    # DBSCAN labels noise points -1
    outliers.append(len(dbscan[dbscan == -1]))
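# Of the three diagnostics recorded above, only the silhouette score
# drives the choice of epsilon below; the outlier counts and
# Calinski-Harabasz scores feed the sanity-check plots at the end.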
# Cluster once with the best epsilon, then squash the embeddings down
# to 2 dimensions with t-SNE for visualization, sweeping the perplexity.
# DBSCAN's noise points (label -1) are drawn in gray.
best = np.argmax(silhouettes)
dbscan = DBSCAN(eps_grid[best], metric="cosine", n_jobs=-1)
dbscan = dbscan.fit_predict(embeddings)

for p in range(15, 51):
    tsne = TSNE(n_components=2, perplexity=p, random_state=2, learning_rate="auto", init="pca", metric="cosine")
    tsne = tsne.fit_transform(embeddings)

    plt.figure()
    plt.scatter(tsne[dbscan != -1][:, 0], tsne[dbscan != -1][:, 1], s=0.5, c=dbscan[dbscan != -1], cmap="hsv")
    plt.scatter(tsne[dbscan == -1][:, 0], tsne[dbscan == -1][:, 1], s=0.5, c="#abb8c3")
    plt.title(f"{display_name} Embeddings Visualized with t-SNE (p = {p})")
    plt.savefig(f"{output_dir}/tsne_{p:02}.png", format="png", dpi=600)
    plt.close()
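# Finally, plot each diagnostic against the epsilon grid so the chosen
# epsilon (and how sharply it peaks) can be inspected by eye.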
plt.figure()
plt.plot(eps_grid, silhouettes, lw=0.5, color="#dc322f")
plt.xlabel("Epsilon")
plt.ylabel("Silhouette score")
plt.title("Optimizing Epsilon by Silhouette Score")
plt.savefig(f"{output_dir}/silhouettes.png", format="png", dpi=600)
plt.close()
plt.figure()
plt.plot(eps_grid, outliers, lw=0.5, color="#dc322f")
plt.xlabel("Epsilon")
plt.ylabel("Outliers")
plt.title("Optimizing Epsilon by Number of Outliers")
plt.savefig(f"{output_dir}/outliers.png", format="png", dpi=600)
plt.close()
plt.figure()
plt.plot(eps_grid, ch, lw=0.5, color="#dc322f")
plt.xlabel("Epsilon")
plt.ylabel("Calinski-Harabasz score")
plt.title("Optimizing Epsilon by Calinski-Harabasz Score")
plt.savefig(f"{output_dir}/calinski-harabasz.png", format="png", dpi=600)
plt.close()