Commit b65a786
taskswithcode committed
Parent: fb73c83

Fixes
Files changed (16):
- app.py +23 -15
- doc_app_models.json +61 -1
- text-search-ada-doc-001_planets_qna_search.json +0 -0
- text-search-ada-doc-001_qna2_search.json +0 -0
- text-search-ada-doc-001_qna_search.json +0 -0
- text-search-babbage-doc-001_planets_qna_search.json +0 -0
- text-search-babbage-doc-001_qna2_search.json +0 -0
- text-search-babbage-doc-001_qna_search.json +0 -0
- text-search-curie-doc-001_planets_qna_search.json +0 -0
- text-search-curie-doc-001_qna2_search.json +0 -0
- text-search-curie-doc-001_qna_search.json +0 -0
- text-search-davinci-doc-001_planets_qna_search.json +0 -0
- text-search-davinci-doc-001_qna2_search.json +0 -0
- text-search-davinci-doc-001_qna_search.json +0 -0
- twc_embeddings.py +6 -6
- twc_openai_search.py +124 -0
app.py
CHANGED
@@ -6,6 +6,7 @@ from io import StringIO
 import pdb
 import json
 from twc_embeddings import HFModel,SimCSEModel,SGPTModel,CausalLMModel,SGPTQnAModel
+from twc_openai_search import OpenAIQnAModel
 import torch
 import requests
 import socket
@@ -59,7 +60,7 @@ def get_views(action):
 
 def construct_model_info_for_display(model_names):
     options_arr = []
-    markdown_str = f"<div style=\"font-size:16px; color: #2f2f2f; text-align: left\"><br/><b>Models evaluated ({len(model_names)})</b><br/><i>
+    markdown_str = f"<div style=\"font-size:16px; color: #2f2f2f; text-align: left\"><br/><b>Models evaluated ({len(model_names)})</b><br/><i>The selected models satisfy one or more of the following (1) state-of-the-art (2) the most downloaded models on Hugging Face (3) Large Language Models (e.g. GPT-3)</i></div>"
     markdown_str += f"<div style=\"font-size:2px; color: #2f2f2f; text-align: left\"><br/></div>"
     for node in model_names:
         options_arr .append(node["name"])
@@ -102,15 +103,15 @@ def load_model(model_name,model_class,load_model_name):
 
 
 @st.experimental_memo
-def cached_compute_similarity(sentences,_model,model_name,main_index):
-    texts,embeddings = _model.compute_embeddings(sentences,is_file=False)
+def cached_compute_similarity(input_file_name,sentences,_model,model_name,main_index):
+    texts,embeddings = _model.compute_embeddings(input_file_name,sentences,is_file=False)
     results = _model.output_results(None,texts,embeddings,main_index)
     return results
 
 
-def uncached_compute_similarity(sentences,_model,model_name,main_index):
+def uncached_compute_similarity(input_file_name,sentences,_model,model_name,main_index):
     with st.spinner('Computing vectors for sentences'):
-        texts,embeddings = _model.compute_embeddings(sentences,is_file=False)
+        texts,embeddings = _model.compute_embeddings(input_file_name,sentences,is_file=False)
         results = _model.output_results(None,texts,embeddings,main_index)
         #st.success("Similarity computation complete")
     return results
@@ -123,7 +124,7 @@ def get_model_info(model_names,model_name):
     return get_model_info(model_names,DEFAULT_HF_MODEL)
 
 
-def run_test(model_names,model_name,sentences,display_area,main_index,user_uploaded,custom_model):
+def run_test(model_names,model_name,input_file_name,sentences,display_area,main_index,user_uploaded,custom_model):
     display_area.text("Loading model:" + model_name)
     #Note. model_name may get mapped to new name in the call below for custom models
     orig_model_name = model_name
@@ -135,14 +136,18 @@ def run_test(model_names,model_name,sentences,display_area,main_index,user_uploa
     if ("Note" in model_info):
         fail_link = f"{model_info['Note']} [link]({model_info['alt_url']})"
         display_area.write(fail_link)
+    if (user_uploaded and "custom_load" in model_info and model_info["custom_load"] == "False"):
+        fail_link = f"{model_info['Note']} [link]({model_info['alt_url']})"
+        display_area.write(fail_link)
+        return {"error":fail_link}
     model = load_model(model_name,model_info["class"],load_model_name)
     display_area.text("Model " + model_name + " load complete")
     try:
         if (user_uploaded):
-            results = uncached_compute_similarity(sentences,model,model_name,main_index)
+            results = uncached_compute_similarity(input_file_name,sentences,model,model_name,main_index)
         else:
             display_area.text("Computing vectors for sentences")
-            results = cached_compute_similarity(sentences,model,model_name,main_index)
+            results = cached_compute_similarity(input_file_name,sentences,model,model_name,main_index)
         display_area.text("Similarity computation complete")
         return results
 
@@ -254,15 +259,18 @@ def app_main(app_mode,example_files,model_name_files):
         run_model = selected_model
         st.session_state["model_name"] = selected_model
         st.session_state["main_index"] = main_index
-        results = run_test(model_names,run_model,sentences,display_area,main_index - 1,(uploaded_file is not None),(len(custom_model_selection) != 0))
+        results = run_test(model_names,run_model,st.session_state["file_name"],sentences,display_area,main_index - 1,(uploaded_file is not None),(len(custom_model_selection) != 0))
         display_area.empty()
         with display_area.container():
-            device = 'GPU' if torch.cuda.is_available() else 'CPU'
-            response_info = f"Computation time on {device}: {time.time() - start:.2f} secs for {len(sentences)} sentences"
-            if (len(custom_model_selection) != 0):
-                st.info("Custom model overrides model selection in step 2 above. So please clear the custom model text box to choose models from step 2")
-            display_results(sentences,main_index - 1,results,response_info,app_mode,run_model)
-            #st.json(results)
+            if ("error" in results):
+                st.error(results["error"])
+            else:
+                device = 'GPU' if torch.cuda.is_available() else 'CPU'
+                response_info = f"Computation time on {device}: {time.time() - start:.2f} secs for {len(sentences)} sentences"
+                if (len(custom_model_selection) != 0):
+                    st.info("Custom model overrides model selection in step 2 above. So please clear the custom model text box to choose models from step 2")
+                display_results(sentences,main_index - 1,results,response_info,app_mode,run_model)
+            #st.json(results)
             st.download_button(
                 label="Download results as json",
                 data= st.session_state["download_ready"] if st.session_state["download_ready"] != None else "",
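A note on the caching split above: the leading underscore on the _model parameter of cached_compute_similarity is the Streamlit convention that tells @st.experimental_memo not to hash that argument (model objects cannot be hashed), while the hashable arguments, including the new input_file_name, form the cache key. A minimal sketch of the pattern, using a hypothetical Model stand-in class:

    import streamlit as st

    class Model:
        # Stand-in for an embedding model object; Streamlit cannot hash it
        def compute(self, sentences):
            return [s.lower() for s in sentences]

    @st.experimental_memo
    def cached_compute(sentences, _model, model_name):
        # _model is excluded from the cache key because of the underscore;
        # sentences and model_name (both hashable) identify the cached entry
        return _model.compute(sentences)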
doc_app_models.json
CHANGED
@@ -108,7 +108,67 @@
     },
     "paper_url":"https://arxiv.org/abs/2104.08821v4",
     "mark":"True",
-    "class":"SimCSEModel","sota_link":"https://paperswithcode.com/sota/semantic-textual-similarity-on-sick"}
+    "class":"SimCSEModel","sota_link":"https://paperswithcode.com/sota/semantic-textual-similarity-on-sick"},
+    { "name":"GPT-3-175B (text-search-davinci-doc-001)" ,
+    "model":"text-search-davinci-doc-001",
+    "fork_url":"https://openai.com/api/",
+    "orig_author_url":"https://openai.com/api/",
+    "orig_author":"OpenAI",
+    "sota_info": {
+        "task":"GPT-3 achieves strong zero-shot and few-shot performance on many NLP datasets etc.",
+        "sota_link":"https://paperswithcode.com/method/gpt-3"
+    },
+    "paper_url":"https://arxiv.org/abs/2005.14165v4",
+    "mark":"True",
+    "custom_load":"False",
+    "Note":"Custom file upload requires OpenAI API access to create embeddings. For API access, use this link ",
+    "alt_url":"https://openai.com/api/",
+    "class":"OpenAIQnAModel","sota_link":"https://arxiv.org/abs/2005.14165v4"},
+    { "name":"GPT-3-6.7B (text-search-curie-doc-001)" ,
+    "model":"text-search-curie-doc-001",
+    "fork_url":"https://openai.com/api/",
+    "orig_author_url":"https://openai.com/api/",
+    "orig_author":"OpenAI",
+    "sota_info": {
+        "task":"GPT-3 achieves strong zero-shot and few-shot performance on many NLP datasets etc.",
+        "sota_link":"https://paperswithcode.com/method/gpt-3"
+    },
+    "paper_url":"https://arxiv.org/abs/2005.14165v4",
+    "mark":"True",
+    "custom_load":"False",
+    "Note":"Custom file upload requires OpenAI API access to create embeddings. For API access, use this link ",
+    "alt_url":"https://openai.com/api/",
+    "class":"OpenAIQnAModel","sota_link":"https://arxiv.org/abs/2005.14165v4"},
+    { "name":"GPT-3-1.3B (text-search-babbage-doc-001)" ,
+    "model":"text-search-babbage-doc-001",
+    "fork_url":"https://openai.com/api/",
+    "orig_author_url":"https://openai.com/api/",
+    "orig_author":"OpenAI",
+    "sota_info": {
+        "task":"GPT-3 achieves strong zero-shot and few-shot performance on many NLP datasets etc.",
+        "sota_link":"https://paperswithcode.com/method/gpt-3"
+    },
+    "paper_url":"https://arxiv.org/abs/2005.14165v4",
+    "mark":"True",
+    "custom_load":"False",
+    "Note":"Custom file upload requires OpenAI API access to create embeddings. For API access, use this link ",
+    "alt_url":"https://openai.com/api/",
+    "class":"OpenAIQnAModel","sota_link":"https://arxiv.org/abs/2005.14165v4"},
+    { "name":"GPT-3-350M (text-search-ada-doc-001)" ,
+    "model":"text-search-ada-doc-001",
+    "fork_url":"https://openai.com/api/",
+    "orig_author_url":"https://openai.com/api/",
+    "orig_author":"OpenAI",
+    "sota_info": {
+        "task":"GPT-3 achieves strong zero-shot and few-shot performance on many NLP datasets etc.",
+        "sota_link":"https://paperswithcode.com/method/gpt-3"
+    },
+    "paper_url":"https://arxiv.org/abs/2005.14165v4",
+    "mark":"True",
+    "custom_load":"False",
+    "Note":"Custom file upload requires OpenAI API access to create embeddings. For API access, use this link ",
+    "alt_url":"https://openai.com/api/",
+    "class":"OpenAIQnAModel","sota_link":"https://arxiv.org/abs/2005.14165v4"}
 
 
 ]
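Each new entry's "class" field ("OpenAIQnAModel") names the Python class that app.py's load_model instantiates for that model; the dispatch itself is not part of this diff. A hedged sketch of how such a string-to-class registry is typically wired (CLASS_MAP and instantiate are hypothetical names):

    from twc_embeddings import HFModel, SimCSEModel, SGPTModel, CausalLMModel, SGPTQnAModel
    from twc_openai_search import OpenAIQnAModel

    # Hypothetical registry mapping the "class" strings in doc_app_models.json
    # to the classes imported at the top of app.py
    CLASS_MAP = {
        "HFModel": HFModel,
        "SimCSEModel": SimCSEModel,
        "SGPTModel": SGPTModel,
        "CausalLMModel": CausalLMModel,
        "SGPTQnAModel": SGPTQnAModel,
        "OpenAIQnAModel": OpenAIQnAModel,
    }

    def instantiate(model_info):
        # model_info is one entry from doc_app_models.json
        model = CLASS_MAP[model_info["class"]]()
        model.init_model(model_info["model"])
        return model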
text-search-ada-doc-001_planets_qna_search.json ADDED
The diff for this file is too large to render. See raw diff.

text-search-ada-doc-001_qna2_search.json ADDED
The diff for this file is too large to render. See raw diff.

text-search-ada-doc-001_qna_search.json ADDED
The diff for this file is too large to render. See raw diff.

text-search-babbage-doc-001_planets_qna_search.json ADDED
The diff for this file is too large to render. See raw diff.

text-search-babbage-doc-001_qna2_search.json ADDED
The diff for this file is too large to render. See raw diff.

text-search-babbage-doc-001_qna_search.json ADDED
The diff for this file is too large to render. See raw diff.

text-search-curie-doc-001_planets_qna_search.json ADDED
The diff for this file is too large to render. See raw diff.

text-search-curie-doc-001_qna2_search.json ADDED
The diff for this file is too large to render. See raw diff.

text-search-curie-doc-001_qna_search.json ADDED
The diff for this file is too large to render. See raw diff.

text-search-davinci-doc-001_planets_qna_search.json ADDED
The diff for this file is too large to render. See raw diff.

text-search-davinci-doc-001_qna2_search.json ADDED
The diff for this file is too large to render. See raw diff.

text-search-davinci-doc-001_qna_search.json ADDED
The diff for this file is too large to render. See raw diff.
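The twelve *_search.json files added above are the embedding caches written by OpenAIQnAModel.compute_embeddings in twc_openai_search.py (new in this commit): the cache name is the doc model name, an underscore, the input file's base name without its extension, and a "_search.json" suffix. A worked example of that derivation (the input path example_files/planets_qna.txt is assumed for illustration):

    d_model_name = "text-search-ada-doc-001"
    input_file_name = "example_files/planets_qna.txt"   # assumed path
    base = input_file_name.split('/')[-1]               # "planets_qna.txt"
    cache = d_model_name + '_' + '.'.join(base.split('.')[:-1]) + "_search.json"
    print(cache)   # text-search-ada-doc-001_planets_qna_search.json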
twc_embeddings.py
CHANGED
@@ -32,7 +32,7 @@ class CausalLMModel:
         self.model.eval()
         self.prompt = 'Documents are searched to find matches with the same content.\nThe document "{}" is a good search result for "'
 
-    def compute_embeddings(self,input_data,is_file):
+    def compute_embeddings(self,input_file_name,input_data,is_file):
         if (self.debug):
             print("Computing embeddings for:", input_data[:20])
         model = self.model
@@ -160,7 +160,7 @@ class SGPTQnAModel:
 
         return embeddings
 
-    def compute_embeddings(self,input_data,is_file):
+    def compute_embeddings(self,input_file_name,input_data,is_file):
         if (self.debug):
             print("Computing embeddings for:", input_data[:20])
         model = self.model
@@ -215,7 +215,7 @@ class SimCSEModel:
         self.tokenizer = AutoTokenizer.from_pretrained(model_name)
         self.model = AutoModel.from_pretrained(model_name)
 
-    def compute_embeddings(self,input_data,is_file):
+    def compute_embeddings(self,input_file_name,input_file,input_data,is_file):
         texts = read_text(input_data) if is_file == True else input_data
         inputs = self.tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
         with torch.no_grad():
@@ -266,7 +266,7 @@ class SGPTModel:
         # Deactivate Dropout (There is no dropout in the above models so it makes no difference here but other SGPT models may have dropout)
         self.model.eval()
 
-    def compute_embeddings(self,input_data,is_file):
+    def compute_embeddings(self,input_file_name,input_data,is_file):
         if (self.debug):
             print("Computing embeddings for:", input_data[:20])
         model = self.model
@@ -353,7 +353,7 @@ class HFModel:
         input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
         return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
 
-    def compute_embeddings(self,input_data,is_file):
+    def compute_embeddings(self,input_file_name,input_data,is_file):
         #print("Computing embeddings for:", input_data[:20])
         model = self.model
         tokenizer = self.tokenizer
@@ -403,5 +403,5 @@ if __name__ == '__main__':
     results = parser.parse_args()
     obj = HFModel()
     obj.init_model(results.model)
-    texts, embeddings = obj.compute_embeddings(results.input,is_file = True)
+    texts, embeddings = obj.compute_embeddings(results.input,results.input,is_file = True)
     results = obj.output_results(results.output,texts,embeddings)
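The unchanged context lines in the HFModel hunk show the masked mean pooling these models use to turn token vectors into one sentence embedding: positions where attention_mask is 0 are zeroed out, the rest are summed over the sequence and divided by the (clamped) count of real tokens. A standalone sketch with random tensors:

    import torch

    token_embeddings = torch.randn(2, 5, 8)              # (batch, seq_len, hidden)
    attention_mask = torch.tensor([[1, 1, 1, 0, 0],
                                   [1, 1, 1, 1, 1]])     # (batch, seq_len)

    # Expand the mask over the hidden dimension, zero out padding positions,
    # then average over the unmasked tokens only
    mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    sentence_embeddings = torch.sum(token_embeddings * mask, 1) / torch.clamp(mask.sum(1), min=1e-9)
    print(sentence_embeddings.shape)                     # torch.Size([2, 8])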
twc_openai_search.py
ADDED
@@ -0,0 +1,124 @@
+from scipy.spatial.distance import cosine
+import argparse
+import json
+import os
+import openai
+import pdb
+
+def read_text(input_file):
+    arr = open(input_file).read().split("\n")
+    return arr[:-1]
+
+
+class OpenAIQnAModel:
+    def __init__(self):
+        self.debug = False
+        self.q_model_name = None
+        self.d_model_name = None
+        self.skip_key = True
+        print("In OpenAI API constructor")
+
+
+    def init_model(self,model_name = None):
+        #print("OpenAI: Init model",model_name)
+        openai.api_key = os.getenv("OPENAI_API_KEY")
+        if (openai.api_key == None):
+            openai.api_key = ""
+            print("API key not set")
+
+        if (len(openai.api_key) == 0 and not self.skip_key):
+            print("Open API key not set")
+
+        if (model_name is None):
+            self.d_model_name = "text-search-ada-doc-001"
+        else:
+            self.d_model_name = model_name
+        self.q_model_name = self.construct_query_model_name(self.d_model_name)
+        print(f"OpenAI: Init model complete :query model {self.q_model_name} doc:{self.d_model_name}")
+
+    def construct_query_model_name(self,d_model_name):
+        return d_model_name.replace('-doc-','-query-')
+
+
+    def compute_embeddings(self,input_file_name,input_data,is_file):
+        if (len(openai.api_key) == 0 and not self.skip_key):
+            print("Open API key not set")
+            return [],[]
+        #print("In compute embeddings after key check")
+        in_file = input_file_name.split('/')[-1]
+        in_file = self.d_model_name + '_' + '.'.join(in_file.split('.')[:-1]) + "_search.json"
+        cached = False
+        try:
+            fp = open(in_file)
+            cached = True
+            embeddings = json.load(fp)
+            q_embeddings = [embeddings[0]]
+            d_embeddings = embeddings[1:]
+            print("Using cached embeddings")
+        except:
+            pass
+
+        texts = read_text(input_data) if is_file == True else input_data
+        queries = [texts[0]]
+        docs = texts[1:]
+
+        if (not cached):
+            print(f"Computing embeddings for {input_file_name} and query model {self.q_model_name}")
+            query_embeds = openai.Embedding.create(
+                input=queries,
+                model=self.q_model_name
+            )
+            print(f"Computing embeddings for {input_file_name} and doc model {self.q_model_name}")
+            doc_embeds = openai.Embedding.create(
+                input=docs,
+                model=self.d_model_name
+            )
+            q_embeddings = []
+            d_embeddings = []
+            for i in range(len(query_embeds['data'])):
+                q_embeddings.append(query_embeds['data'][i]['embedding'])
+            for i in range(len(doc_embeds['data'])):
+                d_embeddings.append(doc_embeds['data'][i]['embedding'])
+        if (not cached):
+            embeddings = q_embeddings + d_embeddings
+            with open(in_file,"w") as fp:
+                json.dump(embeddings,fp)
+        return texts,(q_embeddings,d_embeddings)
+
+    def output_results(self,output_file,texts,embeddings,main_index = 0):
+        # Calculate cosine similarities
+        # Cosine similarities are in [-1, 1]. Higher means more similar
+        query_embeddings = embeddings[0]
+        doc_embeddings = embeddings[1]
+        cosine_dict = {}
+        queries = [texts[0]]
+        docs = texts[1:]
+        if (self.debug):
+            print("Total sentences",len(texts))
+        for i in range(len(docs)):
+            cosine_dict[docs[i]] = 1 - cosine(query_embeddings[0], doc_embeddings[i])
+
+        if (self.debug):
+            print("Input sentence:",texts[main_index])
+        sorted_dict = dict(sorted(cosine_dict.items(), key=lambda item: item[1],reverse = True))
+        if (self.debug):
+            for key in sorted_dict:
+                print("Cosine similarity with \"%s\" is: %.3f" % (key, sorted_dict[key]))
+        if (output_file is not None):
+            with open(output_file,"w") as fp:
+                fp.write(json.dumps(sorted_dict,indent=0))
+        return sorted_dict
+
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='OpenAI model for document search embeddings ',formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+    parser.add_argument('-input', action="store", dest="input",required=True,help="Input file with sentences")
+    parser.add_argument('-output', action="store", dest="output",default="output.txt",help="Output file with results")
+    parser.add_argument('-model', action="store", dest="model",default="text-search-ada-doc-001",help="model name")
+
+    results = parser.parse_args()
+    obj = OpenAIQnAModel()
+    obj.init_model(results.model)
+    texts, embeddings = obj.compute_embeddings(results.input,results.input,is_file = True)
+    results = obj.output_results(results.output,texts,embeddings)
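The __main__ block above doubles as a usage example; the same flow works programmatically. In the sketch below the input file name qna.txt is assumed and OPENAI_API_KEY must be set in the environment; note that compute_embeddings treats the file's first line as the query and the remaining lines as documents, and that the file name is passed twice (once for cache naming, once as the data source):

    from twc_openai_search import OpenAIQnAModel

    obj = OpenAIQnAModel()
    obj.init_model("text-search-ada-doc-001")
    texts, embeddings = obj.compute_embeddings("qna.txt", "qna.txt", is_file=True)
    # Returns the documents sorted by cosine similarity to the query, highest first
    results = obj.output_results("output.json", texts, embeddings)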