Commit f687064
Parent(s): 21f4f83
Updating to use env var model

backend/query_llm.py  (+7 -6)  CHANGED
@@ -5,14 +5,15 @@ import gradio as gr
 from huggingface_hub import InferenceClient
 from transformers import AutoTokenizer
 
-
+MODEL = getenv("MODEL")
+tokenizer = AutoTokenizer.from_pretrained(MODEL)
 
 temperature = 0.9
 top_p = 0.6
 repetition_penalty = 1.2
 
 text_client = InferenceClient(
-
+    MODEL,
     token=getenv("HUGGING_FACE_HUB_TOKEN")
 )
 
@@ -38,7 +39,7 @@ def format_prompt(message: str) -> str:
 def generate(prompt: str, history: str, temperature: float = 0.9, max_new_tokens: int = 256,
              top_p: float = 0.95, repetition_penalty: float = 1.0) -> Generator[str, None, str]:
     """
-    Generate a sequence of tokens based on a given prompt and history using
+    Generate a sequence of tokens based on a given prompt and history using MODEL client.
 
     Args:
         prompt (str): The initial prompt for the text generation.
@@ -77,12 +78,12 @@ def generate(prompt: str, history: str, temperature: float = 0.9, max_new_tokens
 
     except Exception as e:
         if "Too Many Requests" in str(e):
-            print("ERROR: Too many requests on
-            gr.Warning("Unfortunately
+            print(f"ERROR: Too many requests on {MODEL} client")
+            gr.Warning(f"Unfortunately {MODEL} is unable to process")
             return "Unfortunately, I am not able to process your request now."
         else:
             print("Unhandled Exception:", str(e))
-            gr.Warning("Unfortunately
+            gr.Warning(f"Unfortunately {MODEL} is unable to process")
             return "I do not know what happened, but I couldn't understand you."
 
     return output
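
For reference, a minimal, hypothetical sketch of how the refactored module could be exercised after this change: MODEL and HUGGING_FACE_HUB_TOKEN are read at import time, so they must be present in the environment (for example as Space secrets/variables) before backend/query_llm.py is imported. The repo id and token below are placeholders, not values taken from this Space.

    from os import environ

    # Set the environment before importing the module; query_llm.py reads
    # MODEL and HUGGING_FACE_HUB_TOKEN when it is first imported.
    environ["MODEL"] = "HuggingFaceH4/zephyr-7b-beta"  # placeholder model repo id
    environ["HUGGING_FACE_HUB_TOKEN"] = "hf_xxx"       # placeholder token

    from backend.query_llm import generate

    # generate() returns a generator of intermediate strings as the
    # InferenceClient streams tokens for the prompt.
    for partial in generate("What does this Space do?", history=""):
        print(partial, end="", flush=True)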