Spaces: Running on Zero
Commit · fd247b7
1 Parent(s): 7f28f16

add gen prompt and kwargs dicts

utils/models.py CHANGED (+23 -4)

@@ -29,6 +29,8 @@ models = {

}

+tokenizer_cache = {}
+
# List of model names for easy access
model_names = list(models.keys())

@@ -101,13 +103,29 @@ def run_inference(model_name, context, question):

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    result = ""
-
+    tokenizer_kwargs = {
+        "add_generation_prompt": True,
+    } # make sure qwen3 doesn't use thinking
+    generation_kwargs = {
+        "max_new_tokens": 512,
+    }
    if "qwen3" in model_name.lower():
        print(f"Recognized {model_name} as a Qwen3 model. Setting enable_thinking=False.")
-
+        tokenizer_kwargs["enable_thinking"] = False
+        generation_kwargs["enable_thinking"] = False

    try:
-
+        if model_name in tokenizer_cache:
+            tokenizer = tokenizer_cache[model_name]
+        else:
+            tokenizer = AutoTokenizer.from_pretrained(
+                model_name,
+                padding_side="left",
+                token=True,
+                kwargs=tokenizer_kwargs
+            )
+            tokenizer_cache[model_name] = tokenizer
+
        accepts_sys = (
            "System role not supported" not in tokenizer.chat_template
            if tokenizer.chat_template else False # Handle missing chat_template
@@ -126,6 +144,7 @@ def run_inference(model_name, context, question):
            tokenizer=tokenizer,
            device_map='auto',
            trust_remote_code=True,
+            torch_dtype=torch.bfloat16,
        )

        text_input = format_rag_prompt(question, context, accepts_sys)
@@ -134,7 +153,7 @@ def run_inference(model_name, context, question):
        if generation_interrupt.is_set():
            return ""

-        outputs = pipe(text_input, max_new_tokens=512)
+        outputs = pipe(text_input, max_new_tokens=512, generate_kwargs=generation_kwargs)
        result = outputs[0]['generated_text'][-1]['content']

    except Exception as e:
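The module-level tokenizer_cache avoids re-loading a tokenizer on every call to run_inference. A minimal sketch of the same pattern in isolation, assuming transformers is installed; the helper name get_cached_tokenizer and the example model id are illustrative, not part of the commit:

from transformers import AutoTokenizer

tokenizer_cache = {}

def get_cached_tokenizer(model_name: str):
    # Load each tokenizer once and reuse it on later inference calls.
    if model_name not in tokenizer_cache:
        tokenizer_cache[model_name] = AutoTokenizer.from_pretrained(
            model_name,
            padding_side="left",  # same padding side as in the diff above
        )
    return tokenizer_cache[model_name]

# Illustrative usage:
# tok = get_cached_tokenizer("Qwen/Qwen3-0.6B")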
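For Qwen3, the Qwen model cards document enable_thinking as an argument of tokenizer.apply_chat_template rather than of the tokenizer constructor or of model.generate, so the flag normally takes effect where the chat template is applied; whether the dicts above reach that call depends on how format_rag_prompt and the pipeline forward them. A hedged sketch of that documented path, with message contents that are illustrative only (the Space's actual prompt comes from format_rag_prompt):

messages = [
    {"role": "user", "content": f"Context:\n{context}\n\nQuestion: {question}"},
]

# enable_thinking=False asks the Qwen3 chat template to skip the <think> block.
text_input = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
    enable_thinking=False,
)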
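One way a generation_kwargs dict like the one above is commonly consumed is by unpacking it into the pipeline call, since the text-generation pipeline forwards unrecognized call-time keyword arguments to model.generate. A sketch under that assumption, keeping only max_new_tokens in the dict and assuming text_input is a list of chat messages, as implied by the [-1]['content'] indexing in the diff:

generation_kwargs = {"max_new_tokens": 512}

# Extra keyword arguments on the call are passed through to model.generate().
outputs = pipe(text_input, **generation_kwargs)

# With chat-style input, generated_text holds the whole conversation and the
# last entry is the model's reply.
result = outputs[0]["generated_text"][-1]["content"]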