SLM-RAG-Arena

Running on Zero

App Files Files Community

oliver-aizip committed on May 5

Commit

6b26b26

1 Parent(s): c4fe1db

proper threaded generation interrupt

Browse files

Files changed (1) hide show

utils/models.py +118 -35

utils/models.py CHANGED Viewed

@@ -2,6 +2,9 @@ import torch
 from transformers import AutoTokenizer, AutoModelForCausalLM, StoppingCriteria, StoppingCriteriaList
 from .prompts import format_rag_prompt
 from .shared import generation_interrupt
 models = {
     "Qwen2.5-1.5b-Instruct": "qwen/qwen2.5-1.5b-instruct",
@@ -42,84 +45,164 @@ def generate_summaries(example, model_a_name, model_b_name):
     if generation_interrupt.is_set():
         return "", ""
-    summary_a = run_inference(models[model_a_name], context_text, question)
-    if generation_interrupt.is_set():
         return summary_a, ""
-    summary_b = run_inference(models[model_b_name], context_text, question)
     return summary_a, summary_b
-def run_inference(model_name, context, question):
     """
-    Run inference using the specified model.
     """
     if generation_interrupt.is_set():
-        return ""
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
     try:
         tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side="left", token=True)
         accepts_sys = (
             "System role not supported" not in tokenizer.chat_template
         )
         if tokenizer.pad_token is None:
             tokenizer.pad_token = tokenizer.eos_token
         if generation_interrupt.is_set():
-            return ""
         model = AutoModelForCausalLM.from_pretrained(
             model_name, torch_dtype=torch.bfloat16, attn_implementation="eager", token=True
         ).to(device)
         text_input = format_rag_prompt(question, context, accepts_sys)
         if generation_interrupt.is_set():
-            return ""
         actual_input = tokenizer.apply_chat_template(
             text_input,
             return_tensors="pt",
             tokenize=True,
-            max_length=2048,
             add_generation_prompt=True,
         ).to(device)
         input_length = actual_input.shape[1]
         attention_mask = torch.ones_like(actual_input).to(device)
         if generation_interrupt.is_set():
-            return ""
         stopping_criteria = StoppingCriteriaList([InterruptCriteria(generation_interrupt)])
         with torch.inference_mode():
             outputs = model.generate(
                 actual_input,
                 attention_mask=attention_mask,
-                max_new_tokens=512,
                 pad_token_id=tokenizer.pad_token_id,
-                stopping_criteria=stopping_criteria
             )
         if generation_interrupt.is_set():
-            return ""
-        result = tokenizer.decode(outputs[0][input_length:], skip_special_tokens=True)
-        return result
     except Exception as e:
-        print(f"Error in inference: {e}")
-        return f"Error generating response: {str(e)[:100]}..."
     finally:
-        if 'model' in locals():
-            del model
-        if 'tokenizer' in locals():
-            del tokenizer
         if torch.cuda.is_available():
             torch.cuda.empty_cache()

 from transformers import AutoTokenizer, AutoModelForCausalLM, StoppingCriteria, StoppingCriteriaList
 from .prompts import format_rag_prompt
 from .shared import generation_interrupt
+import threading
+import queue
+import time # Added for sleep
 models = {
     "Qwen2.5-1.5b-Instruct": "qwen/qwen2.5-1.5b-instruct",
     if generation_interrupt.is_set():
         return "", ""
+    # Use a queue to get results from threads
+    result_queue_a = queue.Queue()
+    thread_a = threading.Thread(target=run_inference, args=(models[model_a_name], context_text, question, result_queue_a))
+    thread_a.start()
+    summary_a = ""
+    while thread_a.is_alive():
+        if generation_interrupt.is_set():
+            print(f"Interrupting model A ({model_a_name})...")
+            # The InterruptCriteria within the thread will handle stopping generate
+            # We return early from the main control flow.
+            thread_a.join(timeout=1.0) # Give thread a moment to potentially stop
+            return "", ""
+        try:
+            summary_a = result_queue_a.get(timeout=0.1) # Check queue periodically
+            break # Got result
+        except queue.Empty:
+            continue # Still running, check interrupt again
+    # If thread finished but we didn't get a result (e.g., interrupted just before putting in queue)
+    if not summary_a and not result_queue_a.empty():
+         summary_a = result_queue_a.get_nowait()
+    elif not summary_a and generation_interrupt.is_set(): # Check interrupt again if thread finished quickly
+         return "", ""
+    if generation_interrupt.is_set(): # Check between models
         return summary_a, ""
+    # --- Model B ---
+    result_queue_b = queue.Queue()
+    thread_b = threading.Thread(target=run_inference, args=(models[model_b_name], context_text, question, result_queue_b))
+    thread_b.start()
+    summary_b = ""
+    while thread_b.is_alive():
+        if generation_interrupt.is_set():
+            print(f"Interrupting model B ({model_b_name})...")
+            thread_b.join(timeout=1.0)
+            return summary_a, "" # Return summary_a obtained so far
+        try:
+            summary_b = result_queue_b.get(timeout=0.1)
+            break
+        except queue.Empty:
+            continue
+    if not summary_b and not result_queue_b.empty():
+        summary_b = result_queue_b.get_nowait()
+    elif not summary_b and generation_interrupt.is_set():
+         return summary_a, ""
     return summary_a, summary_b
+# Modified run_inference to run in a thread and use a queue for results
def run_inference(model_name, context, question, result_queue):
    """
    Run inference using the specified model. Designed to be run in a thread.

    The outcome is reported through ``result_queue`` rather than a return
    value. Every exit path handled here puts exactly one string on the queue:

    * ``""`` when ``generation_interrupt`` is observed set at any checkpoint
      (including right after generation, in which case the possibly partial
      output is discarded);
    * the decoded completion (prompt tokens stripped) on success;
    * ``"Error generating response: ..."`` (message cut to 100 characters)
      when an ``Exception`` is raised while loading or generating.

    Args:
        model_name: Identifier passed to ``from_pretrained`` for both the
            tokenizer and the model.
        context: Context text forwarded to ``format_rag_prompt``.
        question: Question text forwarded to ``format_rag_prompt``.
        result_queue: Object with a ``put`` method (a ``queue.Queue`` in the
            caller) that receives the single result string.
    """
    # Check interrupt at the very beginning of the thread
    if generation_interrupt.is_set():
        result_queue.put("")
        return

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Pre-bind every name released in ``finally``. Early returns and
    # exceptions can leave the ``try`` body before some of them are assigned,
    # and ``del`` on an unbound local raises UnboundLocalError, which would
    # abort the cleanup and skip the CUDA cache flush.
    model = None
    tokenizer = None
    actual_input = None
    attention_mask = None
    outputs = None

    try:
        tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side="left", token=True)

        # A missing/empty chat template is treated as "no system role".
        chat_template = tokenizer.chat_template
        accepts_sys = (
            "System role not supported" not in chat_template
            if chat_template else False
        )

        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token

        # Check interrupt before loading the model
        if generation_interrupt.is_set():
            result_queue.put("")
            return

        model = AutoModelForCausalLM.from_pretrained(
            model_name, torch_dtype=torch.bfloat16, attn_implementation="eager", token=True
        ).to(device)
        model.eval()  # Set model to evaluation mode

        text_input = format_rag_prompt(question, context, accepts_sys)

        # Check interrupt before tokenization/template application
        if generation_interrupt.is_set():
            result_queue.put("")
            return

        # NOTE(review): ``truncation=True`` is not passed, so ``max_length``
        # presumably does not shorten over-long prompts -- confirm against the
        # transformers version in use before relying on a 2048-token cap.
        actual_input = tokenizer.apply_chat_template(
            text_input,
            return_tensors="pt",
            tokenize=True,
            max_length=2048,
            add_generation_prompt=True,
        ).to(device)

        # Number of prompt tokens; used below to slice them off the output.
        input_length = actual_input.shape[1]
        # ones_like already allocates on the same device as actual_input.
        attention_mask = torch.ones_like(actual_input)

        # Check interrupt before generation
        if generation_interrupt.is_set():
            result_queue.put("")
            return

        # Lets an interrupt raised mid-generation stop ``generate`` early.
        stopping_criteria = StoppingCriteriaList([InterruptCriteria(generation_interrupt)])

        with torch.inference_mode():
            outputs = model.generate(
                actual_input,
                attention_mask=attention_mask,
                max_new_tokens=512,
                pad_token_id=tokenizer.pad_token_id,
                stopping_criteria=stopping_criteria,
                do_sample=True,
                temperature=0.6,
                top_p=0.9,
            )

        # Check interrupt immediately after generation finishes or stops
        if generation_interrupt.is_set():
            # Discard potentially partial result if interrupted
            result = ""
        else:
            # Decode the generated tokens, excluding the input tokens
            result = tokenizer.decode(outputs[0][input_length:], skip_special_tokens=True)

        result_queue.put(result)

    except Exception as e:
        print(f"Error in inference thread for {model_name}: {e}")
        # Put error message in queue for the main thread to handle/display
        result_queue.put(f"Error generating response: {str(e)[:100]}...")

    finally:
        # Clean up resources within the thread. All names are guaranteed to
        # be bound (possibly to None), so this cannot raise.
        del model, tokenizer, actual_input, attention_mask, outputs
        if torch.cuda.is_available():
            torch.cuda.empty_cache()