SLM-RAG-Arena

Running on Zero

App Files Community

aizip-dev committed on May 22

Commit

bb6bbaf

verified ·

1 Parent(s): 217c4d4

Update interruption method

Browse files

Files changed (1) hide show

utils/models.py +37 -9

utils/models.py CHANGED Viewed

@@ -62,6 +62,7 @@ def generate_summaries(example, model_a_name, model_b_name):
     Generates summaries for the given example using the assigned models sequentially.
     """
     if generation_interrupt.is_set():
         return "", ""
     context_text = ""
@@ -69,6 +70,11 @@ def generate_summaries(example, model_a_name, model_b_name):
     if "full_contexts" in example and example["full_contexts"]:
         for i, ctx in enumerate(example["full_contexts"]):
             content = ""
             # Extract content from either dict or string
@@ -92,17 +98,22 @@ def generate_summaries(example, model_a_name, model_b_name):
     question = example.get("question", "")
     if generation_interrupt.is_set():
         return "", ""
     # Run model A
     summary_a = run_inference(models[model_a_name], context_text, question)
     if generation_interrupt.is_set():
         return summary_a, ""
     # Run model B
     summary_b = run_inference(models[model_b_name], context_text, question)
     return summary_a, summary_b
@@ -114,6 +125,7 @@ def run_inference(model_name, context, question):
     """
     # Check interrupt at the beginning
     if generation_interrupt.is_set():
         return ""
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
@@ -134,6 +146,11 @@ def run_inference(model_name, context, question):
         if model_name in tokenizer_cache:
             tokenizer = tokenizer_cache[model_name]
         else:
             # Common arguments for tokenizer loading
             tokenizer_load_args = {"padding_side": "left", "token": True}
@@ -155,6 +172,7 @@ def run_inference(model_name, context, question):
         # Check interrupt before loading the model
         if generation_interrupt.is_set():
             return ""
         # Create interrupt criteria for this generation
@@ -162,19 +180,21 @@ def run_inference(model_name, context, question):
         print("REACHED HERE BEFORE pipe")
         print(f"Loading model {model_name}...")
         if "bitnet" in model_name.lower():
             bitnet_model = BitNetForCausalLM.from_pretrained(
                 model_name,
-                #device_map="auto",
                 torch_dtype=torch.bfloat16,
-                #trust_remote_code=True,
             )
             pipe = pipeline(
                 "text-generation",
                 model=bitnet_model,
                 tokenizer=tokenizer,
-                #device_map="auto",
-                #trust_remote_code=True,
                 torch_dtype=torch.bfloat16,
                 model_kwargs={
                     "attn_implementation": "eager",
@@ -206,13 +226,20 @@ def run_inference(model_name, context, question):
                 torch_dtype=torch.bfloat16,
             )
         text_input = format_rag_prompt(question, context, accepts_sys)
         if "Gemma-3".lower() in model_name.lower():
             print("REACHED HERE BEFORE GEN")
             result = pipe(
                 text_input,
                 max_new_tokens=512,
-                stopping_criteria=[interrupt_criteria],  # Direct parameter for pipelines
                 generation_kwargs={"skip_special_tokens": True}
             )[0]["generated_text"]
@@ -238,6 +265,7 @@ def run_inference(model_name, context, question):
             with torch.inference_mode():
                 # Check interrupt before generation
                 if generation_interrupt.is_set():
                     return ""
                 output_sequences = model.generate(
@@ -246,7 +274,7 @@ def run_inference(model_name, context, question):
                     max_new_tokens=512,
                     eos_token_id=tokenizer.eos_token_id,
                     pad_token_id=tokenizer.pad_token_id,
-                    stopping_criteria=[interrupt_criteria]  # Direct parameter for model.generate
                 )
             generated_token_ids = output_sequences[0][prompt_tokens_length:]
@@ -278,17 +306,17 @@ def run_inference(model_name, context, question):
             )
             input_length = len(formatted)
-            # Check interrupt before generation
             outputs = pipe(
                 formatted,
                 max_new_tokens=512,
-                stopping_criteria=[interrupt_criteria],  # Direct parameter for pipelines
                 generation_kwargs={"skip_special_tokens": True}
             )
-            # print(outputs[0]['generated_text'])
             result = outputs[0]["generated_text"][input_length:]
     except Exception as e:
         print(f"Error in inference for {model_name}: {e}")
         print(traceback.format_exc())

     Generates summaries for the given example using the assigned models sequentially.
     """
     if generation_interrupt.is_set():
+        print("Generation interrupted before starting")
         return "", ""
     context_text = ""
     if "full_contexts" in example and example["full_contexts"]:
         for i, ctx in enumerate(example["full_contexts"]):
+            # Check interrupt during context processing
+            if generation_interrupt.is_set():
+                print("Generation interrupted during context processing")
+                return "", ""
             content = ""
             # Extract content from either dict or string
     question = example.get("question", "")
     if generation_interrupt.is_set():
+        print("Generation interrupted before model A")
         return "", ""
+    print(f"Starting inference for Model A: {model_a_name}")
     # Run model A
     summary_a = run_inference(models[model_a_name], context_text, question)
     if generation_interrupt.is_set():
+        print("Generation interrupted after model A, before model B")
         return summary_a, ""
+    print(f"Starting inference for Model B: {model_b_name}")
     # Run model B
     summary_b = run_inference(models[model_b_name], context_text, question)
+    print("Both models completed successfully")
     return summary_a, summary_b
     """
     # Check interrupt at the beginning
     if generation_interrupt.is_set():
+        print(f"Inference interrupted before starting for {model_name}")
         return ""
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
         if model_name in tokenizer_cache:
             tokenizer = tokenizer_cache[model_name]
         else:
+            # Check interrupt before loading tokenizer
+            if generation_interrupt.is_set():
+                print(f"Inference interrupted before loading tokenizer for {model_name}")
+                return ""
             # Common arguments for tokenizer loading
             tokenizer_load_args = {"padding_side": "left", "token": True}
         # Check interrupt before loading the model
         if generation_interrupt.is_set():
+            print(f"Inference interrupted before loading model {model_name}")
             return ""
         # Create interrupt criteria for this generation
         print("REACHED HERE BEFORE pipe")
         print(f"Loading model {model_name}...")
+        # Check interrupt before model loading
+        if generation_interrupt.is_set():
+            print(f"Inference interrupted during model loading for {model_name}")
+            return ""
         if "bitnet" in model_name.lower():
             bitnet_model = BitNetForCausalLM.from_pretrained(
                 model_name,
                 torch_dtype=torch.bfloat16,
             )
             pipe = pipeline(
                 "text-generation",
                 model=bitnet_model,
                 tokenizer=tokenizer,
                 torch_dtype=torch.bfloat16,
                 model_kwargs={
                     "attn_implementation": "eager",
                 torch_dtype=torch.bfloat16,
             )
+        # Final interrupt check before generation
+        if generation_interrupt.is_set():
+            print(f"Inference interrupted before generation for {model_name}")
+            return ""
         text_input = format_rag_prompt(question, context, accepts_sys)
+        print(f"Starting generation for {model_name}")
         if "Gemma-3".lower() in model_name.lower():
             print("REACHED HERE BEFORE GEN")
             result = pipe(
                 text_input,
                 max_new_tokens=512,
+                stopping_criteria=[interrupt_criteria],
                 generation_kwargs={"skip_special_tokens": True}
             )[0]["generated_text"]
             with torch.inference_mode():
                 # Check interrupt before generation
                 if generation_interrupt.is_set():
+                    print(f"Inference interrupted before torch generation for {model_name}")
                     return ""
                 output_sequences = model.generate(
                     max_new_tokens=512,
                     eos_token_id=tokenizer.eos_token_id,
                     pad_token_id=tokenizer.pad_token_id,
+                    stopping_criteria=[interrupt_criteria]
                 )
             generated_token_ids = output_sequences[0][prompt_tokens_length:]
             )
             input_length = len(formatted)
             outputs = pipe(
                 formatted,
                 max_new_tokens=512,
+                stopping_criteria=[interrupt_criteria],
                 generation_kwargs={"skip_special_tokens": True}
             )
             result = outputs[0]["generated_text"][input_length:]
+        print(f"Generation completed for {model_name}")
     except Exception as e:
         print(f"Error in inference for {model_name}: {e}")
         print(traceback.format_exc())