Update mmlu_pro_eval_adapted.py
mmlu_pro_eval_adapted.py CHANGED (+3 -3)
@@ -149,7 +149,7 @@ def extract_final(text):
 def batch_inference(llm, sampling_params, inference_batch, tokenizer):
     start = time.time()
     outputs = llm.generate(inference_batch, sampling_params)
-    logging.info("Batch of size:
+    logging.info("Batch of size: %s. Time taken: %s", len(inference_batch), time.time() - start)
     response_batch = []
     pred_batch = []
     for output in outputs:
@@ -162,7 +162,7 @@ def batch_inference(llm, sampling_params, inference_batch, tokenizer):
 def batch_inference_debug_mode(llm, sampling_params, inference_batch, tokenizer):
     start = time.time()
     outputs = llm.generate(inference_batch, sampling_params)
-    logging.info("Batch of size:
+    logging.info("Batch of size: %s. Time taken: %s", len(inference_batch), time.time() - start)
     response_batch = []
     pred_batch = []
     input_token_counts = []
@@ -253,7 +253,7 @@ def calculate_accuracy(res):
 
 
 @torch.no_grad()
-def eval_cot(subject, model, tokenizer, val_df, test_df, num_shots=5, debug_mode=
+def eval_cot(subject, model, tokenizer, val_df, test_df, num_shots=5, debug_mode=False):
     """
     Evaluate model using chain-of-thought prompting.
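For context, a minimal sketch of how the updated batch_inference could read in full. Only the lines visible in the diff are confirmed; the loop body, the use of the file's extract_final helper, and the return shape are assumptions made here for illustration.

import logging
import time

def extract_final(text):
    # Stand-in for the repo's real extract_final helper (assumed behavior):
    # pull the final answer line out of a chain-of-thought completion.
    return text.strip().splitlines()[-1] if text.strip() else ""

def batch_inference(llm, sampling_params, inference_batch, tokenizer):
    # tokenizer is unused in this sketch; it is kept to match the
    # signature shown in the diff.
    start = time.time()
    # vLLM generates completions for the whole batch of prompts at once.
    outputs = llm.generate(inference_batch, sampling_params)
    # Lazy %-style formatting, as in the committed line: the message is
    # only interpolated if the INFO level is enabled, and a bad argument
    # cannot raise at the call site the way an f-string would.
    logging.info("Batch of size: %s. Time taken: %s",
                 len(inference_batch), time.time() - start)
    response_batch = []
    pred_batch = []
    for output in outputs:
        text = output.outputs[0].text  # first completion for this prompt
        response_batch.append(text)
        pred_batch.append(extract_final(text))
    return pred_batch, response_batch

The debug variant additionally collects input_token_counts per the diff, and the third hunk sets eval_cot's debug_mode default to False, so the standard (non-debug) path runs unless a caller opts in.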