Update mmlu_pro_eval_adapted.py
mmlu_pro_eval_adapted.py CHANGED (+3 -3)
@@ -149,7 +149,7 @@ def extract_final(text):
 def batch_inference(llm, sampling_params, inference_batch, tokenizer):
     start = time.time()
     outputs = llm.generate(inference_batch, sampling_params)
-    logging.info("Batch of size:
+    logging.info("Batch of size: %s. Time taken: %s", len(inference_batch), time.time() - start)
     response_batch = []
     pred_batch = []
     for output in outputs:
@@ -162,7 +162,7 @@ def batch_inference(llm, sampling_params, inference_batch, tokenizer):
 def batch_inference_debug_mode(llm, sampling_params, inference_batch, tokenizer):
     start = time.time()
     outputs = llm.generate(inference_batch, sampling_params)
-    logging.info("Batch of size:
+    logging.info("Batch of size: %s. Time taken: %s", len(inference_batch), time.time() - start)
     response_batch = []
     pred_batch = []
     input_token_counts = []
@@ -253,7 +253,7 @@ def calculate_accuracy(res):
 
 
 @torch.no_grad()
-def eval_cot(subject, model, tokenizer, val_df, test_df, num_shots=5, debug_mode=
+def eval_cot(subject, model, tokenizer, val_df, test_df, num_shots=5, debug_mode=False):
     """
     Evaluate model using chain-of-thought prompting.
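For context, a minimal sketch of how the updated batch_inference could read in full. Only the lines visible in the diff are confirmed; the loop body, the use of the file's extract_final helper, and the return shape are assumptions made here for illustration.

import logging
import time

def extract_final(text):
    # Stand-in for the repo's real extract_final helper (assumed behavior):
    # pull the final answer line out of a chain-of-thought completion.
    return text.strip().splitlines()[-1] if text.strip() else ""

def batch_inference(llm, sampling_params, inference_batch, tokenizer):
    # tokenizer is unused in this sketch; it is kept to match the
    # signature shown in the diff.
    start = time.time()
    # vLLM generates completions for the whole batch of prompts at once.
    outputs = llm.generate(inference_batch, sampling_params)
    # Lazy %-style formatting, as in the committed line: the message is
    # only interpolated if the INFO level is enabled, and a bad argument
    # cannot raise at the call site the way an f-string would.
    logging.info("Batch of size: %s. Time taken: %s",
                 len(inference_batch), time.time() - start)
    response_batch = []
    pred_batch = []
    for output in outputs:
        text = output.outputs[0].text  # first completion for this prompt
        response_batch.append(text)
        pred_batch.append(extract_final(text))
    return pred_batch, response_batch

The debug variant additionally collects input_token_counts per the diff, and the third hunk sets eval_cot's debug_mode default to False, so the standard (non-debug) path runs unless a caller opts in.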