Gül Sena Altıntaş
committed on
Commit
·
15729bc
1
Parent(s):
279fdab
Now accepts multiline!
Browse files
- app.py +48 -87
- serve_on_killarney.sh +2 -2
app.py
CHANGED
|
@@ -63,8 +63,18 @@ PREDEFINED_MODELS = [
|
|
| 63 |
model_cache = {}
|
| 64 |
|
| 65 |
|
| 66 |
-
def
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 67 |
"""Parse the input dataset text into structured questions"""
|
|
|
|
| 68 |
|
| 69 |
def clean_cell(s: str) -> str:
|
| 70 |
return s.strip().replace("\r", "").replace("\n", " ").strip('"').strip()
|
|
@@ -75,12 +85,6 @@ def parse_dataset(text):
|
|
| 75 |
# Normalize line endings
|
| 76 |
text = text.replace("\r\n", "\n").replace("\r", "\n")
|
| 77 |
|
| 78 |
-
# Detect delimiter from first non-empty line
|
| 79 |
-
for line in text.splitlines():
|
| 80 |
-
if line.strip():
|
| 81 |
-
delimiter = "\t" if "\t" in line else ","
|
| 82 |
-
break
|
| 83 |
-
|
| 84 |
# Use csv.reader to handle quoted multi-line cells
|
| 85 |
reader = csv.reader(io.StringIO(text), delimiter=delimiter, quotechar='"')
|
| 86 |
|
|
@@ -112,67 +116,6 @@ def parse_dataset(text):
|
|
| 112 |
return questions, error_msg
|
| 113 |
|
| 114 |
|
| 115 |
-
def parse_datasetold(text):
|
| 116 |
-
"""Parse the input dataset text into structured questions"""
|
| 117 |
-
if not text.strip():
|
| 118 |
-
return [], "Please enter your dataset"
|
| 119 |
-
|
| 120 |
-
# Detect delimiter
|
| 121 |
-
sample_line = text.splitlines()[0]
|
| 122 |
-
delimiter = "\t" if "\t" in sample_line else ","
|
| 123 |
-
|
| 124 |
-
# Use csv.reader to correctly parse quotes & newlines
|
| 125 |
-
reader = csv.reader(io.StringIO(text), delimiter=delimiter)
|
| 126 |
-
|
| 127 |
-
questions = []
|
| 128 |
-
errors = []
|
| 129 |
-
for i, row in enumerate(reader, 1):
|
| 130 |
-
parts = [clean_cell(p) for p in row if p.strip()]
|
| 131 |
-
if len(parts) < 5:
|
| 132 |
-
errors.append(f"Line {i}: Not enough columns (need 5, got {len(parts)})")
|
| 133 |
-
continue
|
| 134 |
-
|
| 135 |
-
question = {
|
| 136 |
-
"question": parts[0],
|
| 137 |
-
"correct_answer": parts[1],
|
| 138 |
-
"choices": [parts[2], parts[3], parts[4]],
|
| 139 |
-
}
|
| 140 |
-
|
| 141 |
-
if question["correct_answer"] not in question["choices"]:
|
| 142 |
-
question["choices"].append(question["correct_answer"])
|
| 143 |
-
|
| 144 |
-
questions.append(question)
|
| 145 |
-
|
| 146 |
-
error_msg = "\n".join(errors) if errors else ""
|
| 147 |
-
return questions, error_msg
|
| 148 |
-
for i, line in enumerate(reader, 1):
|
| 149 |
-
# for i, line in enumerate(lines[1:], 2): # Start from line 2 (after header)
|
| 150 |
-
line = line.strip()
|
| 151 |
-
if not line:
|
| 152 |
-
continue
|
| 153 |
-
|
| 154 |
-
parts = [clean_text(part) for part in line.split(delimiter)]
|
| 155 |
-
|
| 156 |
-
if len(parts) < 5:
|
| 157 |
-
errors.append(f"Line {i}: Not enough columns (need 5, got {len(parts)})")
|
| 158 |
-
continue
|
| 159 |
-
|
| 160 |
-
question = {
|
| 161 |
-
"question": parts[0],
|
| 162 |
-
"correct_answer": parts[1],
|
| 163 |
-
"choices": [parts[2], parts[3], parts[4]],
|
| 164 |
-
}
|
| 165 |
-
|
| 166 |
-
# Ensure correct answer is in choices
|
| 167 |
-
if question["correct_answer"] not in question["choices"]:
|
| 168 |
-
question["choices"].append(question["correct_answer"])
|
| 169 |
-
|
| 170 |
-
questions.append(question)
|
| 171 |
-
|
| 172 |
-
error_msg = "\n".join(errors) if errors else ""
|
| 173 |
-
return questions, error_msg
|
| 174 |
-
|
| 175 |
-
|
| 176 |
def setup_tokenizer(model_path):
|
| 177 |
tokenizer_name = model_path
|
| 178 |
if "supertoken" in model_path:
|
|
@@ -403,7 +346,11 @@ def evaluate_model_on_questions(model_path, questions, progress_callback=None):
|
|
| 403 |
|
| 404 |
|
| 405 |
def run_evaluation(
|
| 406 |
-
dataset_text,
|
|
|
|
|
|
|
|
|
|
|
|
|
| 407 |
):
|
| 408 |
"""Main evaluation function"""
|
| 409 |
if not dataset_text.strip():
|
|
@@ -447,7 +394,7 @@ def run_evaluation(
|
|
| 447 |
)
|
| 448 |
|
| 449 |
# Parse dataset
|
| 450 |
-
questions, parse_error = parse_dataset(dataset_text)
|
| 451 |
|
| 452 |
if parse_error:
|
| 453 |
return (
|
|
@@ -976,22 +923,18 @@ def generate_csv_summary(questions, results, summary_stats):
|
|
| 976 |
# Sample datasets for quick testing
|
| 977 |
SAMPLE_DATASETS = {
|
| 978 |
"Custom (enter below)": "",
|
| 979 |
-
"LP": """
|
| 980 |
-
In which country is Llanfairpwllgwyngyllgogerychwyrndrobwllllantysiliogogogoch located? Wales Germany France Scotland
|
| 981 |
In which country is Llanfair pwllgwyngyll located? Wales Germany France Scotland
|
| 982 |
In which country is Llanfair PG located? Wales Germany France Scotland""",
|
| 983 |
-
"Simple Math": """
|
| 984 |
-
What is
|
| 985 |
-
What is
|
| 986 |
-
What is
|
| 987 |
-
|
| 988 |
-
|
| 989 |
-
What is the capital of
|
| 990 |
-
What is the capital of
|
| 991 |
-
What is the
|
| 992 |
-
What is the capital of Australia?,Canberra,Sydney,Melbourne,Perth""",
|
| 993 |
-
"Science Quiz": """Question,Correct Answer,Choice1,Choice2,Choice3
|
| 994 |
-
What is the chemical symbol for gold?,Au,Ag,Ca,K
|
| 995 |
Which planet is closest to the Sun?,Mercury,Venus,Earth,Mars
|
| 996 |
What is the speed of light?,299792458 m/s,300000000 m/s,2992458 m/s,299000000 m/s
|
| 997 |
What gas do plants absorb from the atmosphere?,Carbon dioxide,Oxygen,Nitrogen,Hydrogen""",
|
|
@@ -1035,11 +978,14 @@ css = """
|
|
| 1035 |
# }
|
| 1036 |
"""
|
| 1037 |
|
|
|
|
| 1038 |
# Create Gradio interface
|
| 1039 |
with gr.Blocks(
|
| 1040 |
title="🤖 Model Performance Comparison", theme=gr.themes.Soft(), css=css
|
| 1041 |
) as demo:
|
| 1042 |
-
gr.
|
|
|
|
|
|
|
| 1043 |
# 🤖 Model Performance Comparison Tool
|
| 1044 |
|
| 1045 |
Compare LLM performance on multiple-choice questions using Hugging Face models.
|
|
@@ -1052,7 +998,17 @@ with gr.Blocks(
|
|
| 1052 |
- Detailed question-by-question results
|
| 1053 |
- Performance charts and statistics
|
| 1054 |
""")
|
| 1055 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1056 |
with gr.Row():
|
| 1057 |
with gr.Column(scale=2):
|
| 1058 |
# Sample dataset selector
|
|
@@ -1178,7 +1134,12 @@ bigscience/bloom-560m""",
|
|
| 1178 |
|
| 1179 |
evaluate_btn.click(
|
| 1180 |
fn=run_evaluation,
|
| 1181 |
-
inputs=[
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1182 |
outputs=[
|
| 1183 |
summary_output,
|
| 1184 |
detailed_results,
|
|
|
|
| 63 |
model_cache = {}
|
| 64 |
|
| 65 |
|
| 66 |
+
def normalize_delimiter(delim: str) -> str:
|
| 67 |
+
delim = delim.strip()
|
| 68 |
+
if delim == "\\t": # user typed literal \t
|
| 69 |
+
return "\t"
|
| 70 |
+
if len(delim) != 1:
|
| 71 |
+
raise ValueError(f"Delimiter must be a single character, got {repr(delim)}")
|
| 72 |
+
return delim
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
def parse_dataset(text, delimiter: str = "\t"):
|
| 76 |
"""Parse the input dataset text into structured questions"""
|
| 77 |
+
delimiter = normalize_delimiter(delimiter)
|
| 78 |
|
| 79 |
def clean_cell(s: str) -> str:
|
| 80 |
return s.strip().replace("\r", "").replace("\n", " ").strip('"').strip()
|
|
|
|
| 85 |
# Normalize line endings
|
| 86 |
text = text.replace("\r\n", "\n").replace("\r", "\n")
|
| 87 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 88 |
# Use csv.reader to handle quoted multi-line cells
|
| 89 |
reader = csv.reader(io.StringIO(text), delimiter=delimiter, quotechar='"')
|
| 90 |
|
|
|
|
| 116 |
return questions, error_msg
|
| 117 |
|
| 118 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 119 |
def setup_tokenizer(model_path):
|
| 120 |
tokenizer_name = model_path
|
| 121 |
if "supertoken" in model_path:
|
|
|
|
| 346 |
|
| 347 |
|
| 348 |
def run_evaluation(
|
| 349 |
+
dataset_text,
|
| 350 |
+
selected_predefined,
|
| 351 |
+
custom_models_text="",
|
| 352 |
+
delimiter: str = "\t",
|
| 353 |
+
progress=gr.Progress(),
|
| 354 |
):
|
| 355 |
"""Main evaluation function"""
|
| 356 |
if not dataset_text.strip():
|
|
|
|
| 394 |
)
|
| 395 |
|
| 396 |
# Parse dataset
|
| 397 |
+
questions, parse_error = parse_dataset(dataset_text, delimiter=delimiter)
|
| 398 |
|
| 399 |
if parse_error:
|
| 400 |
return (
|
|
|
|
| 923 |
# Sample datasets for quick testing
|
| 924 |
SAMPLE_DATASETS = {
|
| 925 |
"Custom (enter below)": "",
|
| 926 |
+
"LP": """In which country is Llanfairpwllgwyngyllgogerychwyrndrobwllllantysiliogogogoch located? Wales Germany France Scotland
|
|
|
|
| 927 |
In which country is Llanfair pwllgwyngyll located? Wales Germany France Scotland
|
| 928 |
In which country is Llanfair PG located? Wales Germany France Scotland""",
|
| 929 |
+
"Simple Math": """What is 2+2? 4 3 2 5
|
| 930 |
+
What is 5*3? 15 12 16 18
|
| 931 |
+
What is 10-7? 3 7 4 2
|
| 932 |
+
What is 8/2? 4 3 2 5""",
|
| 933 |
+
"World Capitals": """What is the capital of France? Paris London Berlin Rome
|
| 934 |
+
What is the capital of Japan? Tokyo Seoul Beijing Bangkok
|
| 935 |
+
What is the capital of Brazil? Brasília Rio de Janeiro São Paulo Salvador
|
| 936 |
+
What is the capital of Australia? Canberra Sydney Melbourne Perth""",
|
| 937 |
+
"Science Quiz": """What is the chemical symbol for gold?,Au,Ag,Ca,K
|
|
|
|
|
|
|
|
|
|
| 938 |
Which planet is closest to the Sun?,Mercury,Venus,Earth,Mars
|
| 939 |
What is the speed of light?,299792458 m/s,300000000 m/s,2992458 m/s,299000000 m/s
|
| 940 |
What gas do plants absorb from the atmosphere?,Carbon dioxide,Oxygen,Nitrogen,Hydrogen""",
|
|
|
|
| 978 |
# }
|
| 979 |
"""
|
| 980 |
|
| 981 |
+
|
| 982 |
# Create Gradio interface
|
| 983 |
with gr.Blocks(
|
| 984 |
title="🤖 Model Performance Comparison", theme=gr.themes.Soft(), css=css
|
| 985 |
) as demo:
|
| 986 |
+
with gr.Row():
|
| 987 |
+
with gr.Column(scale=2):
|
| 988 |
+
gr.Markdown("""
|
| 989 |
# 🤖 Model Performance Comparison Tool
|
| 990 |
|
| 991 |
Compare LLM performance on multiple-choice questions using Hugging Face models.
|
|
|
|
| 998 |
- Detailed question-by-question results
|
| 999 |
- Performance charts and statistics
|
| 1000 |
""")
|
| 1001 |
+
with gr.Column(scale=1):
|
| 1002 |
+
# with gr.Accordion("Delimiter Options"):
|
| 1003 |
+
gr.Markdown("""
|
| 1004 |
+
Enter the delimiter used in your dataset:
|
| 1005 |
+
""")
|
| 1006 |
+
delimiter_selector = gr.Textbox(
|
| 1007 |
+
label="Delimiter",
|
| 1008 |
+
placeholder="Enter a delimiter, e.g., , or \\t",
|
| 1009 |
+
value="\\t", # default
|
| 1010 |
+
lines=1,
|
| 1011 |
+
)
|
| 1012 |
with gr.Row():
|
| 1013 |
with gr.Column(scale=2):
|
| 1014 |
# Sample dataset selector
|
|
|
|
| 1134 |
|
| 1135 |
evaluate_btn.click(
|
| 1136 |
fn=run_evaluation,
|
| 1137 |
+
inputs=[
|
| 1138 |
+
dataset_input,
|
| 1139 |
+
predefined_selector,
|
| 1140 |
+
custom_models_input,
|
| 1141 |
+
delimiter_selector,
|
| 1142 |
+
],
|
| 1143 |
outputs=[
|
| 1144 |
summary_output,
|
| 1145 |
detailed_results,
|
serve_on_killarney.sh
CHANGED
|
@@ -16,8 +16,8 @@ NODES=1
|
|
| 16 |
NTASKS_PER_NODE=1
|
| 17 |
CPUS_PER_TASK=4
|
| 18 |
### request more memory to run on more models
|
| 19 |
-
MEM="
|
| 20 |
-
TIME="
|
| 21 |
GRADIO_PORT=7861
|
| 22 |
script_location="$APP_DIR/$SCRIPT_NAME"
|
| 23 |
|
|
|
|
| 16 |
NTASKS_PER_NODE=1
|
| 17 |
CPUS_PER_TASK=4
|
| 18 |
### request more memory to run on more models
|
| 19 |
+
MEM="64G"
|
| 20 |
+
TIME="06:00:00"
|
| 21 |
GRADIO_PORT=7861
|
| 22 |
script_location="$APP_DIR/$SCRIPT_NAME"
|
| 23 |
|