Update app.py
app.py
CHANGED
@@ -200,17 +200,11 @@ def calculate_weighted_score(scores: Dict[str, float]) -> float:

     return weighted_sum / total_weight if total_weight > 0 else 0

-def evaluate_text(raw_input: str, model_provider: LLMProvider, prompt_template: str) -> Dict[str, Any]:
-    """Evaluate a
-    #
-    reference_text = clean_text(raw_input)
-
-    # Generate candidate using the selected model and prompt
-    prompt = prompt_template.replace("{text}", raw_input)
-    candidate = model_provider.generate(prompt)
-
-    # Clean candidate output for consistent evaluation
-    cleaned_candidate = clean_text(candidate)
+def evaluate_text(reference_text: str, candidate_text: str) -> Dict[str, Any]:
+    """Evaluate a candidate text against a reference text"""
+    # Clean both texts for consistent evaluation
+    reference_text = clean_text(reference_text)
+    candidate_text = clean_text(candidate_text)

     # Initialize evaluation metrics
     scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
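Note: both the removed and the added version of evaluate_text lean on a clean_text() helper defined earlier in app.py, outside this diff. As a rough illustration of the kind of normalization the n-gram metrics below benefit from, a minimal stand-in could look like the sketch that follows; this is an assumption about its behavior, not the Space's actual implementation.

import re

def clean_text_sketch(text: str) -> str:
    """Hypothetical stand-in for app.py's clean_text(): trim the text,
    collapse whitespace runs, and drop characters that mostly add noise
    to n-gram overlap metrics."""
    text = text.strip()
    text = re.sub(r"\s+", " ", text)              # collapse whitespace runs
    text = re.sub(r"[^\w\s.,;:!?()-]", "", text)  # keep only basic punctuation
    return text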
@@ -223,7 +217,7 @@ def evaluate_text(raw_input: str, model_provider: LLMProvider, prompt_template:
         smooth = SmoothingFunction().method4
         bleu = sentence_bleu(
             [reference_text.split()],
-            cleaned_candidate.split(),
+            candidate_text.split(),
             smoothing_function=smooth
         )
         results["BLEU"] = bleu
@@ -233,7 +227,7 @@ def evaluate_text(raw_input: str, model_provider: LLMProvider, prompt_template:

     # ROUGE Score
     try:
-        rouge_scores = scorer.score(reference_text, cleaned_candidate)
+        rouge_scores = scorer.score(reference_text, candidate_text)
         rouge = (rouge_scores['rouge1'].fmeasure +
                  rouge_scores['rouge2'].fmeasure +
                  rouge_scores['rougeL'].fmeasure) / 3
@@ -246,7 +240,7 @@ def evaluate_text(raw_input: str, model_provider: LLMProvider, prompt_template:
     try:
         meteor = meteor_score(
             [reference_text.split()],
-            cleaned_candidate.split()
+            candidate_text.split()
         )
         results["METEOR"] = meteor
     except Exception as e:
@@ -256,7 +250,7 @@ def evaluate_text(raw_input: str, model_provider: LLMProvider, prompt_template:
     # BERTScore
     try:
         P, R, F1 = bert_score(
-            [cleaned_candidate],
+            [candidate_text],
             [reference_text],
             lang="en",
             verbose=False
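Note: the four reference-based metrics used above expect their arguments in different orders, which is easy to get backwards. The standalone sketch below mirrors the calls in this function (it assumes the rouge-score, nltk, and bert-score packages that app.py already relies on; NLTK's METEOR additionally needs the WordNet data downloaded).

from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.translate.meteor_score import meteor_score
from bert_score import score as bert_score

reference = "the quick brown fox jumps over the lazy dog"
candidate = "a quick brown fox jumped over the lazy dog"

# ROUGE: score(target, prediction)
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
rouge = scorer.score(reference, candidate)

# BLEU: list of tokenized references first, then the tokenized hypothesis
smooth = SmoothingFunction().method4
bleu = sentence_bleu([reference.split()], candidate.split(), smoothing_function=smooth)

# METEOR: list of tokenized references first, then the tokenized hypothesis
meteor = meteor_score([reference.split()], candidate.split())

# BERTScore: candidate list first, reference list second; returns (P, R, F1) tensors
P, R, F1 = bert_score([candidate], [reference], lang="en", verbose=False)

print(f"BLEU={bleu:.3f} METEOR={meteor:.3f} ROUGE-L={rouge['rougeL'].fmeasure:.3f} BERTScore-F1={F1.mean().item():.3f}")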
@@ -276,8 +270,8 @@ def evaluate_text(raw_input: str, model_provider: LLMProvider, prompt_template:
         relevancy_prompt = f"""
         On a scale of 0.0 to 1.0, how relevant is the following candidate text to the input?

-        Input: {
-        Candidate: {
+        Input: {reference_text[:500]}{'...' if len(reference_text) > 500 else ''}
+        Candidate: {candidate_text[:500]}{'...' if len(candidate_text) > 500 else ''}

         Provide only a single number between 0.0 and 1.0 with no explanation.
         """
@@ -292,8 +286,8 @@ def evaluate_text(raw_input: str, model_provider: LLMProvider, prompt_template:
         faithfulness_prompt = f"""
         On a scale of 0.0 to 1.0, how faithful is the candidate text to the original input in terms of factual accuracy?

-        Input: {
-        Candidate: {
+        Input: {reference_text[:500]}{'...' if len(reference_text) > 500 else ''}
+        Candidate: {candidate_text[:500]}{'...' if len(candidate_text) > 500 else ''}

         Provide only a single number between 0.0 and 1.0 with no explanation.
         """
@@ -309,8 +303,8 @@ def evaluate_text(raw_input: str, model_provider: LLMProvider, prompt_template:
         On a scale of 0.0 to 1.0, evaluate the overall quality of the candidate text.
         Consider accuracy, completeness, fluency, and professionalism.

-        Input: {
-        Candidate: {
+        Input: {reference_text[:500]}{'...' if len(reference_text) > 500 else ''}
+        Candidate: {candidate_text[:500]}{'...' if len(candidate_text) > 500 else ''}

         Provide only a single number between 0.0 and 1.0 with no explanation.
         """
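Note: all three judge prompts ask the model for a bare number between 0.0 and 1.0, but LLM replies do not always comply. app.py presumably parses the reply after model_provider.generate() somewhere outside these hunks; a defensive parser along the following lines is one way to do it. The helper name and behavior are illustrative assumptions, not code from the Space.

import re

def parse_judge_score(reply: str, default: float = 0.0) -> float:
    """Hypothetical helper: extract the first number from a judge reply
    and clamp it to [0.0, 1.0]; fall back to a default if nothing parses."""
    match = re.search(r"\d+(?:\.\d+)?", reply)
    if not match:
        return default
    try:
        value = float(match.group())
    except ValueError:
        return default
    return max(0.0, min(1.0, value))

# e.g. parse_judge_score("Score: 0.85") -> 0.85; parse_judge_score("I cannot rate this") -> 0.0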
@@ -349,20 +343,17 @@ def evaluate_text(raw_input: str, model_provider: LLMProvider, prompt_template:
         interpretation = "Poor performance (F) - likely needs complete rewriting"

     return {
-        "candidate": cleaned_candidate,
+        "candidate": candidate_text,
         "metrics": results,
         "normalized": normalized_scores,
         "weighted_score": weighted_score,
         "interpretation": interpretation
     }

-def process_input(input_text: str, file_upload, model_choice: str, prompt_choice: str, progress=gr.Progress()):
-    """Process
-    if
-        return "
-
-    if not input_text and not file_upload:
-        return "Please provide input text or upload a file.", [], ""
+def process_input(input_mode: str, reference_text: str, candidate_text: str, model_choice: str, prompt_choice: str, progress=gr.Progress()) -> Tuple[str, str, List[List[str]], str]:
+    """Process input based on selected mode"""
+    if not reference_text:
+        return "", "", [], "Please provide reference text."

     # Determine model provider
     if model_choice == "Gemini":
@@ -374,199 +365,63 @@ def process_input(input_text: str, file_upload, model_choice: str, prompt_choice

     # Check if model is available
     if not model_provider.available:
-        return f"Error: {model_choice} is not properly configured. Check your API key."
-
-    # Get prompt template
-    prompt_template = PROMPT_TEMPLATES[prompt_choice]
+        return "", "", [], f"Error: {model_choice} is not properly configured. Check your API key."

-    # Process
-    if
+    # Process based on input mode
+    if input_mode == "Reference Only (Generate Candidate)":
         progress(0.1, desc="Starting evaluation...")
-        time.sleep(0.
-
-        progress(0.3, desc="Generating rewritten content...")
-        time.sleep(0.2)
-
-        progress(0.6, desc="Calculating metrics...")
-        result = evaluate_text(input_text, model_provider, prompt_template)
+        time.sleep(0.1)

-        progress(0.
-        time.sleep(0.
+        progress(0.3, desc="Generating rewritten content using prompt...")
+        time.sleep(0.1)

-        #
-
-            ["Metric", "Raw Score", "Normalized"],
-            ["AnswerRelevancy", f"{result['metrics']['AnswerRelevancy']:.4f}", f"{result['normalized']['AnswerRelevancy']:.4f}"],
-            ["Faithfulness", f"{result['metrics']['Faithfulness']:.4f}", f"{result['normalized']['Faithfulness']:.4f}"],
-            ["GEval", f"{result['metrics']['GEval']:.4f}", f"{result['normalized']['GEval']:.4f}"],
-            ["BERTScore", f"{result['metrics']['BERTScore']:.4f}", f"{result['normalized']['BERTScore']:.4f}"],
-            ["ROUGE", f"{result['metrics']['ROUGE']:.4f}", f"{result['normalized']['ROUGE']:.4f}"],
-            ["BLEU", f"{result['metrics']['BLEU']:.4f}", f"{result['normalized']['BLEU']:.4f}"],
-            ["METEOR", f"{result['metrics']['METEOR']:.4f}", f"{result['normalized']['METEOR']:.4f}"],
-            ["Weighted Score", f"{result['weighted_score']:.4f}", "N/A"]
-        ]
+        # Get prompt template
+        prompt_template = PROMPT_TEMPLATES[prompt_choice]

-
-
-
-
-        )
-
-    # Process file upload
-    if file_upload:
-        progress(0.1, desc="Reading file...")
-        time.sleep(0.2)
+        # Generate candidate using the selected model and prompt
+        prompt = prompt_template.replace("{text}", reference_text)
+        candidate = model_provider.generate(prompt)
+        cleaned_candidate = clean_text(candidate)

-
-
-            df = pd.read_csv(file_upload.name)
-            progress(0.3, desc="Processing entries...")
-            time.sleep(0.2)
-        except Exception as e:
-            return f"Error reading file: {str(e)}", [], ""
+        progress(0.6, desc="Calculating metrics...")
+        time.sleep(0.1)

-        #
-
-        results = []
-        detailed_results = []
+        # Evaluate the generated candidate
+        result = evaluate_text(reference_text, cleaned_candidate)

-
-
-            progress((i + 1) / len(df) * 0.6 + 0.3, desc=f"Processing entry {i+1}/{len(df)}")
-            text = str(row[text_column])
-
-            try:
-                result = evaluate_text(text, model_provider, prompt_template)
-
-                # Add to results
-                results.append(result["weighted_score"])
-
-                # Store detailed results
-                detailed_results.append({
-                    "input_preview": text[:100] + "..." if len(text) > 100 else text,
-                    "weighted_score": result["weighted_score"],
-                    "interpretation": result["interpretation"],
-                    "candidate": result["candidate"]
-                })
-            except Exception as e:
-                print(f"Error processing entry {i}: {str(e)}")
-                results.append(0.0)
-                detailed_results.append({
-                    "input_preview": text[:100] + "..." if len(text) > 100 else text,
-                    "weighted_score": 0.0,
-                    "interpretation": "Error processing this entry",
-                    "candidate": ""
-                })
+        progress(0.9, desc="Finalizing results...")
+        time.sleep(0.1)

-
+    else: # "Both Reference and Candidate"
+        progress(0.3, desc="Calculating metrics...")
         time.sleep(0.2)

-        #
-
+        # Evaluate the provided candidate
+        result = evaluate_text(reference_text, candidate_text)

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            # Create metrics table for summary
-            metrics_table = [
-                ["Metric", "Value"],
-                ["Entries Processed", f"{len(results)}"],
-                ["Successful Entries", f"{len(valid_scores)}"],
-                ["Average Score", f"{avg_score:.4f}"],
-                ["Best Score", f"{max_score:.4f}"],
-                ["Worst Score", f"{min_score:.4f}"],
-                ["Overall Assessment", summary]
-            ]
-
-            return (
-                "Batch processing complete. Use the 'Show Details' button to see individual results.",
-                metrics_table,
-                summary_text
-            )
-        else:
-            return (
-                "No successful evaluations. Check your API configuration and input data.",
-                [["Error", "All evaluations failed"]],
-                "Error: No successful evaluations. Check your API configuration and input data."
-            )
-
-def show_detailed_results(input_text, file_upload, model_choice, prompt_choice, progress=gr.Progress()):
-    """Show detailed results for batch processing"""
-    if not file_upload:
-        return "No file uploaded for batch processing."
-
-    progress(0.1, desc="Reading file...")
-    time.sleep(0.1)
-
-    # Read the file
-    df = pd.read_csv(file_upload.name)
-    text_column = df.columns[0]
-
-    progress(0.3, desc="Determining model provider...")
-    time.sleep(0.1)
-
-    # Determine model provider
-    if model_choice == "Gemini":
-        model_provider = GeminiProvider("gemini-1.5-flash-latest")
-    elif model_choice == "Llama-3-70b":
-        model_provider = GroqProvider("llama3-70b-8192")
-    else: # Llama-3-8b
-        model_provider = GroqProvider("llama3-8b-8192")
-
-    progress(0.5, desc="Getting prompt template...")
-    time.sleep(0.1)
-
-    # Get prompt template
-    prompt_template = PROMPT_TEMPLATES[prompt_choice]
-
-    progress(0.7, desc="Processing entries...")
-    time.sleep(0.1)
-
-    # Process each entry
-    results = []
-    for i, row in enumerate(df.iterrows()):
-        _, row = row # Unpack the tuple
-        text = str(row[text_column])
-        try:
-            result = evaluate_text(text, model_provider, prompt_template)
-            results.append({
-                "Input Preview": text[:100] + "..." if len(text) > 100 else text,
-                "Weighted Score": f"{result['weighted_score']:.4f}",
-                "Interpretation": result['interpretation'],
-                "Candidate Text": result['candidate']
-            })
-        except:
-            results.append({
-                "Input Preview": text[:100] + "..." if len(text) > 100 else text,
-                "Weighted Score": "Error",
-                "Interpretation": "Processing error",
-                "Candidate Text": ""
-            })
-        progress(0.7 + (i + 1) / len(df) * 0.3, desc=f"Processing entry {i+1}/{len(df)}")
-
-    progress(1.0, desc="Completed!")
-    return gr.Dataframe(value=pd.DataFrame(results))
+        progress(0.8, desc="Finalizing results...")
+        time.sleep(0.1)
+        cleaned_candidate = clean_text(candidate_text)
+
+    # Format metrics for display
+    metrics_table = [
+        ["Metric", "Raw Score", "Normalized"],
+        ["AnswerRelevancy", f"{result['metrics']['AnswerRelevancy']:.4f}", f"{result['normalized']['AnswerRelevancy']:.4f}"],
+        ["Faithfulness", f"{result['metrics']['Faithfulness']:.4f}", f"{result['normalized']['Faithfulness']:.4f}"],
+        ["GEval", f"{result['metrics']['GEval']:.4f}", f"{result['normalized']['GEval']:.4f}"],
+        ["BERTScore", f"{result['metrics']['BERTScore']:.4f}", f"{result['normalized']['BERTScore']:.4f}"],
+        ["ROUGE", f"{result['metrics']['ROUGE']:.4f}", f"{result['normalized']['ROUGE']:.4f}"],
+        ["BLEU", f"{result['metrics']['BLEU']:.4f}", f"{result['normalized']['BLEU']:.4f}"],
+        ["METEOR", f"{result['metrics']['METEOR']:.4f}", f"{result['normalized']['METEOR']:.4f}"],
+        ["Weighted Score", f"{result['weighted_score']:.4f}", "N/A"]
+    ]
+
+    return (
+        reference_text,
+        result["candidate"],
+        metrics_table,
+        f"Hybrid Score: {result['weighted_score']:.4f} - {result['interpretation']}"
+    )

 # Create Gradio interface
 with gr.Blocks(title="LLM Evaluation Framework", theme=gr.themes.Soft()) as demo:
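Note: in the "Reference Only" branch above, the candidate is produced by substituting the reference text into the chosen template with a plain str.replace on the literal token "{text}", not with str.format, so any other braces in a template pass through untouched. A small illustration (the template string here is made up; the real PROMPT_TEMPLATES entries live elsewhere in app.py):

# Hypothetical template in the style of PROMPT_TEMPLATES; the Space's actual templates are not shown in this diff.
template = "Rewrite the following text in a concise, professional tone. Return only the rewritten text.\n\n{text}"

reference_text = "pls fix the report asap, it's full of errors"
prompt = template.replace("{text}", reference_text)  # literal token swap, no str.format
print(prompt)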
@@ -576,17 +431,38 @@ with gr.Blocks(title="LLM Evaluation Framework", theme=gr.themes.Soft()) as demo
     with gr.Row():
         with gr.Column(scale=1):
             gr.Markdown("### 📥 Input Options")
-
-
+
+            input_mode = gr.Radio(
+                ["Reference Only (Generate Candidate)", "Both Reference and Candidate"],
+                label="Input Mode",
+                value="Reference Only (Generate Candidate)",
+                elem_id="input-mode"
+            )
+
+            reference_text = gr.Textbox(
+                label="Reference Text",
                 lines=10,
-                placeholder="Enter text to evaluate...",
-                elem_id="
+                placeholder="Enter reference text to evaluate against...",
+                elem_id="reference-text"
             )
-
-
-
-
-
+
+            # Conditionally show candidate text box
+            with gr.Group(visible=False) as candidate_group:
+                candidate_text = gr.Textbox(
+                    label="Candidate Text",
+                    lines=10,
+                    placeholder="Enter candidate text to evaluate...",
+                    elem_id="candidate-text"
+                )
+
+            # Update visibility of candidate text box based on input mode
+            def update_candidate_visibility(mode):
+                return gr.update(visible=(mode == "Both Reference and Candidate"))
+
+            input_mode.change(
+                fn=update_candidate_visibility,
+                inputs=input_mode,
+                outputs=candidate_group
             )

             gr.Markdown("### ⚙️ Configuration")
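Note: the new input-mode control uses a standard Gradio pattern: wrap the optional fields in a gr.Group(visible=False) and flip its visibility from a .change handler that returns gr.update. A minimal self-contained sketch of just that pattern (assuming a recent Gradio release where gr.update(visible=...) is accepted as an event output):

import gradio as gr

with gr.Blocks() as demo:
    mode = gr.Radio(
        ["Reference Only (Generate Candidate)", "Both Reference and Candidate"],
        value="Reference Only (Generate Candidate)",
        label="Input Mode",
    )
    with gr.Group(visible=False) as optional_group:
        gr.Textbox(label="Candidate Text", lines=4)

    def toggle(selected_mode):
        # Show the group only when the user will paste their own candidate
        return gr.update(visible=(selected_mode == "Both Reference and Candidate"))

    mode.change(fn=toggle, inputs=mode, outputs=optional_group)

if __name__ == "__main__":
    demo.launch()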
@@ -607,12 +483,22 @@ with gr.Blocks(title="LLM Evaluation Framework", theme=gr.themes.Soft()) as demo
             submit_btn = gr.Button("Evaluate", variant="primary", size="lg", elem_id="submit-btn")

         with gr.Column(scale=2):
-            gr.Markdown("###
-
-
-
-
-
+            gr.Markdown("### 📄 Text Comparison")
+
+            with gr.Tabs():
+                with gr.TabItem("Reference Text"):
+                    reference_output = gr.Textbox(
+                        label="Reference Text",
+                        lines=8,
+                        elem_id="reference-output"
+                    )
+
+                with gr.TabItem("Candidate Text"):
+                    candidate_output = gr.Textbox(
+                        label="Candidate Text",
+                        lines=8,
+                        elem_id="candidate-output"
+                    )

             gr.Markdown("### 📊 Evaluation Metrics")
             metrics_output = gr.Dataframe(
@@ -626,34 +512,12 @@ with gr.Blocks(title="LLM Evaluation Framework", theme=gr.themes.Soft()) as demo
                 label="Summary",
                 elem_id="summary-output"
             )
-
-            detailed_results_btn = gr.Button("Show Detailed Results (Batch)", visible=False)
-            detailed_results = gr.Dataframe(visible=False)
-
-    # Update visibility of detailed results button
-    def update_detailed_results_visibility(file_upload, summary):
-        has_file = file_upload is not None
-        has_batch_results = "Processed" in summary and "entries" in summary
-        return gr.update(visible=has_file and has_batch_results)

     # Event handlers
     submit_btn.click(
         fn=process_input,
-        inputs=[input_text, file_upload, model_choice, prompt_choice],
-        outputs=[candidate_output, metrics_output, summary_output]
-    ).then(
-        fn=update_detailed_results_visibility,
-        inputs=[file_upload, summary_output],
-        outputs=detailed_results_btn
-    )
-
-    detailed_results_btn.click(
-        fn=show_detailed_results,
-        inputs=[input_text, file_upload, model_choice, prompt_choice],
-        outputs=detailed_results
-    ).then(
-        fn=lambda: gr.update(visible=True),
-        outputs=detailed_results
+        inputs=[input_mode, reference_text, candidate_text, model_choice, prompt_choice],
+        outputs=[reference_output, candidate_output, metrics_output, summary_output]
     )

     # Add interpretation guide in an accordion
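Note: this wiring only works because process_input's four-item return (reference text, candidate, metrics table, summary) lines up positionally with the outputs list; Gradio matches returned values to output components by order, not by name. A reduced sketch of that contract, with a stand-in handler instead of the real process_input:

import gradio as gr

def handler(reference: str):
    # Stand-in for process_input: one return value per output component, in the same order
    metrics_table = [["Metric", "Raw Score", "Normalized"]]
    return reference, "candidate text would go here", metrics_table, "summary would go here"

with gr.Blocks() as demo:
    reference_text = gr.Textbox(label="Reference Text")
    submit_btn = gr.Button("Evaluate")
    reference_output = gr.Textbox(label="Reference Text")
    candidate_output = gr.Textbox(label="Candidate Text")
    metrics_output = gr.Dataframe()
    summary_output = gr.Textbox(label="Summary")

    submit_btn.click(
        fn=handler,
        inputs=[reference_text],
        outputs=[reference_output, candidate_output, metrics_output, summary_output],
    )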