Update app.py
app.py CHANGED
@@ -357,8 +357,8 @@ def evaluate_text(raw_input: str, model_provider: LLMProvider, prompt_template:
         "interpretation": interpretation
     }
 
-def process_input(input_text: str, file_upload, model_choice: str, prompt_choice: str) -> Tuple[str, List[List[str]], str]:
-    """Process either input text or uploaded file"""
+def process_input(input_text: str, file_upload, model_choice: str, prompt_choice: str, progress=gr.Progress()) -> Tuple[str, List[List[str]], str]:
+    """Process either input text or uploaded file with progress tracking"""
     if input_text and file_upload:
         return "Please use either text input or file upload, not both.", [], ""
 
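The substantive change in this first hunk is the new `progress=gr.Progress()` default parameter. Gradio treats such a parameter as an injection point: the framework passes a progress tracker into the handler at call time, and calling the tracker with a fraction between 0 and 1 updates the bar shown in the UI. A minimal, self-contained sketch of the pattern (the handler and labels are illustrative, not the app's real ones):

```python
import time

import gradio as gr

def evaluate(text: str, progress=gr.Progress()) -> str:
    # Gradio injects a tracker for any parameter whose default is gr.Progress();
    # calling it with a float in [0, 1] moves the bar in the browser.
    progress(0.2, desc="Scoring...")
    time.sleep(0.5)  # stand-in for real work
    progress(1.0, desc="Done")
    return text.upper()

demo = gr.Interface(fn=evaluate, inputs="text", outputs="text")

if __name__ == "__main__":
    demo.queue().launch()  # queuing lets progress updates stream to the client
```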
@@ -382,147 +382,151 @@ def process_input(input_text: str, file_upload, model_choice: str, prompt_choice
 
     # Process single text input
     if input_text:
-        …
-        )
+        progress(0.1, desc="Starting evaluation...")
+        time.sleep(0.2)
+
+        progress(0.3, desc="Generating rewritten content...")
+        time.sleep(0.2)
+
+        progress(0.6, desc="Calculating metrics...")
+        result = evaluate_text(input_text, model_provider, prompt_template)
+
+        progress(0.9, desc="Finalizing results...")
+        time.sleep(0.2)
+
+        # Format metrics for display
+        metrics_table = [
+            ["Metric", "Raw Score", "Normalized"],
+            ["AnswerRelevancy", f"{result['metrics']['AnswerRelevancy']:.4f}", f"{result['normalized']['AnswerRelevancy']:.4f}"],
+            ["Faithfulness", f"{result['metrics']['Faithfulness']:.4f}", f"{result['normalized']['Faithfulness']:.4f}"],
+            ["GEval", f"{result['metrics']['GEval']:.4f}", f"{result['normalized']['GEval']:.4f}"],
+            ["BERTScore", f"{result['metrics']['BERTScore']:.4f}", f"{result['normalized']['BERTScore']:.4f}"],
+            ["ROUGE", f"{result['metrics']['ROUGE']:.4f}", f"{result['normalized']['ROUGE']:.4f}"],
+            ["BLEU", f"{result['metrics']['BLEU']:.4f}", f"{result['normalized']['BLEU']:.4f}"],
+            ["METEOR", f"{result['metrics']['METEOR']:.4f}", f"{result['normalized']['METEOR']:.4f}"],
+            ["Weighted Score", f"{result['weighted_score']:.4f}", "N/A"]
+        ]
+
+        return (
+            result["candidate"],
+            metrics_table,
+            f"Hybrid Score: {result['weighted_score']:.4f} - {result['interpretation']}"
+        )
 
     # Process file upload
     if file_upload:
-        …
-        time.sleep(0.2)
-        # Read the file (assuming CSV with one column of text)
-        try:
-            …
-        except Exception as e:
-            …
-                # Add to results
-                results.append(result["weighted_score"])
-
-                # Store detailed results
-                detailed_results.append({
-                    "input_preview": text[:100] + "..." if len(text) > 100 else text,
-                    "weighted_score": result["weighted_score"],
-                    "interpretation": result["interpretation"],
-                    "candidate": result["candidate"]
-                })
-            except Exception as e:
-                print(f"Error processing entry {i}: {str(e)}")
-                results.append(0.0)
-                detailed_results.append({
-                    "input_preview": text[:100] + "..." if len(text) > 100 else text,
-                    "weighted_score": 0.0,
-                    "interpretation": "Error processing this entry",
-                    "candidate": ""
-                })
-
-        # Create
-        …
-            elif avg_score >= 0.50:
-                summary = "Adequate performance but needs refinement"
-            else:
-                summary = "Significant improvements needed"
-
-            # Format summary
-            summary_text = (
-                f"Processed {len(results)} entries ({len(valid_scores)} successful)\n"
-                f"Average Hybrid Score: {avg_score:.4f}\n"
-                f"Range: {min_score:.4f} - {max_score:.4f}\n\n"
-                f"{summary}"
-            )
-
-            # Create metrics table for summary
-            metrics_table = [
-                ["Metric", "Value"],
-                ["Entries Processed", f"{len(results)}"],
-                ["Successful Entries", f"{len(valid_scores)}"],
-                ["Average Score", f"{avg_score:.4f}"],
-                ["Best Score", f"{max_score:.4f}"],
-                ["Worst Score", f"{min_score:.4f}"],
-                ["Overall Assessment", summary]
-            ]
-
-            return (
-                "Batch processing complete. Use the 'Show Details' button to see individual results.",
-                metrics_table,
-                summary_text
-            )
-        else:
-            return (
-                "No successful evaluations. Check your API configuration and input data.",
-                [["Error", "All evaluations failed"]],
-                "Error: No successful evaluations. Check your API configuration and input data."
-            )
+        progress(0.1, desc="Reading file...")
+        time.sleep(0.2)
+
+        # Read the file (assuming CSV with one column of text)
+        try:
+            df = pd.read_csv(file_upload.name)
+            progress(0.3, desc="Processing entries...")
+            time.sleep(0.2)
+        except Exception as e:
+            return f"Error reading file: {str(e)}", [], ""
+
+        # Assuming the first column contains the text
+        text_column = df.columns[0]
+        results = []
+        detailed_results = []
+
+        # Process each entry with progress updates
+        for i, row in df.iterrows():
+            progress((i + 1) / len(df) * 0.6 + 0.3, desc=f"Processing entry {i+1}/{len(df)}")
+            text = str(row[text_column])
+
+            try:
+                result = evaluate_text(text, model_provider, prompt_template)
+
+                # Add to results
+                results.append(result["weighted_score"])
+
+                # Store detailed results
+                detailed_results.append({
+                    "input_preview": text[:100] + "..." if len(text) > 100 else text,
+                    "weighted_score": result["weighted_score"],
+                    "interpretation": result["interpretation"],
+                    "candidate": result["candidate"]
+                })
+            except Exception as e:
+                print(f"Error processing entry {i}: {str(e)}")
+                results.append(0.0)
+                detailed_results.append({
+                    "input_preview": text[:100] + "..." if len(text) > 100 else text,
+                    "weighted_score": 0.0,
+                    "interpretation": "Error processing this entry",
+                    "candidate": ""
+                })
+
+        progress(0.9, desc="Generating summary...")
+        time.sleep(0.2)
+
+        # Create results dataframe
+        results_df = pd.DataFrame(detailed_results)
+
+        # Generate summary statistics
+        valid_scores = [s for s in results if s > 0]
+        if valid_scores:
+            avg_score = sum(valid_scores) / len(valid_scores)
+            min_score = min(valid_scores)
+            max_score = max(valid_scores)
+
+            if avg_score >= 0.85:
+                summary = "Excellent performance across inputs"
+            elif avg_score >= 0.70:
+                summary = "Good performance with room for minor improvements"
+            elif avg_score >= 0.50:
+                summary = "Adequate performance but needs refinement"
+            else:
+                summary = "Significant improvements needed"
+
+            # Format summary
+            summary_text = (
+                f"Processed {len(results)} entries ({len(valid_scores)} successful)\n"
+                f"Average Hybrid Score: {avg_score:.4f}\n"
+                f"Range: {min_score:.4f} - {max_score:.4f}\n\n"
+                f"{summary}"
+            )
+
+            # Create metrics table for summary
+            metrics_table = [
+                ["Metric", "Value"],
+                ["Entries Processed", f"{len(results)}"],
+                ["Successful Entries", f"{len(valid_scores)}"],
+                ["Average Score", f"{avg_score:.4f}"],
+                ["Best Score", f"{max_score:.4f}"],
+                ["Worst Score", f"{min_score:.4f}"],
+                ["Overall Assessment", summary]
+            ]
+
+            return (
+                "Batch processing complete. Use the 'Show Details' button to see individual results.",
+                metrics_table,
+                summary_text
+            )
+        else:
+            return (
+                "No successful evaluations. Check your API configuration and input data.",
+                [["Error", "All evaluations failed"]],
+                "Error: No successful evaluations. Check your API configuration and input data."
+            )
 
-def show_detailed_results(input_text, file_upload, model_choice, prompt_choice):
+def show_detailed_results(input_text, file_upload, model_choice, prompt_choice, progress=gr.Progress()):
     """Show detailed results for batch processing"""
     if not file_upload:
         return "No file uploaded for batch processing."
 
+    progress(0.1, desc="Reading file...")
+    time.sleep(0.1)
+
     # Read the file
     df = pd.read_csv(file_upload.name)
     text_column = df.columns[0]
 
+    progress(0.3, desc="Determining model provider...")
+    time.sleep(0.1)
+
     # Determine model provider
     if model_choice == "Gemini":
         model_provider = GeminiProvider("gemini-1.5-flash-latest")
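In the batch branch above, `(i + 1) / len(df) * 0.6 + 0.3` maps per-row progress onto the 0.3 to 0.9 band of the bar, reserving the ends for file reading and summary generation. Gradio's tracker can also wrap an iterable via `progress.tqdm(...)`, which replaces that manual arithmetic; a sketch under that assumption (illustrative handler, not the app's code):

```python
import gradio as gr
import pandas as pd

def score_rows(file_upload, progress=gr.Progress()):
    df = pd.read_csv(file_upload.name)
    scores = []
    # progress.tqdm advances the bar once per yielded item, tqdm-style.
    for _, row in progress.tqdm(df.iterrows(), total=len(df), desc="Processing entries"):
        scores.append(len(str(row.iloc[0])))  # stand-in for evaluate_text
    return f"Processed {len(scores)} rows"
```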
@@ -531,12 +535,19 @@ def show_detailed_results(input_text, file_upload, model_choice, prompt_choice):
     else: # Llama-3-8b
         model_provider = GroqProvider("llama3-8b-8192")
 
+    progress(0.5, desc="Getting prompt template...")
+    time.sleep(0.1)
+
     # Get prompt template
     prompt_template = PROMPT_TEMPLATES[prompt_choice]
 
+    progress(0.7, desc="Processing entries...")
+    time.sleep(0.1)
+
     # Process each entry
     results = []
-    for …
+    for i, row in enumerate(df.iterrows()):
+        _, row = row  # Unpack the tuple
         text = str(row[text_column])
         try:
             result = evaluate_text(text, model_provider, prompt_template)
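The loop rewritten in this hunk unpacks the `(index, Series)` pair from `df.iterrows()` in a separate `_, row = row` step. Nested tuple unpacking gives the same 0-based counter `i` for the progress messages in one line (plain pandas, independent of the app):

```python
import pandas as pd

df = pd.DataFrame({"text": ["first entry", "second entry"]})

# enumerate(df.iterrows()) yields (i, (label, row_series));
# unpacking the inner pair inline avoids rebinding row afterwards.
for i, (_, row) in enumerate(df.iterrows()):
    print(i, str(row["text"]))
```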
@@ -553,7 +564,9 @@ def show_detailed_results(input_text, file_upload, model_choice, prompt_choice):
                 "Interpretation": "Processing error",
                 "Candidate Text": ""
             })
+        progress(0.7 + (i + 1) / len(df) * 0.3, desc=f"Processing entry {i+1}/{len(df)}")
 
+    progress(1.0, desc="Completed!")
     return gr.Dataframe(value=pd.DataFrame(results))
 
 # Create Gradio interface
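Throughout the diff the UI reports a single hybrid score, but the diff never shows how `evaluate_text` combines the seven metrics. The weights below are therefore hypothetical; this sketch only illustrates the weighted-average shape such a score typically has:

```python
# Hypothetical weights; the app's real ones are defined elsewhere in app.py
# and are not visible in this diff.
WEIGHTS = {
    "AnswerRelevancy": 0.20, "Faithfulness": 0.20, "GEval": 0.20,
    "BERTScore": 0.15, "METEOR": 0.10, "ROUGE": 0.10, "BLEU": 0.05,
}

def weighted_score(normalized: dict) -> float:
    """Combine normalized metric scores (each in [0, 1]) into one hybrid score."""
    return sum(WEIGHTS[name] * normalized[name] for name in WEIGHTS)

print(f"{weighted_score({name: 0.8 for name in WEIGHTS}):.4f}")  # -> 0.8000
```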
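The diff ends just before the interface definition. For orientation, a minimal Blocks wiring that would exercise both handlers; the component and button names are illustrative, and `PROMPT_TEMPLATES`, `process_input`, and `show_detailed_results` are the functions defined above:

```python
import gradio as gr

with gr.Blocks() as demo:
    input_text = gr.Textbox(label="Input text")
    file_upload = gr.File(label="CSV upload")
    model_choice = gr.Dropdown(["Gemini", "Llama-3-8b"], label="Model")
    prompt_choice = gr.Dropdown(list(PROMPT_TEMPLATES), label="Prompt")
    output = gr.Textbox(label="Candidate / status")
    metrics = gr.Dataframe(label="Metrics")
    score = gr.Textbox(label="Hybrid score")

    gr.Button("Evaluate").click(
        process_input,
        inputs=[input_text, file_upload, model_choice, prompt_choice],
        outputs=[output, metrics, score],
    )
    gr.Button("Show Details").click(
        show_detailed_results,
        inputs=[input_text, file_upload, model_choice, prompt_choice],
        outputs=metrics,
    )

demo.queue().launch()  # gr.Progress parameters are injected, not listed as inputs
```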
|