Update app.py
app.py
CHANGED
@@ -403,17 +403,17 @@ def process_input(input_mode: str, reference_text: str, candidate_text: str, mod
     time.sleep(0.1)
     cleaned_candidate = clean_text(candidate_text)
 
-    # Format metrics for display
+    # Format metrics for display - ONLY SHOWING NORMALIZED SCORES AND HYBRID SCORE
     metrics_table = [
-        ["Metric", "
-        ["AnswerRelevancy", f"{result['
-        ["Faithfulness", f"{result['
-        ["GEval", f"{result['
-        ["BERTScore", f"{result['
-        ["ROUGE", f"{result['
-        ["BLEU", f"{result['
-        ["METEOR", f"{result['
-        ["
+        ["Metric", "Normalized Score"],
+        ["AnswerRelevancy", f"{result['normalized']['AnswerRelevancy']:.4f}"],
+        ["Faithfulness", f"{result['normalized']['Faithfulness']:.4f}"],
+        ["GEval", f"{result['normalized']['GEval']:.4f}"],
+        ["BERTScore", f"{result['normalized']['BERTScore']:.4f}"],
+        ["ROUGE", f"{result['normalized']['ROUGE']:.4f}"],
+        ["BLEU", f"{result['normalized']['BLEU']:.4f}"],
+        ["METEOR", f"{result['normalized']['METEOR']:.4f}"],
+        ["Hybrid Score", f"{result['weighted_score']:.4f}"]
     ]
 
     return (
@@ -435,7 +435,7 @@ def load_example():
         candidate_text  # candidate_text
     )
 
-
+
 with gr.Blocks(title="LLM Evaluation Framework", theme=gr.themes.Soft()) as demo:
     gr.Markdown("# π LLM Evaluation Framework for Professional Content Rewriting")
     gr.Markdown("Evaluate the quality of LLM-generated content using multiple metrics with proper normalization.")
@@ -571,10 +571,9 @@ with gr.Blocks(title="LLM Evaluation Framework", theme=gr.themes.Soft()) as demo
     | **METEOR** | Linguistic quality with synonyms | How natural does the cleaned output read? |
     """)
 
-
+
 if __name__ == "__main__":
     demo.launch(
         server_name="0.0.0.0",
         server_port=7860,
-        share=True
     )
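The rows added in the first hunk read per-metric values from result['normalized'] and a combined value from result['weighted_score'], but the diff does not show how that dictionary is built. The sketch below is a minimal, assumed reconstruction using equal metric weights; METRIC_WEIGHTS and build_result are hypothetical names for illustration, not code from this Space's app.py.

```python
# Minimal sketch, assuming equal metric weights: NOT the Space's code,
# only an illustration of the result dict shape the new table rows expect.

METRIC_WEIGHTS = {  # hypothetical weights; the real app.py may weight metrics differently
    "AnswerRelevancy": 1.0,
    "Faithfulness": 1.0,
    "GEval": 1.0,
    "BERTScore": 1.0,
    "ROUGE": 1.0,
    "BLEU": 1.0,
    "METEOR": 1.0,
}


def build_result(normalized_scores):
    """Assemble {'normalized': {...}, 'weighted_score': float} as read by metrics_table."""
    total = sum(METRIC_WEIGHTS[name] for name in normalized_scores)
    weighted = sum(METRIC_WEIGHTS[name] * s for name, s in normalized_scores.items()) / total
    return {"normalized": normalized_scores, "weighted_score": weighted}


if __name__ == "__main__":
    result = build_result({
        "AnswerRelevancy": 0.91, "Faithfulness": 0.88, "GEval": 0.75,
        "BERTScore": 0.82, "ROUGE": 0.43, "BLEU": 0.37, "METEOR": 0.55,
    })
    # Same formatting as the new table rows, e.g. ["Hybrid Score", f"{result['weighted_score']:.4f}"]
    print(f"{result['weighted_score']:.4f}")
```

The last hunk also drops share=True from demo.launch(); on Hugging Face Spaces the app is already served publicly, so a separate Gradio share tunnel is not needed there.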