Sa-m committed
Commit 0ef5ef9 · verified · 1 Parent(s): 7517d1f

Update app.py

Files changed (1):
  1. app.py +113 -249
app.py CHANGED
@@ -200,17 +200,11 @@ def calculate_weighted_score(scores: Dict[str, float]) -> float:

     return weighted_sum / total_weight if total_weight > 0 else 0

-def evaluate_text(raw_input: str, model_provider: LLMProvider, prompt_template: str) -> Dict[str, Any]:
-    """Evaluate a single text using the selected model and prompt"""
-    # Create clean reference text
-    reference_text = clean_text(raw_input)
-
-    # Generate candidate using the selected model and prompt
-    prompt = prompt_template.replace("{text}", raw_input)
-    candidate = model_provider.generate(prompt)
-
-    # Clean candidate output for consistent evaluation
-    cleaned_candidate = clean_text(candidate)
+def evaluate_text(reference_text: str, candidate_text: str) -> Dict[str, Any]:
+    """Evaluate a candidate text against a reference text"""
+    # Clean both texts for consistent evaluation
+    reference_text = clean_text(reference_text)
+    candidate_text = clean_text(candidate_text)

     # Initialize evaluation metrics
     scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
@@ -223,7 +217,7 @@ def evaluate_text(raw_input: str, model_provider: LLMProvider, prompt_template:
         smooth = SmoothingFunction().method4
         bleu = sentence_bleu(
             [reference_text.split()],
-            cleaned_candidate.split(),
+            candidate_text.split(),
             smoothing_function=smooth
         )
         results["BLEU"] = bleu
@@ -233,7 +227,7 @@ def evaluate_text(raw_input: str, model_provider: LLMProvider, prompt_template:

    # ROUGE Score
    try:
-        rouge_scores = scorer.score(reference_text, cleaned_candidate)
+        rouge_scores = scorer.score(reference_text, candidate_text)
         rouge = (rouge_scores['rouge1'].fmeasure +
                  rouge_scores['rouge2'].fmeasure +
                  rouge_scores['rougeL'].fmeasure) / 3
@@ -246,7 +240,7 @@ def evaluate_text(raw_input: str, model_provider: LLMProvider, prompt_template:
    try:
         meteor = meteor_score(
             [reference_text.split()],
-            cleaned_candidate.split()
+            candidate_text.split()
         )
         results["METEOR"] = meteor
    except Exception as e:
@@ -256,7 +250,7 @@ def evaluate_text(raw_input: str, model_provider: LLMProvider, prompt_template:
    # BERTScore
    try:
         P, R, F1 = bert_score(
-            [cleaned_candidate],
+            [candidate_text],
             [reference_text],
             lang="en",
             verbose=False
@@ -276,8 +270,8 @@ def evaluate_text(raw_input: str, model_provider: LLMProvider, prompt_template:
         relevancy_prompt = f"""
         On a scale of 0.0 to 1.0, how relevant is the following candidate text to the input?

-        Input: {raw_input[:500]}{'...' if len(raw_input) > 500 else ''}
-        Candidate: {cleaned_candidate[:500]}{'...' if len(cleaned_candidate) > 500 else ''}
+        Input: {reference_text[:500]}{'...' if len(reference_text) > 500 else ''}
+        Candidate: {candidate_text[:500]}{'...' if len(candidate_text) > 500 else ''}

         Provide only a single number between 0.0 and 1.0 with no explanation.
         """
@@ -292,8 +286,8 @@ def evaluate_text(raw_input: str, model_provider: LLMProvider, prompt_template:
         faithfulness_prompt = f"""
         On a scale of 0.0 to 1.0, how faithful is the candidate text to the original input in terms of factual accuracy?

-        Input: {raw_input[:500]}{'...' if len(raw_input) > 500 else ''}
-        Candidate: {cleaned_candidate[:500]}{'...' if len(cleaned_candidate) > 500 else ''}
+        Input: {reference_text[:500]}{'...' if len(reference_text) > 500 else ''}
+        Candidate: {candidate_text[:500]}{'...' if len(candidate_text) > 500 else ''}

         Provide only a single number between 0.0 and 1.0 with no explanation.
         """
@@ -309,8 +303,8 @@ def evaluate_text(raw_input: str, model_provider: LLMProvider, prompt_template:
         On a scale of 0.0 to 1.0, evaluate the overall quality of the candidate text.
         Consider accuracy, completeness, fluency, and professionalism.

-        Input: {raw_input[:500]}{'...' if len(raw_input) > 500 else ''}
-        Candidate: {cleaned_candidate[:500]}{'...' if len(cleaned_candidate) > 500 else ''}
+        Input: {reference_text[:500]}{'...' if len(reference_text) > 500 else ''}
+        Candidate: {candidate_text[:500]}{'...' if len(candidate_text) > 500 else ''}

         Provide only a single number between 0.0 and 1.0 with no explanation.
         """
@@ -349,20 +343,17 @@ def evaluate_text(raw_input: str, model_provider: LLMProvider, prompt_template:
         interpretation = "Poor performance (F) - likely needs complete rewriting"

     return {
-        "candidate": cleaned_candidate,
+        "candidate": candidate_text,
         "metrics": results,
         "normalized": normalized_scores,
         "weighted_score": weighted_score,
         "interpretation": interpretation
     }

-def process_input(input_text: str, file_upload, model_choice: str, prompt_choice: str, progress=gr.Progress()) -> Tuple[str, List[List[str]], str]:
-    """Process either input text or uploaded file with progress tracking"""
-    if input_text and file_upload:
-        return "Please use either text input or file upload, not both.", [], ""
-
-    if not input_text and not file_upload:
-        return "Please provide input text or upload a file.", [], ""
+def process_input(input_mode: str, reference_text: str, candidate_text: str, model_choice: str, prompt_choice: str, progress=gr.Progress()) -> Tuple[str, str, List[List[str]], str]:
+    """Process input based on selected mode"""
+    if not reference_text:
+        return "", "", [], "Please provide reference text."

     # Determine model provider
     if model_choice == "Gemini":
@@ -374,199 +365,63 @@ def process_input(input_text: str, file_upload, model_choice: str, prompt_choice

     # Check if model is available
     if not model_provider.available:
-        return f"Error: {model_choice} is not properly configured. Check your API key.", [], ""
-
-    # Get prompt template
-    prompt_template = PROMPT_TEMPLATES[prompt_choice]
+        return "", "", [], f"Error: {model_choice} is not properly configured. Check your API key."

-    # Process single text input
-    if input_text:
+    # Process based on input mode
+    if input_mode == "Reference Only (Generate Candidate)":
         progress(0.1, desc="Starting evaluation...")
-        time.sleep(0.2)
-
-        progress(0.3, desc="Generating rewritten content...")
-        time.sleep(0.2)
-
-        progress(0.6, desc="Calculating metrics...")
-        result = evaluate_text(input_text, model_provider, prompt_template)
+        time.sleep(0.1)

-        progress(0.9, desc="Finalizing results...")
-        time.sleep(0.2)
+        progress(0.3, desc="Generating rewritten content using prompt...")
+        time.sleep(0.1)

-        # Format metrics for display
-        metrics_table = [
-            ["Metric", "Raw Score", "Normalized"],
-            ["AnswerRelevancy", f"{result['metrics']['AnswerRelevancy']:.4f}", f"{result['normalized']['AnswerRelevancy']:.4f}"],
-            ["Faithfulness", f"{result['metrics']['Faithfulness']:.4f}", f"{result['normalized']['Faithfulness']:.4f}"],
-            ["GEval", f"{result['metrics']['GEval']:.4f}", f"{result['normalized']['GEval']:.4f}"],
-            ["BERTScore", f"{result['metrics']['BERTScore']:.4f}", f"{result['normalized']['BERTScore']:.4f}"],
-            ["ROUGE", f"{result['metrics']['ROUGE']:.4f}", f"{result['normalized']['ROUGE']:.4f}"],
-            ["BLEU", f"{result['metrics']['BLEU']:.4f}", f"{result['normalized']['BLEU']:.4f}"],
-            ["METEOR", f"{result['metrics']['METEOR']:.4f}", f"{result['normalized']['METEOR']:.4f}"],
-            ["Weighted Score", f"{result['weighted_score']:.4f}", "N/A"]
-        ]
+        # Get prompt template
+        prompt_template = PROMPT_TEMPLATES[prompt_choice]

-        return (
-            result["candidate"],
-            metrics_table,
-            f"Hybrid Score: {result['weighted_score']:.4f} - {result['interpretation']}"
-        )
-
-    # Process file upload
-    if file_upload:
-        progress(0.1, desc="Reading file...")
-        time.sleep(0.2)
+        # Generate candidate using the selected model and prompt
+        prompt = prompt_template.replace("{text}", reference_text)
+        candidate = model_provider.generate(prompt)
+        cleaned_candidate = clean_text(candidate)

-        # Read the file (assuming CSV with one column of text)
-        try:
-            df = pd.read_csv(file_upload.name)
-            progress(0.3, desc="Processing entries...")
-            time.sleep(0.2)
-        except Exception as e:
-            return f"Error reading file: {str(e)}", [], ""
+        progress(0.6, desc="Calculating metrics...")
+        time.sleep(0.1)

-        # Assuming the first column contains the text
-        text_column = df.columns[0]
-        results = []
-        detailed_results = []
+        # Evaluate the generated candidate
+        result = evaluate_text(reference_text, cleaned_candidate)

-        # Process each entry with progress updates
-        for i, row in df.iterrows():
-            progress((i + 1) / len(df) * 0.6 + 0.3, desc=f"Processing entry {i+1}/{len(df)}")
-            text = str(row[text_column])
-
-            try:
-                result = evaluate_text(text, model_provider, prompt_template)
-
-                # Add to results
-                results.append(result["weighted_score"])
-
-                # Store detailed results
-                detailed_results.append({
-                    "input_preview": text[:100] + "..." if len(text) > 100 else text,
-                    "weighted_score": result["weighted_score"],
-                    "interpretation": result["interpretation"],
-                    "candidate": result["candidate"]
-                })
-            except Exception as e:
-                print(f"Error processing entry {i}: {str(e)}")
-                results.append(0.0)
-                detailed_results.append({
-                    "input_preview": text[:100] + "..." if len(text) > 100 else text,
-                    "weighted_score": 0.0,
-                    "interpretation": "Error processing this entry",
-                    "candidate": ""
-                })
+        progress(0.9, desc="Finalizing results...")
+        time.sleep(0.1)

-        progress(0.9, desc="Generating summary...")
+    else: # "Both Reference and Candidate"
+        progress(0.3, desc="Calculating metrics...")
         time.sleep(0.2)

-        # Create results dataframe
-        results_df = pd.DataFrame(detailed_results)
+        # Evaluate the provided candidate
+        result = evaluate_text(reference_text, candidate_text)

-        # Generate summary statistics
-        valid_scores = [s for s in results if s > 0]
-        if valid_scores:
-            avg_score = sum(valid_scores) / len(valid_scores)
-            min_score = min(valid_scores)
-            max_score = max(valid_scores)
-
-            if avg_score >= 0.85:
-                summary = "Excellent performance across inputs"
-            elif avg_score >= 0.70:
-                summary = "Good performance with room for minor improvements"
-            elif avg_score >= 0.50:
-                summary = "Adequate performance but needs refinement"
-            else:
-                summary = "Significant improvements needed"
-
-            # Format summary
-            summary_text = (
-                f"Processed {len(results)} entries ({len(valid_scores)} successful)\n"
-                f"Average Hybrid Score: {avg_score:.4f}\n"
-                f"Range: {min_score:.4f} - {max_score:.4f}\n\n"
-                f"{summary}"
-            )
-
-            # Create metrics table for summary
-            metrics_table = [
-                ["Metric", "Value"],
-                ["Entries Processed", f"{len(results)}"],
-                ["Successful Entries", f"{len(valid_scores)}"],
-                ["Average Score", f"{avg_score:.4f}"],
-                ["Best Score", f"{max_score:.4f}"],
-                ["Worst Score", f"{min_score:.4f}"],
-                ["Overall Assessment", summary]
-            ]
-
-            return (
-                "Batch processing complete. Use the 'Show Details' button to see individual results.",
-                metrics_table,
-                summary_text
-            )
-        else:
-            return (
-                "No successful evaluations. Check your API configuration and input data.",
-                [["Error", "All evaluations failed"]],
-                "Error: No successful evaluations. Check your API configuration and input data."
-            )
-
-def show_detailed_results(input_text, file_upload, model_choice, prompt_choice, progress=gr.Progress()):
-    """Show detailed results for batch processing"""
-    if not file_upload:
-        return "No file uploaded for batch processing."
-
-    progress(0.1, desc="Reading file...")
-    time.sleep(0.1)
-
-    # Read the file
-    df = pd.read_csv(file_upload.name)
-    text_column = df.columns[0]
-
-    progress(0.3, desc="Determining model provider...")
-    time.sleep(0.1)
-
-    # Determine model provider
-    if model_choice == "Gemini":
-        model_provider = GeminiProvider("gemini-1.5-flash-latest")
-    elif model_choice == "Llama-3-70b":
-        model_provider = GroqProvider("llama3-70b-8192")
-    else: # Llama-3-8b
-        model_provider = GroqProvider("llama3-8b-8192")
-
-    progress(0.5, desc="Getting prompt template...")
-    time.sleep(0.1)
-
-    # Get prompt template
-    prompt_template = PROMPT_TEMPLATES[prompt_choice]
-
-    progress(0.7, desc="Processing entries...")
-    time.sleep(0.1)
-
-    # Process each entry
-    results = []
-    for i, row in enumerate(df.iterrows()):
-        _, row = row # Unpack the tuple
-        text = str(row[text_column])
-        try:
-            result = evaluate_text(text, model_provider, prompt_template)
-            results.append({
-                "Input Preview": text[:100] + "..." if len(text) > 100 else text,
-                "Weighted Score": f"{result['weighted_score']:.4f}",
-                "Interpretation": result['interpretation'],
-                "Candidate Text": result['candidate']
-            })
-        except:
-            results.append({
-                "Input Preview": text[:100] + "..." if len(text) > 100 else text,
-                "Weighted Score": "Error",
-                "Interpretation": "Processing error",
-                "Candidate Text": ""
-            })
-        progress(0.7 + (i + 1) / len(df) * 0.3, desc=f"Processing entry {i+1}/{len(df)}")
-
-    progress(1.0, desc="Completed!")
-    return gr.Dataframe(value=pd.DataFrame(results))
+        progress(0.8, desc="Finalizing results...")
+        time.sleep(0.1)
+        cleaned_candidate = clean_text(candidate_text)
+
+    # Format metrics for display
+    metrics_table = [
+        ["Metric", "Raw Score", "Normalized"],
+        ["AnswerRelevancy", f"{result['metrics']['AnswerRelevancy']:.4f}", f"{result['normalized']['AnswerRelevancy']:.4f}"],
+        ["Faithfulness", f"{result['metrics']['Faithfulness']:.4f}", f"{result['normalized']['Faithfulness']:.4f}"],
+        ["GEval", f"{result['metrics']['GEval']:.4f}", f"{result['normalized']['GEval']:.4f}"],
+        ["BERTScore", f"{result['metrics']['BERTScore']:.4f}", f"{result['normalized']['BERTScore']:.4f}"],
+        ["ROUGE", f"{result['metrics']['ROUGE']:.4f}", f"{result['normalized']['ROUGE']:.4f}"],
+        ["BLEU", f"{result['metrics']['BLEU']:.4f}", f"{result['normalized']['BLEU']:.4f}"],
+        ["METEOR", f"{result['metrics']['METEOR']:.4f}", f"{result['normalized']['METEOR']:.4f}"],
+        ["Weighted Score", f"{result['weighted_score']:.4f}", "N/A"]
+    ]
+
+    return (
+        reference_text,
+        result["candidate"],
+        metrics_table,
+        f"Hybrid Score: {result['weighted_score']:.4f} - {result['interpretation']}"
+    )

 # Create Gradio interface
 with gr.Blocks(title="LLM Evaluation Framework", theme=gr.themes.Soft()) as demo:
@@ -576,17 +431,38 @@ with gr.Blocks(title="LLM Evaluation Framework", theme=gr.themes.Soft()) as demo
     with gr.Row():
         with gr.Column(scale=1):
             gr.Markdown("### 📥 Input Options")
-            input_text = gr.Textbox(
-                label="Input Text",
+
+            input_mode = gr.Radio(
+                ["Reference Only (Generate Candidate)", "Both Reference and Candidate"],
+                label="Input Mode",
+                value="Reference Only (Generate Candidate)",
+                elem_id="input-mode"
+            )
+
+            reference_text = gr.Textbox(
+                label="Reference Text",
                 lines=10,
-                placeholder="Enter text to evaluate...",
-                elem_id="input-text"
+                placeholder="Enter reference text to evaluate against...",
+                elem_id="reference-text"
             )
-            gr.Markdown("or")
-            file_upload = gr.File(
-                label="Upload CSV file (single column of text)",
-                file_types=[".csv", ".txt"],
-                elem_id="file-upload"
+
+            # Conditionally show candidate text box
+            with gr.Group(visible=False) as candidate_group:
+                candidate_text = gr.Textbox(
+                    label="Candidate Text",
+                    lines=10,
+                    placeholder="Enter candidate text to evaluate...",
+                    elem_id="candidate-text"
+                )
+
+            # Update visibility of candidate text box based on input mode
+            def update_candidate_visibility(mode):
+                return gr.update(visible=(mode == "Both Reference and Candidate"))
+
+            input_mode.change(
+                fn=update_candidate_visibility,
+                inputs=input_mode,
+                outputs=candidate_group
             )

             gr.Markdown("### ⚙️ Configuration")
@@ -607,12 +483,22 @@ with gr.Blocks(title="LLM Evaluation Framework", theme=gr.themes.Soft()) as demo
             submit_btn = gr.Button("Evaluate", variant="primary", size="lg", elem_id="submit-btn")

         with gr.Column(scale=2):
-            gr.Markdown("### ✍️ Rewritten Content")
-            candidate_output = gr.Textbox(
-                label="Rewritten Content",
-                lines=15,
-                elem_id="candidate-output"
-            )
+            gr.Markdown("### 📄 Text Comparison")
+
+            with gr.Tabs():
+                with gr.TabItem("Reference Text"):
+                    reference_output = gr.Textbox(
+                        label="Reference Text",
+                        lines=8,
+                        elem_id="reference-output"
+                    )
+
+                with gr.TabItem("Candidate Text"):
+                    candidate_output = gr.Textbox(
+                        label="Candidate Text",
+                        lines=8,
+                        elem_id="candidate-output"
+                    )

             gr.Markdown("### 📈 Evaluation Metrics")
             metrics_output = gr.Dataframe(
@@ -626,34 +512,12 @@ with gr.Blocks(title="LLM Evaluation Framework", theme=gr.themes.Soft()) as demo
                 label="Summary",
                 elem_id="summary-output"
             )
-
-            detailed_results_btn = gr.Button("Show Detailed Results (Batch)", visible=False)
-            detailed_results = gr.Dataframe(visible=False)
-
-    # Update visibility of detailed results button
-    def update_detailed_results_visibility(file_upload, summary):
-        has_file = file_upload is not None
-        has_batch_results = "Processed" in summary and "entries" in summary
-        return gr.update(visible=has_file and has_batch_results)

     # Event handlers
     submit_btn.click(
         fn=process_input,
-        inputs=[input_text, file_upload, model_choice, prompt_choice],
-        outputs=[candidate_output, metrics_output, summary_output]
-    ).then(
-        fn=update_detailed_results_visibility,
-        inputs=[file_upload, summary_output],
-        outputs=detailed_results_btn
-    )
-
-    detailed_results_btn.click(
-        fn=show_detailed_results,
-        inputs=[input_text, file_upload, model_choice, prompt_choice],
-        outputs=detailed_results
-    ).then(
-        fn=lambda: gr.update(visible=True),
-        outputs=detailed_results
+        inputs=[input_mode, reference_text, candidate_text, model_choice, prompt_choice],
+        outputs=[reference_output, candidate_output, metrics_output, summary_output]
     )

     # Add interpretation guide in an accordion
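
For reviewers who want to sanity-check the refactored evaluate_text(reference_text, candidate_text) flow outside the app, here is a minimal sketch of the overlap-metric portion only. It is not part of this commit; the helper name quick_overlap_scores and the sample strings are hypothetical, and it assumes the same rouge-score and nltk packages the app already imports.

# Standalone sketch (not from the commit): mirrors the ROUGE/BLEU portion of
# evaluate_text() under the new reference/candidate signature.
from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

def quick_overlap_scores(reference: str, candidate: str) -> dict:
    """Return averaged ROUGE F1 and smoothed BLEU for one reference/candidate pair."""
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    rouge = scorer.score(reference, candidate)  # score(target, prediction)
    rouge_avg = (rouge['rouge1'].fmeasure +
                 rouge['rouge2'].fmeasure +
                 rouge['rougeL'].fmeasure) / 3
    bleu = sentence_bleu(
        [reference.split()],   # list of tokenized references
        candidate.split(),     # tokenized hypothesis
        smoothing_function=SmoothingFunction().method4,
    )
    return {"ROUGE": rouge_avg, "BLEU": bleu}

print(quick_overlap_scores(
    "The quick brown fox jumps over the lazy dog.",
    "A quick brown fox jumped over a lazy dog.",
))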
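
The UI change that drives the new flow is the show/hide toggle on the candidate textbox. Below is a reduced, standalone sketch of that pattern, assuming only that gradio is installed; the variable names and the lambda handler are illustrative rather than copied verbatim from app.py.

# Standalone sketch (not from the commit): Radio-driven visibility toggle,
# reduced to a runnable Gradio demo.
import gradio as gr

with gr.Blocks() as demo:
    mode = gr.Radio(
        ["Reference Only (Generate Candidate)", "Both Reference and Candidate"],
        value="Reference Only (Generate Candidate)",
        label="Input Mode",
    )
    # Hidden by default; shown only when the user supplies their own candidate.
    with gr.Group(visible=False) as candidate_group:
        gr.Textbox(label="Candidate Text", lines=4)

    # Toggle the group whenever the radio selection changes.
    mode.change(
        fn=lambda m: gr.update(visible=(m == "Both Reference and Candidate")),
        inputs=mode,
        outputs=candidate_group,
    )

if __name__ == "__main__":
    demo.launch()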