Sa-m committed on
Commit
3323e08
·
verified ·
1 Parent(s): 96b1e9b

Update app.py

Files changed (1)
  1. app.py +308 -578
app.py CHANGED
@@ -3,50 +3,48 @@ import pandas as pd
3
  import numpy as np
4
  import re
5
  import unicodedata
6
- from typing import Dict, Tuple, List
7
  import ftfy
8
  import nltk
9
- from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
10
  from rouge_score import rouge_scorer
 
11
  from nltk.translate.meteor_score import meteor_score
12
- from bert_score import score as bert_score
13
  from deepeval.test_case import LLMTestCase
14
  from deepeval.metrics import AnswerRelevancyMetric, FaithfulnessMetric, GEval
15
  from deepeval.models import DeepEvalBaseLLM
16
  import google.generativeai as genai
17
- import tempfile
18
  import os
19
- from pathlib import Path
20
- import logging
21
 
22
  # Download required NLTK data
23
  nltk.download('punkt', quiet=True)
24
  nltk.download('wordnet', quiet=True)
25
 
26
- # Configure logging
27
- logging.basicConfig(level=logging.INFO)
28
- logger = logging.getLogger(__name__)
29
 
30
- # Global variables for API keys (in production, use environment variables)
31
- GEMINI_API_KEY = None # Will be set from user input
32
- CONFIDENT_API_KEY = None # Will be set from user input
33
 
34
  class LLMProvider:
35
  """Abstract base class for LLM providers"""
36
- def __init__(self, model):
37
- self.model = model
38
 
39
  def generate(self, prompt: str) -> str:
40
  raise NotImplementedError
41
 
42
  def get_model_name(self) -> str:
43
- raise NotImplementedError
44
 
45
  class GeminiProvider(LLMProvider):
46
  """Gemini implementation"""
47
- def __init__(self, model_name="gemini-1.5-flash"):
48
- self.model_name = model_name
49
- genai.configure(api_key=GEMINI_API_KEY)
50
  self.model = genai.GenerativeModel(model_name)
51
 
52
  def generate(self, prompt: str) -> str:
@@ -54,245 +52,140 @@ class GeminiProvider(LLMProvider):
54
  response = self.model.generate_content(prompt)
55
  return response.text.strip()
56
  except Exception as e:
57
- logger.error(f"Error generating content with Gemini: {e}")
58
- return f"Error: {str(e)}"
59
-
60
- def get_model_name(self) -> str:
61
- return self.model_name
62
 
63
  class GroqProvider(LLMProvider):
64
- """Placeholder for Groq implementation"""
65
- def __init__(self, model_name="llama3-70b-8192"):
66
- self.model_name = model_name
67
- # Implementation would go here
68
- pass
69
 
70
  def generate(self, prompt: str) -> str:
71
- return "Groq implementation not available"
72
-
73
- def get_model_name(self) -> str:
74
- return self.model_name
75
 
76
- class GeminiLLM(DeepEvalBaseLLM):
77
- """Wrapper for Gemini to work with DeepEval"""
78
- def __init__(self, model):
79
- self.model = model
80
 
81
  def load_model(self):
82
- return self.model
83
 
84
  def generate(self, prompt: str) -> str:
85
- return self.model.generate_content(prompt).text.strip()
86
-
87
- async def a_generate(self, prompt: str) -> str:
88
- return self.model.generate_content(prompt).text.strip()
89
 
90
  def get_model_name(self) -> str:
91
- return "gemini-pro"
92
 
93
  def clean_text(text: str) -> str:
94
- """
95
- Clean text by fixing encoding artifacts and normalizing characters.
96
-
97
- Args:
98
- text (str): Input text to clean
99
-
100
- Returns:
101
- str: Cleaned text
102
- """
103
  if not text or not isinstance(text, str):
104
  return ""
105
-
106
- # Fix common encoding artifacts
107
  text = ftfy.fix_text(text)
108
  text = unicodedata.normalize('NFKD', text)
109
 
110
- # Replace smart quotes with standard ASCII quotes
111
- text = text.replace('“', '"').replace('”', '"')
112
- text = text.replace("‘", "'").replace("’", "'")
 
113
 
114
- # Remove non-ASCII characters (optional, can be toggled)
115
  text = re.sub(r'[^\x00-\x7F]+', ' ', text)
116
 
117
  # Normalize whitespace
118
  text = ' '.join(text.split())
119
 
120
- return text
121
 
122
- def create_prompts() -> Dict[str, str]:
123
- """
124
- Create different prompt variants for testing.
125
 
126
- Returns:
127
- Dict[str, str]: Dictionary of prompt names and their text
128
- """
129
- prompts = {
130
- "Strategic Narrative Architect": """Role: Strategic Narrative Architect
131
- You are a professional content writer who transforms raw text into engaging, well-structured narratives.
132
- Your goal is to rewrite the following text while preserving all key facts and statistics, but enhancing:
133
- - Structure and flow
134
- - Engagement and readability
135
- - Professional tone
136
- - Strategic storytelling
137
-
138
- Guidelines:
139
- 1. Maintain all factual information and numerical data
140
- 2. Improve sentence structure for better readability
141
- 3. Use active voice where appropriate
142
- 4. Ensure professional tone suitable for publication
143
- 5. Add logical transitions between ideas
144
- 6. Keep the length similar to the original
145
-
146
- Rewrite the following text:
147
- {input_text}""",
148
-
149
- "Precision Storyteller": """Role: Precision Storyteller
150
- You are a meticulous editor who ensures factual accuracy and clarity in all content.
151
- Your goal is to rewrite the following text with maximum precision while maintaining:
152
- - Factual accuracy above all
153
- - Clarity and conciseness
154
- - Proper grammar and punctuation
155
- - Consistent terminology
156
-
157
- Guidelines:
158
- 1. Preserve every fact, statistic, and detail from the original
159
- 2. Correct any grammatical errors or awkward phrasing
160
- 3. Use precise, unambiguous language
161
- 4. Avoid embellishment or subjective interpretation
162
- 5. Maintain neutral, professional tone
163
- 6. Ensure all claims are supported by the original text
164
-
165
- Rewrite the following text:
166
- {input_text}"""
167
- }
168
 
169
- return prompts
170
-
171
- def evaluate_text(input_text: str, candidate_text: str, reference_text: str,
172
- judge_model) -> Dict[str, float]:
173
- """
174
- Evaluate the quality of a rewritten text using multiple metrics.
175
-
176
- Args:
177
- input_text (str): Original raw input text
178
- candidate_text (str): Generated candidate text
179
- reference_text (str): Cleaned reference text
180
- judge_model: Model for LLM-as-judge metrics
181
-
182
- Returns:
183
- Dict[str, float]: Dictionary of metric scores
184
- """
185
  results = {}
186
 
 
187
  try:
188
- # Initialize scorers
189
- bleu_scorer = SmoothingFunction().method4
190
- rouge_scorer_obj = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
191
-
192
- # Tokenize for BLEU and METEOR
193
- reference_tokens = reference_text.split()
194
- candidate_tokens = candidate_text.split()
195
-
196
  # BLEU Score
197
- try:
198
- bleu_score_val = sentence_bleu([reference_tokens], candidate_tokens,
199
- smoothing_function=bleu_scorer)
200
- results["BLEU"] = bleu_score_val
201
- except Exception as e:
202
- logger.warning(f"BLEU calculation failed: {e}")
203
- results["BLEU"] = 0.0
204
-
205
  # ROUGE Score
206
- try:
207
- rouge_scores = rouge_scorer_obj.score(reference_text, candidate_text)
208
- # Average of ROUGE-1, ROUGE-2, and ROUGE-L F1 scores
209
- rouge_avg = (rouge_scores['rouge1'].fmeasure +
210
- rouge_scores['rouge2'].fmeasure +
211
- rouge_scores['rougeL'].fmeasure) / 3
212
- results["ROUGE"] = rouge_avg
213
- except Exception as e:
214
- logger.warning(f"ROUGE calculation failed: {e}")
215
- results["ROUGE"] = 0.0
216
-
217
  # METEOR Score
218
- try:
219
- meteor_score_val = meteor_score([reference_tokens], candidate_tokens)
220
- results["METEOR"] = meteor_score_val
221
- except Exception as e:
222
- logger.warning(f"METEOR calculation failed: {e}")
223
- results["METEOR"] = 0.0
224
-
225
- # BERTScore
226
- try:
227
- P, R, F1 = bert_score([candidate_text], [reference_text], lang="en", verbose=False)
228
- results["BERTScore"] = F1.item()
229
- except Exception as e:
230
- logger.warning(f"BERTScore calculation failed: {e}")
231
- results["BERTScore"] = 0.0
232
-
233
- # LLM-as-judge metrics
234
- try:
235
- test_case = LLMTestCase(
236
- input=input_text,
237
- actual_output=candidate_text,
238
- expected_output=reference_text,
239
- retrieval_context=[reference_text]
240
- )
241
-
242
- # Answer Relevancy
243
- answer_rel = AnswerRelevancyMetric(model=judge_model)
244
- answer_rel.measure(test_case)
245
- results["AnswerRelevancy"] = answer_rel.score
246
-
247
- # Faithfulness
248
- faith = FaithfulnessMetric(model=judge_model)
249
- faith.measure(test_case)
250
- results["Faithfulness"] = faith.score
251
-
252
- # GEval
253
- geval = GEval(
254
- name="OverallQuality",
255
- criteria="Evaluate if the candidate response is accurate, complete, and well-written.",
256
- evaluation_params=[
257
- "input",
258
- "actual_output",
259
- "expected_output"
260
- ],
261
- model=judge_model,
262
- strict_mode=False
263
- )
264
- geval.measure(test_case)
265
- results["GEval"] = geval.score
266
-
267
- except Exception as e:
268
- logger.warning(f"LLM-as-judge metrics failed: {e}")
269
- # Set default values if LLM-as-judge fails
270
- results["AnswerRelevancy"] = 0.5
271
- results["Faithfulness"] = 0.5
272
- results["GEval"] = 0.5
273
-
274
  except Exception as e:
275
- logger.error(f"Error in evaluation: {e}")
276
- # Return default scores if everything fails
277
- default_metrics = ["BLEU", "ROUGE", "METEOR", "BERTScore",
278
- "AnswerRelevancy", "Faithfulness", "GEval"]
279
- for metric in default_metrics:
280
- results[metric] = 0.0
281
-
282
- return results
283
-
284
- def normalize_score(metric: str, value: float) -> float:
285
- """
286
- Normalize score to 0-1 scale based on metric's natural range.
287
 
288
- Args:
289
- metric (str): Name of the metric
290
- value (float): Raw score value
291
 
292
- Returns:
293
- float: Normalized score between 0 and 1
294
- """
295
- # Define natural ranges for each metric
296
  normalization_ranges = {
297
  "AnswerRelevancy": (0.0, 1.0),
298
  "Faithfulness": (0.0, 1.0),
@@ -303,30 +196,6 @@ def normalize_score(metric: str, value: float) -> float:
303
  "METEOR": (0.0, 0.6)
304
  }
305
 
306
- if metric not in normalization_ranges or not isinstance(value, (int, float)):
307
- return value
308
-
309
- min_val, max_val = normalization_ranges[metric]
310
-
311
- # Handle edge cases
312
- if max_val <= min_val:
313
- return 0.5 # Default middle value if range is invalid
314
-
315
- # Normalize and clamp to [0,1]
316
- normalized = (value - min_val) / (max_val - min_val)
317
- return max(0.0, min(normalized, 1.0))
318
-
319
- def calculate_weighted_score(scores: Dict[str, float]) -> float:
320
- """
321
- Calculate weighted average of normalized scores.
322
-
323
- Args:
324
- scores (Dict[str, float]): Dictionary of metric scores
325
-
326
- Returns:
327
- float: Weighted average score
328
- """
329
- # Define weights for each metric
330
  weights = {
331
  "AnswerRelevancy": 0.10,
332
  "Faithfulness": 0.10,
@@ -337,374 +206,235 @@ def calculate_weighted_score(scores: Dict[str, float]) -> float:
337
  "METEOR": 0.15
338
  }
339
 
340
- normalized_scores = {m: normalize_score(m, v) for m, v in scores.items()}
341
- total_weight = 0
342
- weighted_sum = 0
343
-
344
- for metric, weight in weights.items():
345
- if metric in normalized_scores:
346
- weighted_sum += normalized_scores[metric] * weight
347
- total_weight += weight
 
348
 
349
- return weighted_sum / total_weight if total_weight > 0 else 0.0
350
 
351
- def process_single_text(input_text: str, gemini_api_key: str,
352
- confident_api_key: str, progress=gr.Progress()) -> Tuple[Dict, List[Dict]]:
353
- """
354
- Process a single text input and return evaluation results.
355
 
356
- Args:
357
- input_text (str): Input text to evaluate
358
- gemini_api_key (str): Gemini API key
359
- confident_api_key (str): Confident API key for DeepEval
360
- progress: Gradio progress tracker
361
-
362
- Returns:
363
- Tuple[Dict, List[Dict]]: Summary results and detailed results for each prompt
364
- """
365
- global GEMINI_API_KEY, CONFIDENT_API_KEY
366
 
367
- # Set API keys
368
- GEMINI_API_KEY = gemini_api_key
369
- CONFIDENT_API_KEY = confident_api_key
370
 
371
- if not input_text or not input_text.strip():
372
- return {"error": "Please provide valid input text"}, []
373
 
374
- try:
375
- # Clean the input text to create reference
376
- progress(0.1, "Cleaning input text...")
377
- reference_text = clean_text(input_text)
378
-
379
- if not reference_text:
380
- return {"error": "Could not process the input text"}, []
381
-
382
- # Initialize Gemini model
383
- progress(0.2, "Initializing Gemini model...")
384
- try:
385
- genai.configure(api_key=GEMINI_API_KEY)
386
- gemini_model = genai.GenerativeModel("gemini-1.5-flash")
387
- judge = GeminiLLM(gemini_model)
388
- except Exception as e:
389
- return {"error": f"Failed to initialize Gemini: {str(e)}"}, []
390
-
391
- # Get prompts
392
- progress(0.3, "Generating candidate texts...")
393
- prompts = create_prompts()
394
-
395
- detailed_results = []
396
-
397
- # Process each prompt
398
- for prompt_name, prompt_template in prompts.items():
399
- progress(0.3 + 0.6 * (list(prompts.keys()).index(prompt_name) / len(prompts)),
400
- f"Processing {prompt_name}...")
401
-
402
- # Generate candidate
403
- full_prompt = prompt_template.format(input_text=input_text)
404
- candidate_text = gemini_model.generate_content(full_prompt).text.strip()
405
-
406
- # Clean candidate text
407
- cleaned_candidate = clean_text(candidate_text)
408
-
409
- # Evaluate
410
- scores = evaluate_text(input_text, cleaned_candidate, reference_text, judge)
411
-
412
- # Calculate hybrid scores
413
- hybrid_avg = np.mean(list(scores.values()))
414
- weighted_avg = calculate_weighted_score(scores)
415
-
416
- # Add interpretation
417
- if weighted_avg >= 0.85:
418
- interpretation = "Outstanding performance (A) - ready for professional use"
419
- elif weighted_avg >= 0.70:
420
- interpretation = "Strong performance (B) - good quality with minor improvements"
421
- elif weighted_avg >= 0.50:
422
- interpretation = "Adequate performance (C) - usable but needs refinement"
423
- elif weighted_avg >= 0.30:
424
- interpretation = "Weak performance (D) - requires significant revision"
425
- else:
426
- interpretation = "Poor performance (F) - likely needs complete rewriting"
427
-
428
- detailed_results.append({
429
- "Prompt": prompt_name,
430
- "Original Input": input_text[:500] + "..." if len(input_text) > 500 else input_text,
431
- "Reference Text": reference_text[:500] + "..." if len(reference_text) > 500 else reference_text,
432
- "Candidate Text": cleaned_candidate,
433
- "Scores": scores,
434
- "Hybrid Average": hybrid_avg,
435
- "Weighted Average": weighted_avg,
436
- "Interpretation": interpretation
437
- })
438
-
439
- # Create summary
440
- summary = {
441
- "Total Prompts Evaluated": len(detailed_results),
442
- "Best Performing Prompt": max(detailed_results, key=lambda x: x["Weighted Average"])["Prompt"],
443
- "Highest Weighted Score": max(detailed_results, key=lambda x: x["Weighted Average"])["Weighted Average"],
444
- "Lowest Weighted Score": min(detailed_results, key=lambda x: x["Weighted Average"])["Weighted Average"]
445
- }
446
-
447
- progress(1.0, "Processing complete!")
448
- return summary, detailed_results
449
-
450
- except Exception as e:
451
- logger.error(f"Error processing text: {e}")
452
- return {"error": f"Processing failed: {str(e)}"}, []
453
 
454
- def process_uploaded_file(file_path: str, gemini_api_key: str,
455
- confident_api_key: str, progress=gr.Progress()) -> Tuple[Dict, List[Dict]]:
456
- """
457
- Process an uploaded CSV/Excel file containing texts to evaluate.
458
-
459
- Args:
460
- file_path (str): Path to uploaded file
461
- gemini_api_key (str): Gemini API key
462
- confident_api_key (str): Confident API key for DeepEval
463
- progress: Gradio progress tracker
464
-
465
- Returns:
466
- Tuple[Dict, List[Dict]]: Summary results and detailed results
467
- """
468
  try:
469
- # Read file based on extension
470
- file_ext = Path(file_path).suffix.lower()
 
471
 
472
- if file_ext in ['.csv']:
473
- df = pd.read_csv(file_path)
474
- elif file_ext in ['.xls', '.xlsx']:
475
- df = pd.read_excel(file_path)
476
- else:
477
- return {"error": "Unsupported file format. Please upload CSV or Excel file."}, []
478
 
479
- if df.empty:
480
- return {"error": "File is empty"}, []
481
-
482
- # Look for text column (case-insensitive)
483
- text_column = None
484
- for col in df.columns:
485
- if 'text' in col.lower() or 'content' in col.lower() or 'article' in col.lower():
486
- text_column = col
487
- break
488
-
489
- if not text_column:
490
- # Use first column if no text-like column found
491
- text_column = df.columns[0]
492
-
493
- texts = df[text_column].dropna().astype(str).tolist()
494
-
495
- if not texts:
496
- return {"error": "No valid text data found in the file"}, []
497
-
498
- all_results = []
499
- summaries = []
500
-
501
- # Process each text
502
- for i, text in enumerate(texts):
503
- progress(i / len(texts), f"Processing text {i+1} of {len(texts)}...")
504
- summary, details = process_single_text(text, gemini_api_key, confident_api_key)
505
- if "error" not in summary:
506
- summaries.append(summary)
507
- all_results.extend(details)
508
 
509
- if not all_results:
510
- return {"error": "Failed to process any texts"}, []
511
-
512
- # Create overall summary
513
- overall_summary = {
514
- "Total Files Processed": len(texts),
515
- "Total Prompts Evaluated": len(all_results),
516
- "Average Weighted Score": np.mean([r["Weighted Average"] for r in all_results]),
517
- "Best Performing Prompt": pd.DataFrame(all_results)["Prompt"].mode()[0]
518
- if len(all_results) > 0 else "N/A"
519
- }
520
 
521
- progress(1.0, "Batch processing complete!")
522
- return overall_summary, all_results
523
 
524
  except Exception as e:
525
- logger.error(f"Error processing file: {e}")
526
- return {"error": f"File processing failed: {str(e)}"}, []
527
 
528
  def create_gradio_interface():
529
- """Create the Gradio interface."""
530
 
531
  with gr.Blocks(title="LLM Evaluation Framework") as demo:
532
- gr.Markdown("# 📊 LLM Evaluation Framework for Content Rewriting")
533
- gr.Markdown("Evaluate and compare different prompts for professional content rewriting tasks.")
534
 
535
  with gr.Tabs():
536
- with gr.Tab("Single Text Evaluation"):
537
- gr.Markdown("### Evaluate a single piece of text")
538
-
539
  with gr.Row():
540
  with gr.Column(scale=2):
541
  input_text = gr.Textbox(
542
- label="Input Text",
543
- placeholder="Paste your text here...",
544
  lines=10
545
  )
546
-
547
- with gr.Column(scale=1):
548
- gemini_api_key = gr.Textbox(
549
- label="Gemini API Key",
550
- placeholder="Enter your Gemini API key",
551
- type="password"
552
  )
553
- confident_api_key = gr.Textbox(
554
- label="Confident API Key (for DeepEval)",
555
- placeholder="Enter your Confident API key",
556
- type="password"
557
- )
558
- evaluate_btn = gr.Button("Evaluate Text", variant="primary")
559
-
560
- gr.Markdown("### Results")
561
- with gr.Row():
562
- with gr.Column():
563
- summary_output = gr.JSON(label="Summary Results")
564
-
565
- with gr.Column():
566
- detailed_output = gr.Dataframe(
567
- label="Detailed Results",
568
- headers=["Prompt", "Weighted Average", "Interpretation"],
569
- datatype=["str", "number", "str"]
570
- )
571
-
572
- # Hidden outputs for detailed data
573
- hidden_detailed_results = gr.State()
574
-
575
- def update_outputs(text, gemini_key, confident_key):
576
- if not text.strip():
577
- return {"error": "Please enter text"}, None, None
578
 
579
- summary, detailed = process_single_text(text, gemini_key, confident_key)
580
-
581
- if "error" in summary:
582
- return summary, None, None
583
-
584
- # Prepare dataframe data
585
- df_data = []
586
- for result in detailed:
587
- df_data.append([
588
- result["Prompt"],
589
- round(result["Weighted Average"], 3),
590
- result["Interpretation"]
591
- ])
592
-
593
- return summary, df_data, detailed
594
-
595
- evaluate_btn.click(
596
- fn=update_outputs,
597
- inputs=[input_text, gemini_api_key, confident_api_key],
598
- outputs=[summary_output, detailed_output, hidden_detailed_results]
599
- )
600
-
601
- # Button to show full candidate texts
602
- with gr.Row():
603
- show_details_btn = gr.Button("Show Full Results with Candidate Texts")
604
-
605
- full_results_output = gr.JSON(label="Full Detailed Results", visible=False)
606
-
607
- def show_full_results(detailed_results):
608
- if detailed_results is None:
609
- return {"error": "No results to display"}
610
- return detailed_results
611
-
612
- show_details_btn.click(
613
- fn=show_full_results,
614
- inputs=[hidden_detailed_results],
615
- outputs=[full_results_output]
616
- )
617
-
618
- with gr.Tab("Batch File Evaluation"):
619
- gr.Markdown("### Evaluate multiple texts from a file")
620
-
621
  with gr.Row():
622
- with gr.Column():
623
  file_input = gr.File(
624
- label="Upload CSV or Excel file",
625
- file_types=['.csv', '.xls', '.xlsx']
626
  )
627
-
628
- with gr.Column():
629
- batch_gemini_key = gr.Textbox(
630
- label="Gemini API Key",
631
- placeholder="Enter your Gemini API key",
632
- type="password"
633
  )
634
- batch_confident_key = gr.Textbox(
635
- label="Confident API Key (for DeepEval)",
636
- placeholder="Enter your Confident API key",
637
- type="password"
638
- )
639
- batch_evaluate_btn = gr.Button("Process File", variant="primary")
640
-
641
- gr.Markdown("### Batch Results")
642
- batch_summary_output = gr.JSON(label="Batch Summary Results")
643
- batch_detailed_output = gr.Dataframe(
644
- label="Detailed Results",
645
- headers=["Prompt", "Weighted Average", "Interpretation"],
646
- datatype=["str", "number", "str"]
647
- )
648
-
649
- # Hidden state for batch results
650
- hidden_batch_results = gr.State()
651
-
652
- def process_file(file, gemini_key, confident_key):
653
- if file is None:
654
- return {"error": "Please upload a file"}, None, None
655
-
656
- summary, detailed = process_uploaded_file(file.name, gemini_key, confident_key)
657
 
658
- if "error" in summary:
659
- return summary, None, None
660
-
661
- # Prepare dataframe data
662
- df_data = []
663
- for result in detailed:
664
- df_data.append([
665
- result["Prompt"],
666
- round(result["Weighted Average"], 3),
667
- result["Interpretation"]
668
- ])
669
-
670
- return summary, df_data, detailed
671
-
672
- batch_evaluate_btn.click(
673
- fn=process_file,
674
- inputs=[file_input, batch_gemini_key, batch_confident_key],
675
- outputs=[batch_summary_output, batch_detailed_output, hidden_batch_results]
676
- )
677
-
678
- # Button to show full batch results
679
- show_batch_details_btn = gr.Button("Show Full Batch Results")
680
- batch_full_results_output = gr.JSON(label="Full Batch Results", visible=False)
681
-
682
- show_batch_details_btn.click(
683
- fn=show_full_results,
684
- inputs=[hidden_batch_results],
685
- outputs=[batch_full_results_output]
686
- )
687
-
688
- gr.Markdown("""
689
- ## How to Use
690
 
691
- 1. **Single Text Evaluation**:
692
- - Enter your text in the input box
693
- - Provide your API keys
694
- - Click "Evaluate Text" to see results
695
 
696
- 2. **Batch File Evaluation**:
697
- - Upload a CSV or Excel file with a column containing text
698
- - Provide your API keys
699
- - Click "Process File" to evaluate all texts
 
700
 
701
- ### API Keys
702
- - **Gemini API Key**: Get from Google AI Studio
703
- - **Confident API Key**: Get from DeepEval dashboard
 
 
704
 
705
- ### Interpreting Results
706
- - **Weighted Average**: Our primary metric combining all evaluations
707
- - **Interpretation**: Performance grade based on weighted score
708
  """)
709
 
710
  return demo
@@ -712,4 +442,4 @@ def create_gradio_interface():
712
  # Launch the app
713
  if __name__ == "__main__":
714
  app = create_gradio_interface()
715
- app.launch(debug=True)
 
3
  import numpy as np
4
  import re
5
  import unicodedata
6
+ from typing import Dict, List, Tuple
7
  import ftfy
8
  import nltk
9
+ from bert_score import score as bert_score
10
  from rouge_score import rouge_scorer
11
+ from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
12
  from nltk.translate.meteor_score import meteor_score
 
13
  from deepeval.test_case import LLMTestCase
14
  from deepeval.metrics import AnswerRelevancyMetric, FaithfulnessMetric, GEval
15
  from deepeval.models import DeepEvalBaseLLM
16
  import google.generativeai as genai
17
+ from groq import Groq
18
  import os
19
+ from io import StringIO
 
20
 
21
  # Download required NLTK data
22
  nltk.download('punkt', quiet=True)
23
  nltk.download('wordnet', quiet=True)
24
 
25
+ # Configuration
26
+ GEMINI_API_KEY = "your_gemini_api_key" # Replace with your key
27
+ GROQ_API_KEY = "your_groq_api_key" # Replace with your key
28
 
29
+ # Initialize APIs
30
+ genai.configure(api_key=GEMINI_API_KEY)
31
+ groq_client = Groq(api_key=GROQ_API_KEY)
32
 
33
  class LLMProvider:
34
  """Abstract base class for LLM providers"""
35
+ def __init__(self, model_name: str):
36
+ self.model_name = model_name
37
 
38
  def generate(self, prompt: str) -> str:
39
  raise NotImplementedError
40
 
41
  def get_model_name(self) -> str:
42
+ return self.model_name
43
 
44
  class GeminiProvider(LLMProvider):
45
  """Gemini implementation"""
46
+ def __init__(self, model_name: str = "gemini-1.5-flash"):
47
+ super().__init__(model_name)
 
48
  self.model = genai.GenerativeModel(model_name)
49
 
50
  def generate(self, prompt: str) -> str:
 
52
  response = self.model.generate_content(prompt)
53
  return response.text.strip()
54
  except Exception as e:
55
+ return f"Error generating with Gemini: {str(e)}"
56
 
57
  class GroqProvider(LLMProvider):
58
+ """Groq implementation for LLaMA models"""
59
+ def __init__(self, model_name: str = "llama3-70b-8192"):
60
+ super().__init__(model_name)
 
 
61
 
62
  def generate(self, prompt: str) -> str:
63
+ try:
64
+ chat_completion = groq_client.chat.completions.create(
65
+ messages=[
66
+ {"role": "user", "content": prompt}
67
+ ],
68
+ model=self.model_name,
69
+ temperature=0.7,
70
+ max_tokens=2048
71
+ )
72
+ return chat_completion.choices[0].message.content.strip()
73
+ except Exception as e:
74
+ return f"Error generating with Groq: {str(e)}"
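As a quick illustration of the two provider classes added above, a minimal usage sketch; it assumes valid API keys have been configured, and the prompt text is only an example:
gemini = GeminiProvider("gemini-1.5-flash")
llama = GroqProvider("llama3-70b-8192")
# Both expose the interface defined by LLMProvider.
print(gemini.get_model_name())   # "gemini-1.5-flash"
print(llama.generate("Rewrite this sentence in a professional tone: the results was good."))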
75
 
76
+ class DeepEvalLLMWrapper(DeepEvalBaseLLM):
77
+ """Wrapper for DeepEval to work with our providers"""
78
+ def __init__(self, provider: LLMProvider):
79
+ self.provider = provider
80
 
81
  def load_model(self):
82
+ return self.provider
83
 
84
  def generate(self, prompt: str) -> str:
85
+ return self.provider.generate(prompt)
86
 
87
  def get_model_name(self) -> str:
88
+ return self.provider.get_model_name()
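The removed GeminiLLM wrapper also defined an async a_generate, which DeepEvalBaseLLM expects; the new DeepEvalLLMWrapper above omits it. A minimal sketch of how it could delegate to the same provider (an assumption, mirroring the removed method):
    async def a_generate(self, prompt: str) -> str:
        # Assumed addition: reuse the synchronous provider call,
        # as the removed GeminiLLM.a_generate did with its model.
        return self.provider.generate(prompt)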
89
 
90
  def clean_text(text: str) -> str:
91
+ """Clean text by fixing encoding and normalizing"""
92
  if not text or not isinstance(text, str):
93
  return ""
94
+
95
+ # Fix encoding artifacts
96
  text = ftfy.fix_text(text)
97
  text = unicodedata.normalize('NFKD', text)
98
 
99
+ # Fix quotes and other common issues
100
+ text = text.replace('“', '"').replace('”', '"')
101
+ text = text.replace('–', '-').replace('—', '-')
102
+ text = text.replace('‘', "'").replace('’', "'")
103
 
104
+ # Remove non-ASCII characters
105
  text = re.sub(r'[^\x00-\x7F]+', ' ', text)
106
 
107
  # Normalize whitespace
108
  text = ' '.join(text.split())
109
 
110
+ return text.strip()
111
 
112
+ def evaluate_metrics(input_text: str, candidate_text: str, reference_text: str) -> Dict:
113
+ """Run comprehensive evaluation on the generated text"""
 
114
 
115
+ # Clean the texts
116
+ cleaned_input = clean_text(input_text)
117
+ cleaned_candidate = clean_text(candidate_text)
118
+ cleaned_reference = clean_text(reference_text)
 
119
 
120
  results = {}
121
 
122
+ # Traditional metrics
123
  try:
 
124
  # BLEU Score
125
+ smooth = SmoothingFunction().method4
126
+ bleu_score = sentence_bleu(
127
+ [cleaned_reference.split()],
128
+ cleaned_candidate.split(),
129
+ smoothing_function=smooth
130
+ )
131
+ results["BLEU"] = bleu_score
132
+
133
  # ROUGE Score
134
+ rouge_scorer_obj = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
135
+ rouge_scores = rouge_scorer_obj.score(cleaned_reference, cleaned_candidate)
136
+ rouge_avg = (rouge_scores['rouge1'].fmeasure +
137
+ rouge_scores['rouge2'].fmeasure +
138
+ rouge_scores['rougeL'].fmeasure) / 3
139
+ results["ROUGE"] = rouge_avg
140
+
141
  # METEOR Score
142
+ meteor = meteor_score([cleaned_reference.split()], cleaned_candidate.split())
143
+ results["METEOR"] = meteor
144
+
145
+ # BERT Score
146
+ P, R, F1 = bert_score([cleaned_candidate], [cleaned_reference], lang="en", verbose=False)
147
+ results["BERTScore"] = F1.item()
148
+
149
  except Exception as e:
150
+ results["Error"] = f"Traditional metrics error: {str(e)}"
151
 
152
+ # LLM-as-judge metrics (using Gemini for consistency)
153
+ try:
154
+ judge_provider = GeminiProvider("gemini-1.5-flash")
155
+ judge_wrapper = DeepEvalLLMWrapper(judge_provider)
156
+
157
+ test_case = LLMTestCase(
158
+ input=cleaned_input,
159
+ actual_output=cleaned_candidate,
160
+ expected_output=cleaned_reference
161
+ )
162
+
163
+ # Answer Relevancy
164
+ answer_rel = AnswerRelevancyMetric(model=judge_wrapper)
165
+ answer_rel.measure(test_case)
166
+ results["AnswerRelevancy"] = answer_rel.score
167
+
168
+ # Faithfulness
169
+ faith = FaithfulnessMetric(model=judge_wrapper)
170
+ faith.measure(test_case)
171
+ results["Faithfulness"] = faith.score
172
+
173
+ # GEval
174
+ geval = GEval(
175
+ name="OverallQuality",
176
+ criteria="Evaluate if the candidate response is accurate, complete, and well-written.",
177
+ evaluation_params=[
178
+ "input", "actual_output", "expected_output"
179
+ ],
180
+ model=judge_wrapper
181
+ )
182
+ geval.measure(test_case)
183
+ results["GEval"] = geval.score
184
 
185
+ except Exception as e:
186
+ results["LLM_Judge_Error"] = f"LLM-as-judge metrics error: {str(e)}"
187
+
188
+ # Normalization and Hybrid Score
189
  normalization_ranges = {
190
  "AnswerRelevancy": (0.0, 1.0),
191
  "Faithfulness": (0.0, 1.0),
 
196
  "METEOR": (0.0, 0.6)
197
  }
198
 
199
  weights = {
200
  "AnswerRelevancy": 0.10,
201
  "Faithfulness": 0.10,
 
206
  "METEOR": 0.15
207
  }
208
 
209
+ # Normalize scores
210
+ normalized_scores = {}
211
+ for metric, value in results.items():
212
+ if metric in normalization_ranges and isinstance(value, (int, float)):
213
+ min_v, max_v = normalization_ranges[metric]
214
+ if max_v > min_v: # Avoid division by zero
215
+ norm = max(min((value - min_v) / (max_v - min_v), 1.0), 0.0)
216
+ normalized_scores[metric] = norm
217
+ else:
218
+ normalized_scores[metric] = 0.5
219
+ elif isinstance(value, (int, float)):
220
+ normalized_scores[metric] = value
221
+
222
+ # Calculate weighted average
223
+ if normalized_scores:
224
+ weighted_sum = sum(normalized_scores.get(m, 0) * w for m, w in weights.items())
225
+ total_weight = sum(w for m, w in weights.items() if m in normalized_scores)
226
+ results["WeightedAverage"] = weighted_sum / total_weight if total_weight > 0 else 0.0
227
+ else:
228
+ results["WeightedAverage"] = 0.0
229
 
230
+ return results
231
+
232
+ def process_single_text(input_text: str, model_choice: str) -> Tuple[str, str, Dict]:
233
+ """Process a single text input"""
234
+ if not input_text or len(input_text.strip()) < 10:
235
+ return "", "", {"Error": "Input text too short"}
236
+
237
+ # Choose model
238
+ if model_choice == "Gemini":
239
+ provider = GeminiProvider("gemini-1.5-flash")
240
+ elif model_choice == "LLaMA-3-70b":
241
+ provider = GroqProvider("llama3-70b-8192")
242
+ else: # LLaMA-3-8b
243
+ provider = GroqProvider("llama3-8b-8192")
244
+
245
+ # Generate candidate
246
+ prompt = f"""Rewrite the following paragraph in a fresh, concise, and professional style while preserving its full meaning and key information:
247
 
248
+ {input_text}
249
+
250
+ Provide only the rewritten text without any additional commentary."""
 
251
 
252
+ candidate = provider.generate(prompt)
253
 
254
+ # Use cleaned input as reference (simulating human-quality standard)
255
+ reference = clean_text(input_text)
 
256
 
257
+ # Evaluate
258
+ scores = evaluate_metrics(input_text, candidate, reference)
259
 
260
+ return candidate, reference, scores
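A short usage sketch for the function above; the input sentence is taken from this page's own examples, and the printed key matches the WeightedAverage entry the function returns:
candidate, reference, scores = process_single_text(
    "Climate change is one of the most pressing challenges facing humanity today.",
    "Gemini",
)
print(scores.get("WeightedAverage", 0.0))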
261
 
262
+ def process_file(file_obj, model_choice: str) -> Tuple[pd.DataFrame, str]:
263
+ """Process a CSV file with multiple articles"""
264
  try:
265
+ # Read the file
266
+ content = file_obj.read().decode('utf-8')
267
+ df = pd.read_csv(StringIO(content))
268
 
269
+ # Assume first column is the text
270
+ text_column = df.columns[0]
271
 
272
+ results = []
273
 
274
+ for idx, row in df.iterrows():
275
+ text = str(row[text_column])
276
+ candidate, reference, scores = process_single_text(text, model_choice)
277
+
278
+ result_row = {
279
+ 'Original_Text': text,
280
+ 'Generated_Candidate': candidate,
281
+ 'Reference_Text': reference
282
+ }
283
+ result_row.update(scores)
284
+ results.append(result_row)
285
 
286
+ results_df = pd.DataFrame(results)
287
+ return results_df, "File processed successfully!"
288
 
289
  except Exception as e:
290
+ return pd.DataFrame(), f"Error processing file: {str(e)}"
 
291
 
292
  def create_gradio_interface():
293
+ """Create the Gradio interface"""
294
 
295
  with gr.Blocks(title="LLM Evaluation Framework") as demo:
296
+ gr.Markdown("# 📊 LLM Evaluation Framework for Professional Content Rewriting")
297
+ gr.Markdown("Evaluate and compare LLM-generated content using multiple metrics. Choose between Gemini and LLaMA models.")
298
 
299
  with gr.Tabs():
300
+ with gr.Tab("Single Text Processing"):
 
 
301
  with gr.Row():
302
  with gr.Column(scale=2):
303
  input_text = gr.Textbox(
304
+ label="Input Text",
305
+ placeholder="Enter the text you want to rewrite...",
306
  lines=10
307
  )
308
+
309
+ model_choice_single = gr.Radio(
310
+ ["Gemini", "LLaMA-3-70b", "LLaMA-3-8b"],
311
+ label="Choose Model",
312
+ value="Gemini"
 
313
  )
314
+
315
+ submit_btn = gr.Button("Generate & Evaluate", variant="primary")
316
 
317
+ with gr.Column(scale=3):
318
+ gr.Markdown("### Results")
319
+
320
+ with gr.Tabs():
321
+ with gr.Tab("Generated Text"):
322
+ candidate_output = gr.Textbox(
323
+ label="Generated Candidate",
324
+ lines=10,
325
+ show_copy_button=True
326
+ )
327
+ reference_output = gr.Textbox(
328
+ label="Reference Text (Cleaned Input)",
329
+ lines=5,
330
+ show_copy_button=True
331
+ )
332
+
333
+ with gr.Tab("Evaluation Scores"):
334
+ scores_output = gr.JSON(label="Detailed Scores")
335
+
336
+ weighted_avg = gr.Number(
337
+ label="Weighted Average Score (0-1)",
338
+ precision=4
339
+ )
340
+
341
+ interpretation = gr.Textbox(
342
+ label="Interpretation",
343
+ interactive=False
344
+ )
345
+
346
+ with gr.Tab("Batch Processing (CSV File)"):
347
  with gr.Row():
348
+ with gr.Column(scale=1):
349
  file_input = gr.File(
350
+ label="Upload CSV File",
351
+ file_types=['.csv']
352
  )
353
+
354
+ model_choice_file = gr.Radio(
355
+ ["Gemini", "LLaMA-3-70b", "LLaMA-3-8b"],
356
+ label="Choose Model for Batch Processing",
357
+ value="Gemini"
 
358
  )
359
+
360
+ process_file_btn = gr.Button("Process File", variant="primary")
361
 
362
+ with gr.Column(scale=2):
363
+ gr.Markdown("### Results")
364
+ file_results = gr.Dataframe(
365
+ label="Evaluation Results",
366
+ interactive=False
367
+ )
368
+ file_status = gr.Textbox(label="Status")
369
+
370
+ # Examples
371
+ gr.Examples(
372
+ examples=[
373
+ ["The immune system plays a crucial role in protecting the human body from pathogens such as bacteria, viruses, and other harmful invaders. It is composed of innate and adaptive components that work together to detect and eliminate foreign threats.", "Gemini"],
374
+ ["Climate change is one of the most pressing challenges facing humanity today. Rising global temperatures have led to severe weather patterns, including more intense storms, droughts, and heatwaves.", "LLaMA-3-70b"]
375
+ ],
376
+ inputs=[input_text, model_choice_single],
377
+ outputs=[candidate_output, reference_output, scores_output, weighted_avg, interpretation]
378
+ )
379
+
380
+ # Event handlers
381
+ def handle_single_process(text, model):
382
+ if not text:
383
+ return "", "", {}, 0, "Please enter some text."
384
+
385
+ candidate, reference, scores = process_single_text(text, model)
386
+
387
+ # Get weighted average
388
+ weighted_avg_val = scores.get("WeightedAverage", 0)
389
+
390
+ # Interpretation
391
+ if weighted_avg_val >= 0.85:
392
+ interpretation_text = "✅ Outstanding performance (A) - ready for professional use"
393
+ elif weighted_avg_val >= 0.70:
394
+ interpretation_text = "✅ Strong performance (B) - good quality with minor improvements"
395
+ elif weighted_avg_val >= 0.50:
396
+ interpretation_text = "⚠️ Adequate performance (C) - usable but needs refinement"
397
+ elif weighted_avg_val >= 0.30:
398
+ interpretation_text = "❌ Weak performance (D) - requires significant revision"
399
+ else:
400
+ interpretation_text = "❌ Poor performance (F) - likely needs complete rewriting"
401
+
402
+ return candidate, reference, scores, weighted_avg_val, interpretation_text
403
 
404
+ def handle_file_process(file, model):
405
+ if file is None:
406
+ return pd.DataFrame(), "Please upload a file."
407
+ return process_file(file, model)
408
 
409
+ submit_btn.click(
410
+ fn=handle_single_process,
411
+ inputs=[input_text, model_choice_single],
412
+ outputs=[candidate_output, reference_output, scores_output, weighted_avg, interpretation]
413
+ )
414
 
415
+ process_file_btn.click(
416
+ fn=handle_file_process,
417
+ inputs=[file_input, model_choice_file],
418
+ outputs=[file_results, file_status]
419
+ )
420
 
421
+ gr.Markdown("""
422
+ ## πŸ“ How to Use
423
+
424
+ 1. **Single Text Processing**: Enter your text and choose a model to generate a professional rewrite.
425
+ 2. **Batch Processing**: Upload a CSV file with one article per row in the first column.
426
+ 3. **Model Options**:
427
+ - **Gemini**: Google's advanced language model
428
+ - **LLaMA-3-70b**: Large Meta model (70B parameters)
429
+ - **LLaMA-3-8b**: Smaller Meta model (8B parameters)
430
+
431
+ ## 📊 Evaluation Metrics
432
+
433
+ The system evaluates performance using multiple metrics:
434
+ - **Traditional**: BLEU, ROUGE, METEOR (n-gram overlap)
435
+ - **Semantic**: BERTScore (embedding similarity)
436
+ - **LLM-as-Judge**: AnswerRelevancy, Faithfulness, GEval
437
+ - **Final Score**: Weighted average of all metrics (0-1 scale)
438
  """)
439
 
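To make the "Final Score" concrete, a minimal sketch of the normalization and weighting step described above, using assumed metric values and only the ranges and weights visible in this diff (the collapsed metrics follow the same pattern):
ranges = {"AnswerRelevancy": (0.0, 1.0), "Faithfulness": (0.0, 1.0), "METEOR": (0.0, 0.6)}
weights = {"AnswerRelevancy": 0.10, "Faithfulness": 0.10, "METEOR": 0.15}
scores = {"AnswerRelevancy": 0.80, "Faithfulness": 0.90, "METEOR": 0.45}  # assumed example values

normalized = {}
for metric, value in scores.items():
    lo, hi = ranges[metric]
    # Scale into [0, 1] and clamp, as evaluate_metrics does above.
    normalized[metric] = max(min((value - lo) / (hi - lo), 1.0), 0.0)

weighted = sum(normalized[m] * w for m, w in weights.items()) / sum(weights.values())
print(round(weighted, 3))  # ~0.807 for these assumed values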
440
  return demo
 
442
  # Launch the app
443
  if __name__ == "__main__":
444
  app = create_gradio_interface()
445
+ app.launch(share=True)
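The removed version's comment recommended keeping API keys in environment variables for production rather than in source; a minimal sketch of that approach for the hard-coded placeholders in this commit (the variable names are assumptions):
import os
import google.generativeai as genai
from groq import Groq

# Assumed environment variable names; export them before launching the app.
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY", "")
GROQ_API_KEY = os.environ.get("GROQ_API_KEY", "")

genai.configure(api_key=GEMINI_API_KEY)
groq_client = Groq(api_key=GROQ_API_KEY)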