Sa-m committed on
Commit
f9128f8
·
verified ·
1 Parent(s): 3beb0c2

Update app.py

Files changed (1)
  1. app.py +576 -340
app.py CHANGED
@@ -3,33 +3,71 @@ import pandas as pd
3
  import numpy as np
4
  import re
5
  import unicodedata
6
- from typing import Dict, List, Tuple
7
  import ftfy
8
  import nltk
9
- from bert_score import score as bert_score
10
- from rouge_score import rouge_scorer
 
 
11
  from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
 
 
12
  from nltk.translate.meteor_score import meteor_score
13
- from deepeval.test_case import LLMTestCase
14
- from deepeval.metrics import AnswerRelevancyMetric, FaithfulnessMetric, GEval
15
- from deepeval.models import DeepEvalBaseLLM
16
  import google.generativeai as genai
17
  from groq import Groq
18
- import os
19
- from io import StringIO
20
 
21
- # Download required NLTK data
22
  nltk.download('punkt', quiet=True)
23
  nltk.download('wordnet', quiet=True)
24
 
 
 
25
26
 
27
- # Initialize APIs
28
- genai.configure(api_key=GEMINI_API_KEY)
29
- groq_client = Groq(api_key=GROQ_API_KEY)
30
31
  class LLMProvider:
32
- """Abstract base class for LLM providers"""
33
  def __init__(self, model_name: str):
34
  self.model_name = model_name
35
 
@@ -40,404 +78,602 @@ class LLMProvider:
40
  return self.model_name
41
 
42
  class GeminiProvider(LLMProvider):
43
- """Gemini implementation"""
44
- def __init__(self, model_name: str = "gemini-1.5-flash"):
45
  super().__init__(model_name)
46
- self.model = genai.GenerativeModel(model_name)
47
 
48
  def generate(self, prompt: str) -> str:
 
 
 
49
  try:
50
  response = self.model.generate_content(prompt)
51
- return response.text.strip()
52
  except Exception as e:
53
  return f"Error generating with Gemini: {str(e)}"
54
 
55
  class GroqProvider(LLMProvider):
56
- """Groq implementation for LLaMA models"""
57
  def __init__(self, model_name: str = "llama3-70b-8192"):
58
  super().__init__(model_name)
 
59
 
60
  def generate(self, prompt: str) -> str:
 
 
 
61
  try:
62
  chat_completion = groq_client.chat.completions.create(
63
  messages=[
64
  {"role": "user", "content": prompt}
65
  ],
66
  model=self.model_name,
67
- temperature=0.7,
68
- max_tokens=2048
69
  )
70
- return chat_completion.choices[0].message.content.strip()
71
  except Exception as e:
72
  return f"Error generating with Groq: {str(e)}"
73
 
74
- class DeepEvalLLMWrapper(DeepEvalBaseLLM):
75
- """Wrapper for DeepEval to work with our providers"""
76
- def __init__(self, provider: LLMProvider):
77
- self.provider = provider
78
 
79
- def load_model(self):
80
- return self.provider
81
 
82
- def generate(self, prompt: str) -> str:
83
- return self.provider.generate(prompt)
 
 
84
 
85
- def get_model_name(self) -> str:
86
- return self.provider.get_model_name()
 
87
 
88
- def clean_text(text: str) -> str:
89
- """Clean text by fixing encoding and normalizing"""
90
- if not text or not isinstance(text, str):
91
- return ""
92
-
93
- # Fix encoding artifacts
94
- text = ftfy.fix_text(text)
95
- text = unicodedata.normalize('NFKD', text)
96
 
97
- # Fix quotes and other common issues
98
- text = text.replace('â€œ', '"').replace('â€', '"')
99
- text = text.replace('â€“', '-').replace('â€”', '-')
100
- text = text.replace('â€˜', "'").replace('â€™', "'")
101
 
102
- # Remove non-ASCII characters
103
- text = re.sub(r'[^\x00-\x7F]+', ' ', text)
 
 
 
 
104
 
105
- # Normalize whitespace
106
- text = ' '.join(text.split())
 
107
 
108
- return text.strip()
109
-
110
- def evaluate_metrics(input_text: str, candidate_text: str, reference_text: str) -> Dict:
111
- """Run comprehensive evaluation on the generated text"""
112
 
113
- # Clean the texts
114
- cleaned_input = clean_text(input_text)
115
- cleaned_candidate = clean_text(candidate_text)
116
- cleaned_reference = clean_text(reference_text)
117
 
 
118
  results = {}
119
 
120
- # Traditional metrics
121
  try:
122
- # BLEU Score
123
  smooth = SmoothingFunction().method4
124
- bleu_score = sentence_bleu(
125
- [cleaned_reference.split()],
126
  cleaned_candidate.split(),
127
  smoothing_function=smooth
128
  )
129
- results["BLEU"] = bleu_score
130
-
131
- # ROUGE Score
132
- rouge_scorer_obj = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
133
- rouge_scores = rouge_scorer_obj.score(cleaned_reference, cleaned_candidate)
134
- rouge_avg = (rouge_scores['rouge1'].fmeasure +
135
- rouge_scores['rouge2'].fmeasure +
136
- rouge_scores['rougeL'].fmeasure) / 3
137
- results["ROUGE"] = rouge_avg
138
-
139
- # METEOR Score
140
- meteor = meteor_score([cleaned_reference.split()], cleaned_candidate.split())
141
- results["METEOR"] = meteor
142
-
143
- # BERT Score
144
- P, R, F1 = bert_score([cleaned_candidate], [cleaned_reference], lang="en", verbose=False)
145
- results["BERTScore"] = F1.item()
146
-
147
  except Exception as e:
148
- results["Error"] = f"Traditional metrics error: {str(e)}"
 
149
 
150
- # LLM-as-judge metrics (using Gemini for consistency)
151
  try:
152
- judge_provider = GeminiProvider("gemini-1.5-flash")
153
- judge_wrapper = DeepEvalLLMWrapper(judge_provider)
154
-
155
- test_case = LLMTestCase(
156
- input=cleaned_input,
157
- actual_output=cleaned_candidate,
158
- expected_output=cleaned_reference
159
  )
160
-
161
- # Answer Relevancy
162
- answer_rel = AnswerRelevancyMetric(model=judge_wrapper)
163
- answer_rel.measure(test_case)
164
- results["AnswerRelevancy"] = answer_rel.score
165
-
166
- # Faithfulness
167
- faith = FaithfulnessMetric(model=judge_wrapper)
168
- faith.measure(test_case)
169
- results["Faithfulness"] = faith.score
170
-
171
- # GEval
172
- geval = GEval(
173
- name="OverallQuality",
174
- criteria="Evaluate if the candidate response is accurate, complete, and well-written.",
175
- evaluation_params=[
176
- "input", "actual_output", "expected_output"
177
- ],
178
- model=judge_wrapper
179
  )
180
- geval.measure(test_case)
181
- results["GEval"] = geval.score
182
-
183
  except Exception as e:
184
- results["LLM_Judge_Error"] = f"LLM-as-judge metrics error: {str(e)}"
185
-
186
- # Normalization and Hybrid Score
187
- normalization_ranges = {
188
- "AnswerRelevancy": (0.0, 1.0),
189
- "Faithfulness": (0.0, 1.0),
190
- "GEval": (0.0, 1.0),
191
- "BERTScore": (0.7, 0.95),
192
- "ROUGE": (0.0, 0.6),
193
- "BLEU": (0.0, 0.4),
194
- "METEOR": (0.0, 0.6)
195
- }
196
 
197
- weights = {
198
- "AnswerRelevancy": 0.10,
199
- "Faithfulness": 0.10,
200
- "GEval": 0.025,
201
- "BERTScore": 0.20,
202
- "ROUGE": 0.15,
203
- "BLEU": 0.025,
204
- "METEOR": 0.15
205
- }
206
 
207
- # Normalize scores
208
- normalized_scores = {}
209
- for metric, value in results.items():
210
- if metric in normalization_ranges and isinstance(value, (int, float)):
211
- min_v, max_v = normalization_ranges[metric]
212
- if max_v > min_v: # Avoid division by zero
213
- norm = max(min((value - min_v) / (max_v - min_v), 1.0), 0.0)
214
- normalized_scores[metric] = norm
215
- else:
216
- normalized_scores[metric] = 0.5
217
- elif isinstance(value, (int, float)):
218
- normalized_scores[metric] = value
219
-
220
- # Calculate weighted average
221
- if normalized_scores:
222
- weighted_sum = sum(normalized_scores.get(m, 0) * w for m, w in weights.items())
223
- total_weight = sum(w for m, w in weights.items() if m in normalized_scores)
224
- results["WeightedAverage"] = weighted_sum / total_weight if total_weight > 0 else 0.0
225
  else:
226
- results["WeightedAverage"] = 0.0
227
 
228
- return results
229
 
230
- def process_single_text(input_text: str, model_choice: str) -> Tuple[str, str, Dict]:
231
- """Process a single text input"""
232
- if not input_text or len(input_text.strip()) < 10:
233
- return "", "", {"Error": "Input text too short"}
234
 
235
- # Choose model
236
- if model_choice == "Gemini":
237
- provider = GeminiProvider("gemini-1.5-flash")
238
- elif model_choice == "LLaMA-3-70b":
239
- provider = GroqProvider("llama3-70b-8192")
240
- else: # LLaMA-3-8b
241
- provider = GroqProvider("llama3-8b-8192")
242
-
243
- # Generate candidate
244
- prompt = f"""Rewrite the following paragraph in a fresh, concise, and professional style while preserving its full meaning and key information:
245
-
246
- {input_text}
247
-
248
- Provide only the rewritten text without any additional commentary."""
249
 
250
- candidate = provider.generate(prompt)
251
 
252
- # Use cleaned input as reference (simulating human-quality standard)
253
- reference = clean_text(input_text)
 
254
 
255
- # Evaluate
256
- scores = evaluate_metrics(input_text, candidate, reference)
257
 
258
- return candidate, reference, scores
259
-
260
- def process_file(file_obj, model_choice: str) -> Tuple[pd.DataFrame, str]:
261
- """Process a CSV file with multiple articles"""
262
- try:
263
- # Read the file
264
- content = file_obj.read().decode('utf-8')
265
- df = pd.read_csv(StringIO(content))
266
-
267
- # Assume first column is the text
268
- text_column = df.columns[0]
269
-
270
- results = []
271
-
272
- for idx, row in df.iterrows():
273
- text = str(row[text_column])
274
- candidate, reference, scores = process_single_text(text, model_choice)
275
 
276
- result_row = {
277
- 'Original_Text': text,
278
- 'Generated_Candidate': candidate,
279
- 'Reference_Text': reference
280
- }
281
- result_row.update(scores)
282
- results.append(result_row)
283
-
284
- results_df = pd.DataFrame(results)
285
- return results_df, "File processed successfully!"
286
-
287
- except Exception as e:
288
- return pd.DataFrame(), f"Error processing file: {str(e)}"
289
-
290
- def create_gradio_interface():
291
- """Create the Gradio interface"""
292
 
293
- with gr.Blocks(title="LLM Evaluation Framework") as demo:
294
- gr.Markdown("# πŸ“Š LLM Evaluation Framework for Professional Content Rewriting")
295
- gr.Markdown("Evaluate and compare LLM-generated content using multiple metrics. Choose between Gemini and LLaMA models.")
296
-
297
- with gr.Tabs():
298
- with gr.Tab("Single Text Processing"):
299
- with gr.Row():
300
- with gr.Column(scale=2):
301
- input_text = gr.Textbox(
302
- label="Input Text",
303
- placeholder="Enter the text you want to rewrite...",
304
- lines=10
305
- )
306
-
307
- model_choice_single = gr.Radio(
308
- ["Gemini", "LLaMA-3-70b", "LLaMA-3-8b"],
309
- label="Choose Model",
310
- value="Gemini"
311
- )
312
-
313
- submit_btn = gr.Button("Generate & Evaluate", variant="primary")
314
 
315
- with gr.Column(scale=3):
316
- gr.Markdown("### Results")
317
-
318
- with gr.Tabs():
319
- with gr.Tab("Generated Text"):
320
- candidate_output = gr.Textbox(
321
- label="Generated Candidate",
322
- lines=10,
323
- show_copy_button=True
324
- )
325
- reference_output = gr.Textbox(
326
- label="Reference Text (Cleaned Input)",
327
- lines=5,
328
- show_copy_button=True
329
- )
330
-
331
- with gr.Tab("Evaluation Scores"):
332
- scores_output = gr.JSON(label="Detailed Scores")
333
-
334
- weighted_avg = gr.Number(
335
- label="Weighted Average Score (0-1)",
336
- precision=4
337
- )
338
-
339
- interpretation = gr.Textbox(
340
- label="Interpretation",
341
- interactive=False
342
- )
343
-
344
- with gr.Tab("Batch Processing (CSV File)"):
345
- with gr.Row():
346
- with gr.Column(scale=1):
347
- file_input = gr.File(
348
- label="Upload CSV File",
349
- file_types=['.csv']
350
- )
351
-
352
- model_choice_file = gr.Radio(
353
- ["Gemini", "LLaMA-3-70b", "LLaMA-3-8b"],
354
- label="Choose Model for Batch Processing",
355
- value="Gemini"
356
- )
357
-
358
- process_file_btn = gr.Button("Process File", variant="primary")
359
 
360
- with gr.Column(scale=2):
361
- gr.Markdown("### Results")
362
- file_results = gr.Dataframe(
363
- label="Evaluation Results",
364
- interactive=False
365
- )
366
- file_status = gr.Textbox(label="Status")
367
-
368
- # Examples
369
- gr.Examples(
370
- examples=[
371
- ["The immune system plays a crucial role in protecting the human body from pathogens such as bacteria, viruses, and other harmful invaders. It is composed of innate and adaptive components that work together to detect and eliminate foreign threats.", "Gemini"],
372
- ["Climate change is one of the most pressing challenges facing humanity today. Rising global temperatures have led to severe weather patterns, including more intense storms, droughts, and heatwaves.", "LLaMA-3-70b"]
373
- ],
374
- inputs=[input_text, model_choice_single],
375
- outputs=[candidate_output, reference_output, scores_output, weighted_avg, interpretation]
376
- )
377
-
378
- # Event handlers
379
- def handle_single_process(text, model):
380
- if not text:
381
- return "", "", {}, 0, "Please enter some text."
382
 
383
- candidate, reference, scores = process_single_text(text, model)
 
384
 
385
- # Get weighted average
386
- weighted_avg_val = scores.get("WeightedAverage", 0)
387
 
388
- # Interpretation
389
- if weighted_avg_val >= 0.85:
390
- interpretation_text = "βœ… Outstanding performance (A) - ready for professional use"
391
- elif weighted_avg_val >= 0.70:
392
- interpretation_text = "βœ… Strong performance (B) - good quality with minor improvements"
393
- elif weighted_avg_val >= 0.50:
394
- interpretation_text = "⚠️ Adequate performance (C) - usable but needs refinement"
395
- elif weighted_avg_val >= 0.30:
396
- interpretation_text = "❌ Weak performance (D) - requires significant revision"
397
  else:
398
- interpretation_text = "❌ Poor performance (F) - likely needs complete rewriting"
399
 
400
- return candidate, reference, scores, weighted_avg_val, interpretation_text
401
-
402
- def handle_file_process(file, model):
403
- if file is None:
404
- return pd.DataFrame(), "Please upload a file."
405
- return process_file(file, model)
406
-
407
- submit_btn.click(
408
- fn=handle_single_process,
409
- inputs=[input_text, model_choice_single],
410
- outputs=[candidate_output, reference_output, scores_output, weighted_avg, interpretation]
411
- )
412
-
413
- process_file_btn.click(
414
- fn=handle_file_process,
415
- inputs=[file_input, model_choice_file],
416
- outputs=[file_results, file_status]
417
- )
418
419
  gr.Markdown("""
420
- ## 📝 How to Use
421
 
422
- 1. **Single Text Processing**: Enter your text and choose a model to generate a professional rewrite.
423
- 2. **Batch Processing**: Upload a CSV file with one article per row in the first column.
424
- 3. **Model Options**:
425
- - **Gemini**: Google's advanced language model
426
- - **LLaMA-3-70b**: Large Meta model (70B parameters)
427
- - **LLaMA-3-8b**: Smaller Meta model (8B parameters)
428
 
429
- ## 📊 Evaluation Metrics
 
 
 
 
430
 
431
- The system evaluates performance using multiple metrics:
432
- - **Traditional**: BLEU, ROUGE, METEOR (n-gram overlap)
433
- - **Semantic**: BERTScore (embedding similarity)
434
- - **LLM-as-Judge**: AnswerRelevancy, Faithfulness, GEval
435
- - **Final Score**: Weighted average of all metrics (0-1 scale)
436
  """)
437
-
438
- return demo
439
 
440
  # Launch the app
441
  if __name__ == "__main__":
442
- app = create_gradio_interface()
443
- app.launch(share=True)
 
 
 
 
3
  import numpy as np
4
  import re
5
  import unicodedata
 
6
  import ftfy
7
  import nltk
8
+ import os
9
+ import json
10
+ import time
11
+ from typing import Dict, Any, List, Tuple, Optional
12
  from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
13
+ from rouge_score import rouge_scorer
14
+ from bert_score import score as bert_score
15
  from nltk.translate.meteor_score import meteor_score
 
 
 
16
  import google.generativeai as genai
17
  from groq import Groq
18
+ from dotenv import load_dotenv
 
19
 
20
+ # Download necessary NLTK resources
21
  nltk.download('punkt', quiet=True)
22
  nltk.download('wordnet', quiet=True)
23
 
24
+ # Load environment variables
25
+ load_dotenv()
26
 
27
+ # Initialize API clients (with graceful fallback if keys missing)
28
+ try:
29
+ GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
30
+ if GEMINI_API_KEY:
31
+ genai.configure(api_key=GEMINI_API_KEY)
32
+ else:
33
+ print("Warning: GEMINI_API_KEY not found in environment variables")
34
+ except Exception as e:
35
+ print(f"Error configuring Gemini: {str(e)}")
36
+ GEMINI_API_KEY = None
37
 
38
+ try:
39
+ GROQ_API_KEY = os.getenv("GROQ_API_KEY")
40
+ if GROQ_API_KEY:
41
+ groq_client = Groq(api_key=GROQ_API_KEY)
42
+ else:
43
+ print("Warning: GROQ_API_KEY not found in environment variables")
44
+ groq_client = None
45
+ except Exception as e:
46
+ print(f"Error configuring Groq: {str(e)}")
47
+ groq_client = None
48
 
49
+ # Text cleaning function
50
+ def clean_text(text: str) -> str:
51
+ """Clean text by fixing encoding issues and standardizing format"""
52
+ if not isinstance(text, str) or not text.strip():
53
+ return ""
54
+
55
+ text = ftfy.fix_text(text) # Fixes encoding artifacts
56
+ text = unicodedata.normalize('NFKD', text)
57
+ # Replace common smart quotes and dashes
58
+ replacements = {
59
+ 'â€œ': '"', 'â€': '"', 'â€“': '-', 'â€”': '--',
60
+ 'â€¢': '*', 'â€¦': '...', 'Â': ''
61
+ }
62
+ for old, new in replacements.items():
63
+ text = text.replace(old, new)
64
+ # Remove non-ASCII characters
65
+ text = re.sub(r'[^\x00-\x7F]+', '', text)
66
+ # Normalize whitespace
67
+ return ' '.join(text.split())
68
+
69
+ # LLM Provider classes
70
  class LLMProvider:
 
71
  def __init__(self, model_name: str):
72
  self.model_name = model_name
73
 
 
78
  return self.model_name
79
 
80
  class GeminiProvider(LLMProvider):
81
+ def __init__(self, model_name: str = "gemini-1.5-flash-latest"):
 
82
  super().__init__(model_name)
83
+ self.available = bool(GEMINI_API_KEY)
84
+ if self.available:
85
+ try:
86
+ self.model = genai.GenerativeModel(model_name)
87
+ except Exception as e:
88
+ print(f"Error initializing Gemini model: {str(e)}")
89
+ self.available = False
90
 
91
  def generate(self, prompt: str) -> str:
92
+ if not self.available:
93
+ return "Error: Gemini API not configured properly. Check your API key."
94
+
95
  try:
96
  response = self.model.generate_content(prompt)
97
+ return response.text
98
  except Exception as e:
99
  return f"Error generating with Gemini: {str(e)}"
100
 
101
  class GroqProvider(LLMProvider):
 
102
  def __init__(self, model_name: str = "llama3-70b-8192"):
103
  super().__init__(model_name)
104
+ self.available = bool(groq_client)
105
 
106
  def generate(self, prompt: str) -> str:
107
+ if not self.available:
108
+ return "Error: Groq API not configured properly. Check your API key."
109
+
110
  try:
111
  chat_completion = groq_client.chat.completions.create(
112
  messages=[
113
  {"role": "user", "content": prompt}
114
  ],
115
  model=self.model_name,
116
+ temperature=0.3
 
117
  )
118
+ return chat_completion.choices[0].message.content
119
  except Exception as e:
120
  return f"Error generating with Groq: {str(e)}"
121
 
122
+ # Prompt templates
123
+ PROMPT_TEMPLATES = {
124
+ "Strategic Narrative Architect": """Role: Strategic Narrative Architect
125
+ You are a professional content writer with expertise in creating engaging, well-structured narratives.
126
+ Your task is to rewrite the following text in a professional, engaging style while preserving all key facts and information:
127
+
128
+ {text}
129
+
130
+ Instructions:
131
+ 1. Maintain all factual information and key details
132
+ 2. Improve structure and flow for better readability
133
+ 3. Enhance engagement through appropriate storytelling techniques
134
+ 4. Use professional language appropriate for the content domain
135
+ 5. Ensure the output is concise yet comprehensive
136
+
137
+ Rewritten content:""",
138
 
139
+ "Precision Storyteller": """Role: Precision Storyteller
140
+ You are a professional editor focused on accuracy, clarity, and precision.
141
+ Your task is to rewrite the following text with maximum factual accuracy while improving clarity:
142
+
143
+ {text}
144
+
145
+ Instructions:
146
+ 1. Preserve all factual information with absolute precision
147
+ 2. Correct any grammatical errors or awkward phrasing
148
+ 3. Ensure logical flow and coherence
149
+ 4. Use clear, concise language without unnecessary embellishment
150
+ 5. Maintain professional tone appropriate for the content domain
151
+
152
+ Rewritten content:"""
153
+ }
154
+
155
+ # Metric normalization ranges
156
+ NORMALIZATION_RANGES = {
157
+ "AnswerRelevancy": (0.0, 1.0),
158
+ "Faithfulness": (0.0, 1.0),
159
+ "GEval": (0.0, 1.0),
160
+ "BERTScore": (0.7, 0.95),
161
+ "ROUGE": (0.0, 0.6),
162
+ "BLEU": (0.0, 0.4),
163
+ "METEOR": (0.0, 0.6)
164
+ }
165
+
166
+ # Metric weights
167
+ METRIC_WEIGHTS = {
168
+ "AnswerRelevancy": 0.10,
169
+ "Faithfulness": 0.10,
170
+ "GEval": 0.025,
171
+ "BERTScore": 0.20,
172
+ "ROUGE": 0.15,
173
+ "BLEU": 0.025,
174
+ "METEOR": 0.15
175
+ }
176
+
177
+ def normalize_score(metric: str, value: float) -> float:
178
+ """Normalize score to 0-1 scale based on metric's natural range"""
179
+ if metric not in NORMALIZATION_RANGES or not isinstance(value, (int, float)):
180
+ return value
181
 
182
+ min_val, max_val = NORMALIZATION_RANGES[metric]
183
+ # Handle edge cases
184
+ if max_val <= min_val:
185
+ return 0.5 # Default middle value if range is invalid
186
 
187
+ # Normalize and clamp to [0,1]
188
+ normalized = (value - min_val) / (max_val - min_val)
189
+ return max(0.0, min(normalized, 1.0))
190
 
191
+ def calculate_weighted_score(scores: Dict[str, float]) -> float:
192
+ """Calculate weighted average of normalized scores"""
193
+ normalized_scores = {m: normalize_score(m, v) for m, v in scores.items()}
194
+ total_weight = 0
195
+ weighted_sum = 0
 
 
 
196
 
197
+ for metric, weight in METRIC_WEIGHTS.items():
198
+ if metric in normalized_scores:
199
+ weighted_sum += normalized_scores[metric] * weight
200
+ total_weight += weight
201
 
202
+ return weighted_sum / total_weight if total_weight > 0 else 0
203
+
204
+ def evaluate_text(raw_input: str, model_provider: LLMProvider, prompt_template: str) -> Dict[str, Any]:
205
+ """Evaluate a single text using the selected model and prompt"""
206
+ # Create clean reference text
207
+ reference_text = clean_text(raw_input)
208
 
209
+ # Generate candidate using the selected model and prompt
210
+ prompt = prompt_template.replace("{text}", raw_input)
211
+ candidate = model_provider.generate(prompt)
212
 
213
+ # Clean candidate output for consistent evaluation
214
+ cleaned_candidate = clean_text(candidate)
 
 
215
 
216
+ # Initialize evaluation metrics
217
+ scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
 
 
218
 
219
+ # Calculate traditional metrics
220
  results = {}
221
 
222
+ # BLEU Score
223
  try:
 
224
  smooth = SmoothingFunction().method4
225
+ bleu = sentence_bleu(
226
+ [reference_text.split()],
227
  cleaned_candidate.split(),
228
  smoothing_function=smooth
229
  )
230
+ results["BLEU"] = bleu
231
  except Exception as e:
232
+ print(f"BLEU error: {str(e)}")
233
+ results["BLEU"] = 0.0
234
 
235
+ # ROUGE Score
236
  try:
237
+ rouge_scores = scorer.score(reference_text, cleaned_candidate)
238
+ rouge = (rouge_scores['rouge1'].fmeasure +
239
+ rouge_scores['rouge2'].fmeasure +
240
+ rouge_scores['rougeL'].fmeasure) / 3
241
+ results["ROUGE"] = rouge
242
+ except Exception as e:
243
+ print(f"ROUGE error: {str(e)}")
244
+ results["ROUGE"] = 0.0
245
+
246
+ # METEOR Score
247
+ try:
248
+ meteor = meteor_score(
249
+ [reference_text.split()],
250
+ cleaned_candidate.split()
251
  )
252
+ results["METEOR"] = meteor
253
+ except Exception as e:
254
+ print(f"METEOR error: {str(e)}")
255
+ results["METEOR"] = 0.0
256
+
257
+ # BERTScore
258
+ try:
259
+ P, R, F1 = bert_score(
260
+ [cleaned_candidate],
261
+ [reference_text],
262
+ lang="en",
263
+ verbose=False
264
  )
265
+ results["BERTScore"] = F1.item()
 
 
266
  except Exception as e:
267
+ print(f"BERTScore error: {str(e)}")
268
+ results["BERTScore"] = 0.7 # Default low value
269
 
270
+ # LLM-as-judge metrics - simplified implementation since DeepEval might not be available
271
+ try:
272
+ # Use Gemini as judge if available
273
+ if GEMINI_API_KEY:
274
+ judge_model = GeminiProvider("gemini-1.5-flash-latest")
275
+
276
+ # Answer Relevancy
277
+ relevancy_prompt = f"""
278
+ On a scale of 0.0 to 1.0, how relevant is the following candidate text to the input?
279
+
280
+ Input: {raw_input[:500]}{'...' if len(raw_input) > 500 else ''}
281
+ Candidate: {cleaned_candidate[:500]}{'...' if len(cleaned_candidate) > 500 else ''}
282
+
283
+ Provide only a single number between 0.0 and 1.0 with no explanation.
284
+ """
285
+ relevancy_response = judge_model.generate(relevancy_prompt)
286
+ try:
287
+ relevancy_score = float(relevancy_response.strip())
288
+ results["AnswerRelevancy"] = max(0.0, min(1.0, relevancy_score))
289
+ except:
290
+ results["AnswerRelevancy"] = 0.5
291
+
292
+ # Faithfulness
293
+ faithfulness_prompt = f"""
294
+ On a scale of 0.0 to 1.0, how faithful is the candidate text to the original input in terms of factual accuracy?
295
+
296
+ Input: {raw_input[:500]}{'...' if len(raw_input) > 500 else ''}
297
+ Candidate: {cleaned_candidate[:500]}{'...' if len(cleaned_candidate) > 500 else ''}
298
+
299
+ Provide only a single number between 0.0 and 1.0 with no explanation.
300
+ """
301
+ faithfulness_response = judge_model.generate(faithfulness_prompt)
302
+ try:
303
+ faithfulness_score = float(faithfulness_response.strip())
304
+ results["Faithfulness"] = max(0.0, min(1.0, faithfulness_score))
305
+ except:
306
+ results["Faithfulness"] = 0.5
307
+
308
+ # GEval
309
+ geval_prompt = f"""
310
+ On a scale of 0.0 to 1.0, evaluate the overall quality of the candidate text.
311
+ Consider accuracy, completeness, fluency, and professionalism.
312
+
313
+ Input: {raw_input[:500]}{'...' if len(raw_input) > 500 else ''}
314
+ Candidate: {cleaned_candidate[:500]}{'...' if len(cleaned_candidate) > 500 else ''}
315
+
316
+ Provide only a single number between 0.0 and 1.0 with no explanation.
317
+ """
318
+ geval_response = judge_model.generate(geval_prompt)
319
+ try:
320
+ geval_score = float(geval_response.strip())
321
+ results["GEval"] = max(0.0, min(1.0, geval_score))
322
+ except:
323
+ results["GEval"] = 0.5
324
+ else:
325
+ # Default values if no judge model available
326
+ results["AnswerRelevancy"] = 0.5
327
+ results["Faithfulness"] = 0.5
328
+ results["GEval"] = 0.5
329
+ except Exception as e:
330
+ print(f"LLM-as-judge error: {str(e)}")
331
+ # Default values if DeepEval fails
332
+ results["AnswerRelevancy"] = 0.5
333
+ results["Faithfulness"] = 0.5
334
+ results["GEval"] = 0.5
335
 
336
+ # Calculate normalized and weighted scores
337
+ normalized_scores = {m: normalize_score(m, v) for m, v in results.items()}
338
+ weighted_score = calculate_weighted_score(results)
339
+
340
+ # Determine interpretation
341
+ if weighted_score >= 0.85:
342
+ interpretation = "Outstanding performance (A) - ready for professional use"
343
+ elif weighted_score >= 0.70:
344
+ interpretation = "Strong performance (B) - good quality with minor improvements"
345
+ elif weighted_score >= 0.50:
346
+ interpretation = "Adequate performance (C) - usable but needs refinement"
347
+ elif weighted_score >= 0.30:
348
+ interpretation = "Weak performance (D) - requires significant revision"
349
  else:
350
+ interpretation = "Poor performance (F) - likely needs complete rewriting"
351
 
352
+ return {
353
+ "candidate": cleaned_candidate,
354
+ "metrics": results,
355
+ "normalized": normalized_scores,
356
+ "weighted_score": weighted_score,
357
+ "interpretation": interpretation
358
+ }
359
 
360
+ def process_input(input_text: str, file_upload, model_choice: str, prompt_choice: str) -> Tuple[str, List[List[str]], str]:
361
+ """Process either input text or uploaded file"""
362
+ if input_text and file_upload:
363
+ return "Please use either text input or file upload, not both.", [], ""
364
 
365
+ if not input_text and not file_upload:
366
+ return "Please provide input text or upload a file.", [], ""
367
 
368
+ # Determine model provider
369
+ if model_choice == "Gemini":
370
+ model_provider = GeminiProvider("gemini-1.5-flash-latest")
371
+ elif model_choice == "Llama-3-70b":
372
+ model_provider = GroqProvider("llama3-70b-8192")
373
+ else: # Llama-3-8b
374
+ model_provider = GroqProvider("llama3-8b-8192")
375
 
376
+ # Check if model is available
377
+ if not model_provider.available:
378
+ return f"Error: {model_choice} is not properly configured. Check your API key.", [], ""
379
 
380
+ # Get prompt template
381
+ prompt_template = PROMPT_TEMPLATES[prompt_choice]
382
 
383
+ # Process single text input
384
+ if input_text:
385
+ with gr.Progress() as progress:
386
+ progress(0.1, desc="Starting evaluation...")
387
+ time.sleep(0.2)
388
 
389
+ progress(0.3, desc="Generating rewritten content...")
390
+ time.sleep(0.2)
391
+
392
+ progress(0.6, desc="Calculating metrics...")
393
+ result = evaluate_text(input_text, model_provider, prompt_template)
394
+
395
+ progress(0.9, desc="Finalizing results...")
396
+ time.sleep(0.2)
397
+
398
+ # Format metrics for display
399
+ metrics_table = [
400
+ ["Metric", "Raw Score", "Normalized"],
401
+ ["AnswerRelevancy", f"{result['metrics']['AnswerRelevancy']:.4f}", f"{result['normalized']['AnswerRelevancy']:.4f}"],
402
+ ["Faithfulness", f"{result['metrics']['Faithfulness']:.4f}", f"{result['normalized']['Faithfulness']:.4f}"],
403
+ ["GEval", f"{result['metrics']['GEval']:.4f}", f"{result['normalized']['GEval']:.4f}"],
404
+ ["BERTScore", f"{result['metrics']['BERTScore']:.4f}", f"{result['normalized']['BERTScore']:.4f}"],
405
+ ["ROUGE", f"{result['metrics']['ROUGE']:.4f}", f"{result['normalized']['ROUGE']:.4f}"],
406
+ ["BLEU", f"{result['metrics']['BLEU']:.4f}", f"{result['normalized']['BLEU']:.4f}"],
407
+ ["METEOR", f"{result['metrics']['METEOR']:.4f}", f"{result['normalized']['METEOR']:.4f}"],
408
+ ["Weighted Score", f"{result['weighted_score']:.4f}", "N/A"]
409
+ ]
410
+
411
+ return (
412
+ result["candidate"],
413
+ metrics_table,
414
+ f"Hybrid Score: {result['weighted_score']:.4f} - {result['interpretation']}"
415
+ )
416
 
417
+ # Process file upload
418
+ if file_upload:
419
+ with gr.Progress() as progress:
420
+ progress(0.1, desc="Reading file...")
421
+ time.sleep(0.2)
422
+
423
+ # Read the file (assuming CSV with one column of text)
424
+ try:
425
+ df = pd.read_csv(file_upload.name)
426
+ progress(0.3, desc="Processing entries...")
427
+ time.sleep(0.2)
428
+ except Exception as e:
429
+ return f"Error reading file: {str(e)}", [], ""
430
+
431
+ # Assuming the first column contains the text
432
+ text_column = df.columns[0]
433
+ results = []
434
+ detailed_results = []
435
+
436
+ # Process each entry with progress updates
437
+ for i, row in df.iterrows():
438
+ progress((i + 1) / len(df) * 0.6 + 0.3, desc=f"Processing entry {i+1}/{len(df)}")
439
+ text = str(row[text_column])
440
+
441
+ try:
442
+ result = evaluate_text(text, model_provider, prompt_template)
443
 
444
+ # Add to results
445
+ results.append(result["weighted_score"])
 
446
 
447
+ # Store detailed results
448
+ detailed_results.append({
449
+ "input_preview": text[:100] + "..." if len(text) > 100 else text,
450
+ "weighted_score": result["weighted_score"],
451
+ "interpretation": result["interpretation"],
452
+ "candidate": result["candidate"]
453
+ })
454
+ except Exception as e:
455
+ print(f"Error processing entry {i}: {str(e)}")
456
+ results.append(0.0)
457
+ detailed_results.append({
458
+ "input_preview": text[:100] + "..." if len(text) > 100 else text,
459
+ "weighted_score": 0.0,
460
+ "interpretation": "Error processing this entry",
461
+ "candidate": ""
462
+ })
463
 
464
+ progress(0.9, desc="Generating summary...")
465
+ time.sleep(0.2)
466
 
467
+ # Create results dataframe
468
+ results_df = pd.DataFrame(detailed_results)
469
 
470
+ # Generate summary statistics
471
+ valid_scores = [s for s in results if s > 0]
472
+ if valid_scores:
473
+ avg_score = sum(valid_scores) / len(valid_scores)
474
+ min_score = min(valid_scores)
475
+ max_score = max(valid_scores)
476
+
477
+ if avg_score >= 0.85:
478
+ summary = "Excellent performance across inputs"
479
+ elif avg_score >= 0.70:
480
+ summary = "Good performance with room for minor improvements"
481
+ elif avg_score >= 0.50:
482
+ summary = "Adequate performance but needs refinement"
483
+ else:
484
+ summary = "Significant improvements needed"
485
+
486
+ # Format summary
487
+ summary_text = (
488
+ f"Processed {len(results)} entries ({len(valid_scores)} successful)\n"
489
+ f"Average Hybrid Score: {avg_score:.4f}\n"
490
+ f"Range: {min_score:.4f} - {max_score:.4f}\n\n"
491
+ f"{summary}"
492
+ )
493
+
494
+ # Create metrics table for summary
495
+ metrics_table = [
496
+ ["Metric", "Value"],
497
+ ["Entries Processed", f"{len(results)}"],
498
+ ["Successful Entries", f"{len(valid_scores)}"],
499
+ ["Average Score", f"{avg_score:.4f}"],
500
+ ["Best Score", f"{max_score:.4f}"],
501
+ ["Worst Score", f"{min_score:.4f}"],
502
+ ["Overall Assessment", summary]
503
+ ]
504
+
505
+ return (
506
+ "Batch processing complete. Use the 'Show Details' button to see individual results.",
507
+ metrics_table,
508
+ summary_text
509
+ )
510
  else:
511
+ return (
512
+ "No successful evaluations. Check your API configuration and input data.",
513
+ [["Error", "All evaluations failed"]],
514
+ "Error: No successful evaluations. Check your API configuration and input data."
515
+ )
516
+
517
+ def show_detailed_results(input_text, file_upload, model_choice, prompt_choice):
518
+ """Show detailed results for batch processing"""
519
+ if not file_upload:
520
+ return "No file uploaded for batch processing."
521
+
522
+ # Read the file
523
+ df = pd.read_csv(file_upload.name)
524
+ text_column = df.columns[0]
525
+
526
+ # Determine model provider
527
+ if model_choice == "Gemini":
528
+ model_provider = GeminiProvider("gemini-1.5-flash-latest")
529
+ elif model_choice == "Llama-3-70b":
530
+ model_provider = GroqProvider("llama3-70b-8192")
531
+ else: # Llama-3-8b
532
+ model_provider = GroqProvider("llama3-8b-8192")
533
+
534
+ # Get prompt template
535
+ prompt_template = PROMPT_TEMPLATES[prompt_choice]
536
+
537
+ # Process each entry
538
+ results = []
539
+ for _, row in df.iterrows():
540
+ text = str(row[text_column])
541
+ try:
542
+ result = evaluate_text(text, model_provider, prompt_template)
543
+ results.append({
544
+ "Input Preview": text[:100] + "..." if len(text) > 100 else text,
545
+ "Weighted Score": f"{result['weighted_score']:.4f}",
546
+ "Interpretation": result['interpretation'],
547
+ "Candidate Text": result['candidate']
548
+ })
549
+ except:
550
+ results.append({
551
+ "Input Preview": text[:100] + "..." if len(text) > 100 else text,
552
+ "Weighted Score": "Error",
553
+ "Interpretation": "Processing error",
554
+ "Candidate Text": ""
555
+ })
556
+
557
+ return gr.Dataframe(value=pd.DataFrame(results))
558
+
559
+ # Create Gradio interface
560
+ with gr.Blocks(title="LLM Evaluation Framework", theme=gr.themes.Soft()) as demo:
561
+ gr.Markdown("# πŸ“Š LLM Evaluation Framework for Professional Content Rewriting")
562
+ gr.Markdown("Evaluate the quality of LLM-generated content using multiple metrics with proper normalization.")
563
+
564
+ with gr.Row():
565
+ with gr.Column(scale=1):
566
+ gr.Markdown("### πŸ“₯ Input Options")
567
+ input_text = gr.Textbox(
568
+ label="Input Text",
569
+ lines=10,
570
+ placeholder="Enter text to evaluate...",
571
+ elem_id="input-text"
572
+ )
573
+ gr.Markdown("or")
574
+ file_upload = gr.File(
575
+ label="Upload CSV file (single column of text)",
576
+ file_types=[".csv", ".txt"],
577
+ elem_id="file-upload"
578
+ )
579
 
580
+ gr.Markdown("### βš™οΈ Configuration")
581
+ model_choice = gr.Radio(
582
+ ["Gemini", "Llama-3-70b", "Llama-3-8b"],
583
+ label="Select Model",
584
+ value="Gemini",
585
+ elem_id="model-choice"
586
+ )
587
+
588
+ prompt_choice = gr.Radio(
589
+ ["Strategic Narrative Architect", "Precision Storyteller"],
590
+ label="Select Prompt Template",
591
+ value="Strategic Narrative Architect",
592
+ elem_id="prompt-choice"
593
+ )
594
+
595
+ submit_btn = gr.Button("Evaluate", variant="primary", size="lg", elem_id="submit-btn")
 
 
596
 
597
+ with gr.Column(scale=2):
598
+ gr.Markdown("### ✍️ Rewritten Content")
599
+ candidate_output = gr.Textbox(
600
+ label="Rewritten Content",
601
+ lines=15,
602
+ elem_id="candidate-output"
603
+ )
604
+
605
+ gr.Markdown("### πŸ“ˆ Evaluation Metrics")
606
+ metrics_output = gr.Dataframe(
607
+ label="Evaluation Metrics",
608
+ interactive=False,
609
+ elem_id="metrics-output"
610
+ )
611
+
612
+ gr.Markdown("### πŸ“Œ Overall Assessment")
613
+ summary_output = gr.Textbox(
614
+ label="Summary",
615
+ elem_id="summary-output"
616
+ )
617
+
618
+ detailed_results_btn = gr.Button("Show Detailed Results (Batch)", visible=False)
619
+ detailed_results = gr.Dataframe(visible=False)
620
+
621
+ # Update visibility of detailed results button
622
+ def update_detailed_results_visibility(file_upload, summary):
623
+ has_file = file_upload is not None
624
+ has_batch_results = "Processed" in summary and "entries" in summary
625
+ return gr.update(visible=has_file and has_batch_results)
626
+
627
+ # Event handlers
628
+ submit_btn.click(
629
+ fn=process_input,
630
+ inputs=[input_text, file_upload, model_choice, prompt_choice],
631
+ outputs=[candidate_output, metrics_output, summary_output]
632
+ ).then(
633
+ fn=update_detailed_results_visibility,
634
+ inputs=[file_upload, summary_output],
635
+ outputs=detailed_results_btn
636
+ )
637
+
638
+ detailed_results_btn.click(
639
+ fn=show_detailed_results,
640
+ inputs=[input_text, file_upload, model_choice, prompt_choice],
641
+ outputs=detailed_results
642
+ ).then(
643
+ fn=lambda: gr.update(visible=True),
644
+ outputs=detailed_results
645
+ )
646
+
647
+ # Add interpretation guide in an accordion
648
+ with gr.Accordion("πŸ“š Interpretation Guide", open=False):
649
  gr.Markdown("""
650
+ ### Hybrid Score Interpretation
651
 
652
+ The Hybrid Score combines multiple evaluation metrics into a single score with proper normalization:
653
 
654
+ - **0.85+**: Outstanding performance (A) - ready for professional use
655
+ - **0.70-0.85**: Strong performance (B) - good quality with minor improvements
656
+ - **0.50-0.70**: Adequate performance (C) - usable but needs refinement
657
+ - **0.30-0.50**: Weak performance (D) - requires significant revision
658
+ - **<0.30**: Poor performance (F) - likely needs complete rewriting
659
 
660
+ ### Key Metrics Explained
661
+
662
+ | Metric | What It Measures | Why It Matters |
663
+ |--------|------------------|----------------|
664
+ | **AnswerRelevancy** | Is output on-topic with input? | Does the output stay on topic despite messy input? |
665
+ | **Faithfulness** | Are ALL facts preserved correctly? | Does it maintain accuracy when input has encoding errors? |
666
+ | **GEval** | Overall quality assessment by another AI | How professional does the output appear? |
667
+ | **BERTScore** | Semantic similarity to reference | How well does it capture the meaning of cleaned text? |
668
+ | **ROUGE** | Content overlap with reference | How much key information is preserved? |
669
+ | **BLEU** | Phrasing precision | How closely does wording match human-quality standard? |
670
+ | **METEOR** | Linguistic quality with synonyms | How natural does the cleaned output read? |
671
  """)
 
 
672
 
673
  # Launch the app
674
  if __name__ == "__main__":
675
+ demo.launch(
676
+ server_name="0.0.0.0",
677
+ server_port=7860,
678
+ share=True
679
+ )
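For reference, the hybrid score introduced in this commit reduces to: rescale each raw metric by its expected range, clamp to [0, 1], then take the weighted average over whichever metrics are present. Below is a minimal standalone sketch of that arithmetic; the ranges and weights mirror NORMALIZATION_RANGES and METRIC_WEIGHTS from the diff, while the function names (normalize, hybrid_score) and the sample metric values are illustrative assumptions, not part of the committed code.

from typing import Dict

# Ranges and weights mirrored from NORMALIZATION_RANGES / METRIC_WEIGHTS in the diff above.
RANGES = {
    "AnswerRelevancy": (0.0, 1.0), "Faithfulness": (0.0, 1.0), "GEval": (0.0, 1.0),
    "BERTScore": (0.7, 0.95), "ROUGE": (0.0, 0.6), "BLEU": (0.0, 0.4), "METEOR": (0.0, 0.6),
}
WEIGHTS = {
    "AnswerRelevancy": 0.10, "Faithfulness": 0.10, "GEval": 0.025,
    "BERTScore": 0.20, "ROUGE": 0.15, "BLEU": 0.025, "METEOR": 0.15,
}

def normalize(metric: str, value: float) -> float:
    # Rescale a raw score into [0, 1] using the metric's expected range, clamping outliers.
    lo, hi = RANGES[metric]
    return max(0.0, min((value - lo) / (hi - lo), 1.0))

def hybrid_score(raw: Dict[str, float]) -> float:
    # Weighted average over the metrics that are actually present.
    present = {m: normalize(m, v) for m, v in raw.items() if m in WEIGHTS}
    total = sum(WEIGHTS[m] for m in present)
    return sum(WEIGHTS[m] * s for m, s in present.items()) / total if total else 0.0

# Illustrative raw scores only: BERTScore 0.88 normalizes to (0.88 - 0.7) / 0.25 = 0.72.
sample = {"BLEU": 0.12, "ROUGE": 0.35, "METEOR": 0.30, "BERTScore": 0.88,
          "AnswerRelevancy": 0.80, "Faithfulness": 0.90, "GEval": 0.70}
print(f"Hybrid score: {hybrid_score(sample):.4f}")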