Sa-m committed
Commit 96b1e9b · verified · 1 Parent(s): 79e1931

Create app.py

Files changed (1)
  1. app.py +715 -0
app.py ADDED
@@ -0,0 +1,715 @@
import gradio as gr
import pandas as pd
import numpy as np
import re
import unicodedata
from typing import Dict, Tuple, List
import ftfy
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer
from nltk.translate.meteor_score import meteor_score
from bert_score import score as bert_score
from deepeval.test_case import LLMTestCase, LLMTestCaseParams
from deepeval.metrics import AnswerRelevancyMetric, FaithfulnessMetric, GEval
from deepeval.models import DeepEvalBaseLLM
import google.generativeai as genai
import tempfile
import os
from pathlib import Path
import logging

# Download required NLTK data
nltk.download('punkt', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('omw-1.4', quiet=True)  # needed by WordNet-based METEOR on newer NLTK versions

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Global variables for API keys (in production, use environment variables)
GEMINI_API_KEY = None  # Will be set from user input
CONFIDENT_API_KEY = None  # Will be set from user input

class LLMProvider:
    """Abstract base class for LLM providers"""
    def __init__(self, model):
        self.model = model

    def generate(self, prompt: str) -> str:
        raise NotImplementedError

    def get_model_name(self) -> str:
        raise NotImplementedError

class GeminiProvider(LLMProvider):
    """Gemini implementation"""
    def __init__(self, model_name="gemini-1.5-flash"):
        self.model_name = model_name
        genai.configure(api_key=GEMINI_API_KEY)
        self.model = genai.GenerativeModel(model_name)

    def generate(self, prompt: str) -> str:
        try:
            response = self.model.generate_content(prompt)
            return response.text.strip()
        except Exception as e:
            logger.error(f"Error generating content with Gemini: {e}")
            return f"Error: {str(e)}"

    def get_model_name(self) -> str:
        return self.model_name

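# Minimal usage sketch for the provider above (illustrative only; assumes
# GEMINI_API_KEY has already been set from user input):
#
#     provider = GeminiProvider()                # defaults to "gemini-1.5-flash"
#     draft = provider.generate("Rewrite: ...")  # returns text, or an "Error: ..." string on failure
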
class GroqProvider(LLMProvider):
    """Placeholder for Groq implementation"""
    def __init__(self, model_name="llama3-70b-8192"):
        self.model_name = model_name
        # Implementation would go here

    def generate(self, prompt: str) -> str:
        return "Groq implementation not available"

    def get_model_name(self) -> str:
        return self.model_name

class GeminiLLM(DeepEvalBaseLLM):
    """Wrapper for Gemini to work with DeepEval"""
    def __init__(self, model):
        self.model = model

    def load_model(self):
        return self.model

    def generate(self, prompt: str) -> str:
        return self.model.generate_content(prompt).text.strip()

    async def a_generate(self, prompt: str) -> str:
        return self.model.generate_content(prompt).text.strip()

    def get_model_name(self) -> str:
        return "gemini-1.5-flash"

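# How the wrapper is consumed by DeepEval's LLM-as-judge metrics (see
# evaluate_text below); sketch only:
#
#     judge = GeminiLLM(genai.GenerativeModel("gemini-1.5-flash"))
#     metric = AnswerRelevancyMetric(model=judge)
#     metric.measure(test_case)  # test_case is an LLMTestCase
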
def clean_text(text: str) -> str:
    """
    Clean text by fixing encoding artifacts and normalizing characters.

    Args:
        text (str): Input text to clean

    Returns:
        str: Cleaned text
    """
    if not text or not isinstance(text, str):
        return ""

    # Fix common encoding artifacts
    text = ftfy.fix_text(text)
    text = unicodedata.normalize('NFKD', text)

    # Replace smart quotes with standard ASCII quotes
    text = text.replace('“', '"').replace('”', '"')
    text = text.replace("‘", "'").replace("’", "'")

    # Remove non-ASCII characters (optional, can be toggled)
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)

    # Normalize whitespace
    text = ' '.join(text.split())

    return text

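# Illustrative behaviour of clean_text (hypothetical input string):
#
#     clean_text('He said “hello”   and left…')
#     # -> 'He said "hello" and left...'
#     # smart quotes become ASCII quotes, the ellipsis is decomposed by NFKD,
#     # remaining non-ASCII is replaced by spaces, and whitespace is collapsed
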
def create_prompts() -> Dict[str, str]:
    """
    Create different prompt variants for testing.

    Returns:
        Dict[str, str]: Dictionary of prompt names and their text
    """
    prompts = {
        "Strategic Narrative Architect": """Role: Strategic Narrative Architect
You are a professional content writer who transforms raw text into engaging, well-structured narratives.
Your goal is to rewrite the following text while preserving all key facts and statistics, but enhancing:
- Structure and flow
- Engagement and readability
- Professional tone
- Strategic storytelling

Guidelines:
1. Maintain all factual information and numerical data
2. Improve sentence structure for better readability
3. Use active voice where appropriate
4. Ensure professional tone suitable for publication
5. Add logical transitions between ideas
6. Keep the length similar to the original

Rewrite the following text:
{input_text}""",

        "Precision Storyteller": """Role: Precision Storyteller
You are a meticulous editor who ensures factual accuracy and clarity in all content.
Your goal is to rewrite the following text with maximum precision while maintaining:
- Factual accuracy above all
- Clarity and conciseness
- Proper grammar and punctuation
- Consistent terminology

Guidelines:
1. Preserve every fact, statistic, and detail from the original
2. Correct any grammatical errors or awkward phrasing
3. Use precise, unambiguous language
4. Avoid embellishment or subjective interpretation
5. Maintain neutral, professional tone
6. Ensure all claims are supported by the original text

Rewrite the following text:
{input_text}"""
    }

    return prompts

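# How a prompt variant is turned into a model call (sketch; `raw` is a placeholder):
#
#     prompts = create_prompts()
#     filled = prompts["Precision Storyteller"].format(input_text=raw)
#     candidate = gemini_model.generate_content(filled).text.strip()
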
def evaluate_text(input_text: str, candidate_text: str, reference_text: str,
                  judge_model) -> Dict[str, float]:
    """
    Evaluate the quality of a rewritten text using multiple metrics.

    Args:
        input_text (str): Original raw input text
        candidate_text (str): Generated candidate text
        reference_text (str): Cleaned reference text
        judge_model: Model for LLM-as-judge metrics

    Returns:
        Dict[str, float]: Dictionary of metric scores
    """
    results = {}

    try:
        # Initialize scorers
        bleu_smoothing = SmoothingFunction().method4
        rouge_scorer_obj = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

        # Tokenize for BLEU and METEOR
        reference_tokens = reference_text.split()
        candidate_tokens = candidate_text.split()

        # BLEU Score
        try:
            bleu_score_val = sentence_bleu([reference_tokens], candidate_tokens,
                                           smoothing_function=bleu_smoothing)
            results["BLEU"] = bleu_score_val
        except Exception as e:
            logger.warning(f"BLEU calculation failed: {e}")
            results["BLEU"] = 0.0

        # ROUGE Score
        try:
            rouge_scores = rouge_scorer_obj.score(reference_text, candidate_text)
            # Average of ROUGE-1, ROUGE-2, and ROUGE-L F1 scores
            rouge_avg = (rouge_scores['rouge1'].fmeasure +
                         rouge_scores['rouge2'].fmeasure +
                         rouge_scores['rougeL'].fmeasure) / 3
            results["ROUGE"] = rouge_avg
        except Exception as e:
            logger.warning(f"ROUGE calculation failed: {e}")
            results["ROUGE"] = 0.0

        # METEOR Score
        try:
            meteor_score_val = meteor_score([reference_tokens], candidate_tokens)
            results["METEOR"] = meteor_score_val
        except Exception as e:
            logger.warning(f"METEOR calculation failed: {e}")
            results["METEOR"] = 0.0

        # BERTScore
        try:
            P, R, F1 = bert_score([candidate_text], [reference_text], lang="en", verbose=False)
            results["BERTScore"] = F1.item()
        except Exception as e:
            logger.warning(f"BERTScore calculation failed: {e}")
            results["BERTScore"] = 0.0

        # LLM-as-judge metrics
        try:
            test_case = LLMTestCase(
                input=input_text,
                actual_output=candidate_text,
                expected_output=reference_text,
                retrieval_context=[reference_text]
            )

            # Answer Relevancy
            answer_rel = AnswerRelevancyMetric(model=judge_model)
            answer_rel.measure(test_case)
            results["AnswerRelevancy"] = answer_rel.score

            # Faithfulness
            faith = FaithfulnessMetric(model=judge_model)
            faith.measure(test_case)
            results["Faithfulness"] = faith.score

            # GEval (evaluation_params must be LLMTestCaseParams members, not plain strings)
            geval = GEval(
                name="OverallQuality",
                criteria="Evaluate if the candidate response is accurate, complete, and well-written.",
                evaluation_params=[
                    LLMTestCaseParams.INPUT,
                    LLMTestCaseParams.ACTUAL_OUTPUT,
                    LLMTestCaseParams.EXPECTED_OUTPUT
                ],
                model=judge_model,
                strict_mode=False
            )
            geval.measure(test_case)
            results["GEval"] = geval.score

        except Exception as e:
            logger.warning(f"LLM-as-judge metrics failed: {e}")
            # Set default values if LLM-as-judge fails
            results["AnswerRelevancy"] = 0.5
            results["Faithfulness"] = 0.5
            results["GEval"] = 0.5

    except Exception as e:
        logger.error(f"Error in evaluation: {e}")
        # Return default scores if everything fails
        default_metrics = ["BLEU", "ROUGE", "METEOR", "BERTScore",
                           "AnswerRelevancy", "Faithfulness", "GEval"]
        for metric in default_metrics:
            results[metric] = 0.0

    return results

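# Shape of the dictionary returned by evaluate_text (values are illustrative):
#
#     {"BLEU": 0.18, "ROUGE": 0.42, "METEOR": 0.37, "BERTScore": 0.88,
#      "AnswerRelevancy": 0.90, "Faithfulness": 0.85, "GEval": 0.70}
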
def normalize_score(metric: str, value: float) -> float:
    """
    Normalize score to 0-1 scale based on metric's natural range.

    Args:
        metric (str): Name of the metric
        value (float): Raw score value

    Returns:
        float: Normalized score between 0 and 1
    """
    # Define natural ranges for each metric
    normalization_ranges = {
        "AnswerRelevancy": (0.0, 1.0),
        "Faithfulness": (0.0, 1.0),
        "GEval": (0.0, 1.0),
        "BERTScore": (0.7, 0.95),
        "ROUGE": (0.0, 0.6),
        "BLEU": (0.0, 0.4),
        "METEOR": (0.0, 0.6)
    }

    if metric not in normalization_ranges or not isinstance(value, (int, float)):
        return value

    min_val, max_val = normalization_ranges[metric]

    # Handle edge cases
    if max_val <= min_val:
        return 0.5  # Default middle value if range is invalid

    # Normalize and clamp to [0, 1]
    normalized = (value - min_val) / (max_val - min_val)
    return max(0.0, min(normalized, 1.0))

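# Worked example of the normalization above: a raw BLEU of 0.20 with the
# (0.0, 0.4) range maps to (0.20 - 0.0) / (0.4 - 0.0) = 0.5, while a raw
# BERTScore of 0.95 or higher clamps to 1.0.
#
#     normalize_score("BLEU", 0.20)       # -> 0.5
#     normalize_score("BERTScore", 0.97)  # -> 1.0
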
def calculate_weighted_score(scores: Dict[str, float]) -> float:
    """
    Calculate weighted average of normalized scores.

    Args:
        scores (Dict[str, float]): Dictionary of metric scores

    Returns:
        float: Weighted average score
    """
    # Define weights for each metric
    weights = {
        "AnswerRelevancy": 0.10,
        "Faithfulness": 0.10,
        "GEval": 0.025,
        "BERTScore": 0.20,
        "ROUGE": 0.15,
        "BLEU": 0.025,
        "METEOR": 0.15
    }

    normalized_scores = {m: normalize_score(m, v) for m, v in scores.items()}
    total_weight = 0
    weighted_sum = 0

    for metric, weight in weights.items():
        if metric in normalized_scores:
            weighted_sum += normalized_scores[metric] * weight
            total_weight += weight

    return weighted_sum / total_weight if total_weight > 0 else 0.0

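# Worked example for the weighted average (only two metrics present, so the
# result is divided by their combined weight of 0.225):
#
#     calculate_weighted_score({"BLEU": 0.20, "BERTScore": 0.85})
#     # normalized: BLEU -> 0.5 (weight 0.025), BERTScore -> 0.6 (weight 0.20)
#     # (0.5*0.025 + 0.6*0.20) / 0.225 ≈ 0.589
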
def process_single_text(input_text: str, gemini_api_key: str,
                        confident_api_key: str, progress=gr.Progress()) -> Tuple[Dict, List[Dict]]:
    """
    Process a single text input and return evaluation results.

    Args:
        input_text (str): Input text to evaluate
        gemini_api_key (str): Gemini API key
        confident_api_key (str): Confident API key for DeepEval
        progress: Gradio progress tracker

    Returns:
        Tuple[Dict, List[Dict]]: Summary results and detailed results for each prompt
    """
    global GEMINI_API_KEY, CONFIDENT_API_KEY

    # Set API keys
    GEMINI_API_KEY = gemini_api_key
    CONFIDENT_API_KEY = confident_api_key
    if confident_api_key:
        # so DeepEval's optional Confident AI integration can pick the key up from the environment
        os.environ["CONFIDENT_API_KEY"] = confident_api_key

    if not input_text or not input_text.strip():
        return {"error": "Please provide valid input text"}, []

    try:
        # Clean the input text to create reference
        progress(0.1, "Cleaning input text...")
        reference_text = clean_text(input_text)

        if not reference_text:
            return {"error": "Could not process the input text"}, []

        # Initialize Gemini model
        progress(0.2, "Initializing Gemini model...")
        try:
            genai.configure(api_key=GEMINI_API_KEY)
            gemini_model = genai.GenerativeModel("gemini-1.5-flash")
            judge = GeminiLLM(gemini_model)
        except Exception as e:
            return {"error": f"Failed to initialize Gemini: {str(e)}"}, []

        # Get prompts
        progress(0.3, "Generating candidate texts...")
        prompts = create_prompts()

        detailed_results = []

        # Process each prompt
        for prompt_name, prompt_template in prompts.items():
            progress(0.3 + 0.6 * (list(prompts.keys()).index(prompt_name) / len(prompts)),
                     f"Processing {prompt_name}...")

            # Generate candidate
            full_prompt = prompt_template.format(input_text=input_text)
            candidate_text = gemini_model.generate_content(full_prompt).text.strip()

            # Clean candidate text
            cleaned_candidate = clean_text(candidate_text)

            # Evaluate
            scores = evaluate_text(input_text, cleaned_candidate, reference_text, judge)

            # Calculate hybrid scores
            hybrid_avg = np.mean(list(scores.values()))
            weighted_avg = calculate_weighted_score(scores)

            # Add interpretation
            if weighted_avg >= 0.85:
                interpretation = "Outstanding performance (A) - ready for professional use"
            elif weighted_avg >= 0.70:
                interpretation = "Strong performance (B) - good quality with minor improvements"
            elif weighted_avg >= 0.50:
                interpretation = "Adequate performance (C) - usable but needs refinement"
            elif weighted_avg >= 0.30:
                interpretation = "Weak performance (D) - requires significant revision"
            else:
                interpretation = "Poor performance (F) - likely needs complete rewriting"

            detailed_results.append({
                "Prompt": prompt_name,
                "Original Input": input_text[:500] + "..." if len(input_text) > 500 else input_text,
                "Reference Text": reference_text[:500] + "..." if len(reference_text) > 500 else reference_text,
                "Candidate Text": cleaned_candidate,
                "Scores": scores,
                "Hybrid Average": hybrid_avg,
                "Weighted Average": weighted_avg,
                "Interpretation": interpretation
            })

        # Create summary
        summary = {
            "Total Prompts Evaluated": len(detailed_results),
            "Best Performing Prompt": max(detailed_results, key=lambda x: x["Weighted Average"])["Prompt"],
            "Highest Weighted Score": max(detailed_results, key=lambda x: x["Weighted Average"])["Weighted Average"],
            "Lowest Weighted Score": min(detailed_results, key=lambda x: x["Weighted Average"])["Weighted Average"]
        }

        progress(1.0, "Processing complete!")
        return summary, detailed_results

    except Exception as e:
        logger.error(f"Error processing text: {e}")
        return {"error": f"Processing failed: {str(e)}"}, []

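# Direct (non-UI) usage sketch; the key strings are placeholders:
#
#     summary, details = process_single_text(raw_text, "GEMINI_KEY", "CONFIDENT_KEY")
#     # summary -> {"Total Prompts Evaluated": 2, "Best Performing Prompt": ..., ...}
#     # details -> one dict per prompt variant with scores and interpretation
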
def process_uploaded_file(file_path: str, gemini_api_key: str,
                          confident_api_key: str, progress=gr.Progress()) -> Tuple[Dict, List[Dict]]:
    """
    Process an uploaded CSV/Excel file containing texts to evaluate.

    Args:
        file_path (str): Path to uploaded file
        gemini_api_key (str): Gemini API key
        confident_api_key (str): Confident API key for DeepEval
        progress: Gradio progress tracker

    Returns:
        Tuple[Dict, List[Dict]]: Summary results and detailed results
    """
    try:
        # Read file based on extension
        file_ext = Path(file_path).suffix.lower()

        if file_ext in ['.csv']:
            df = pd.read_csv(file_path)
        elif file_ext in ['.xls', '.xlsx']:
            df = pd.read_excel(file_path)
        else:
            return {"error": "Unsupported file format. Please upload a CSV or Excel file."}, []

        if df.empty:
            return {"error": "File is empty"}, []

        # Look for a text column (case-insensitive)
        text_column = None
        for col in df.columns:
            if 'text' in col.lower() or 'content' in col.lower() or 'article' in col.lower():
                text_column = col
                break

        if not text_column:
            # Use the first column if no text-like column is found
            text_column = df.columns[0]

        texts = df[text_column].dropna().astype(str).tolist()

        if not texts:
            return {"error": "No valid text data found in the file"}, []

        all_results = []
        summaries = []

        # Process each text
        for i, text in enumerate(texts):
            progress(i / len(texts), f"Processing text {i+1} of {len(texts)}...")
            summary, details = process_single_text(text, gemini_api_key, confident_api_key)
            if "error" not in summary:
                summaries.append(summary)
                all_results.extend(details)

        if not all_results:
            return {"error": "Failed to process any texts"}, []

        # Create overall summary; "best performing" is the prompt with the highest
        # mean weighted score across all processed texts
        results_df = pd.DataFrame(all_results)
        overall_summary = {
            "Total Texts Processed": len(texts),
            "Total Prompts Evaluated": len(all_results),
            "Average Weighted Score": float(np.mean([r["Weighted Average"] for r in all_results])),
            "Best Performing Prompt": results_df.groupby("Prompt")["Weighted Average"].mean().idxmax()
        }

        progress(1.0, "Batch processing complete!")
        return overall_summary, all_results

    except Exception as e:
        logger.error(f"Error processing file: {e}")
        return {"error": f"File processing failed: {str(e)}"}, []

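# Expected upload format (illustrative): a CSV or Excel file with a text-like
# column; any column whose name contains "text", "content" or "article" is used
# first, otherwise the first column is taken.
#
#     text
#     "First article body ..."
#     "Second article body ..."
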
def create_gradio_interface():
    """Create the Gradio interface."""

    with gr.Blocks(title="LLM Evaluation Framework") as demo:
        gr.Markdown("# 📊 LLM Evaluation Framework for Content Rewriting")
        gr.Markdown("Evaluate and compare different prompts for professional content rewriting tasks.")

        with gr.Tabs():
            with gr.Tab("Single Text Evaluation"):
                gr.Markdown("### Evaluate a single piece of text")

                with gr.Row():
                    with gr.Column(scale=2):
                        input_text = gr.Textbox(
                            label="Input Text",
                            placeholder="Paste your text here...",
                            lines=10
                        )

                    with gr.Column(scale=1):
                        gemini_api_key = gr.Textbox(
                            label="Gemini API Key",
                            placeholder="Enter your Gemini API key",
                            type="password"
                        )
                        confident_api_key = gr.Textbox(
                            label="Confident API Key (for DeepEval)",
                            placeholder="Enter your Confident API key",
                            type="password"
                        )
                        evaluate_btn = gr.Button("Evaluate Text", variant="primary")

                gr.Markdown("### Results")
                with gr.Row():
                    with gr.Column():
                        summary_output = gr.JSON(label="Summary Results")

                    with gr.Column():
                        detailed_output = gr.Dataframe(
                            label="Detailed Results",
                            headers=["Prompt", "Weighted Average", "Interpretation"],
                            datatype=["str", "number", "str"]
                        )

                # Hidden state for detailed data
                hidden_detailed_results = gr.State()

                def update_outputs(text, gemini_key, confident_key):
                    if not text.strip():
                        return {"error": "Please enter text"}, None, None

                    summary, detailed = process_single_text(text, gemini_key, confident_key)

                    if "error" in summary:
                        return summary, None, None

                    # Prepare dataframe data
                    df_data = []
                    for result in detailed:
                        df_data.append([
                            result["Prompt"],
                            round(result["Weighted Average"], 3),
                            result["Interpretation"]
                        ])

                    return summary, df_data, detailed

                evaluate_btn.click(
                    fn=update_outputs,
                    inputs=[input_text, gemini_api_key, confident_api_key],
                    outputs=[summary_output, detailed_output, hidden_detailed_results]
                )

                # Button to show full candidate texts
                with gr.Row():
                    show_details_btn = gr.Button("Show Full Results with Candidate Texts")

                full_results_output = gr.JSON(label="Full Detailed Results")

                def show_full_results(detailed_results):
                    if detailed_results is None:
                        return {"error": "No results to display"}
                    return detailed_results

                show_details_btn.click(
                    fn=show_full_results,
                    inputs=[hidden_detailed_results],
                    outputs=[full_results_output]
                )

            with gr.Tab("Batch File Evaluation"):
                gr.Markdown("### Evaluate multiple texts from a file")

                with gr.Row():
                    with gr.Column():
                        file_input = gr.File(
                            label="Upload CSV or Excel file",
                            file_types=['.csv', '.xls', '.xlsx']
                        )

                    with gr.Column():
                        batch_gemini_key = gr.Textbox(
                            label="Gemini API Key",
                            placeholder="Enter your Gemini API key",
                            type="password"
                        )
                        batch_confident_key = gr.Textbox(
                            label="Confident API Key (for DeepEval)",
                            placeholder="Enter your Confident API key",
                            type="password"
                        )
                        batch_evaluate_btn = gr.Button("Process File", variant="primary")

                gr.Markdown("### Batch Results")
                batch_summary_output = gr.JSON(label="Batch Summary Results")
                batch_detailed_output = gr.Dataframe(
                    label="Detailed Results",
                    headers=["Prompt", "Weighted Average", "Interpretation"],
                    datatype=["str", "number", "str"]
                )

                # Hidden state for batch results
                hidden_batch_results = gr.State()

                def process_file(file, gemini_key, confident_key):
                    if file is None:
                        return {"error": "Please upload a file"}, None, None

                    summary, detailed = process_uploaded_file(file.name, gemini_key, confident_key)

                    if "error" in summary:
                        return summary, None, None

                    # Prepare dataframe data
                    df_data = []
                    for result in detailed:
                        df_data.append([
                            result["Prompt"],
                            round(result["Weighted Average"], 3),
                            result["Interpretation"]
                        ])

                    return summary, df_data, detailed

                batch_evaluate_btn.click(
                    fn=process_file,
                    inputs=[file_input, batch_gemini_key, batch_confident_key],
                    outputs=[batch_summary_output, batch_detailed_output, hidden_batch_results]
                )

                # Button to show full batch results
                show_batch_details_btn = gr.Button("Show Full Batch Results")
                batch_full_results_output = gr.JSON(label="Full Batch Results")

                show_batch_details_btn.click(
                    fn=show_full_results,
                    inputs=[hidden_batch_results],
                    outputs=[batch_full_results_output]
                )

        gr.Markdown("""
## How to Use

1. **Single Text Evaluation**:
   - Enter your text in the input box
   - Provide your API keys
   - Click "Evaluate Text" to see results

2. **Batch File Evaluation**:
   - Upload a CSV or Excel file with a column containing text
   - Provide your API keys
   - Click "Process File" to evaluate all texts

### API Keys
- **Gemini API Key**: Get from Google AI Studio
- **Confident API Key**: Get from the DeepEval (Confident AI) dashboard

### Interpreting Results
- **Weighted Average**: Our primary metric combining all evaluations
- **Interpretation**: Performance grade based on the weighted score
""")

    return demo

# Launch the app
if __name__ == "__main__":
    app = create_gradio_interface()
    app.launch(debug=True)
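
# To run locally (a sketch; assumes these dependencies are installed):
#     pip install gradio pandas numpy ftfy nltk rouge-score bert-score \
#                 deepeval google-generativeai openpyxl
#     python app.py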