import gradio as gr
import pandas as pd
import numpy as np
import re
import unicodedata
import ftfy
import nltk
import os
import json
import time
from typing import Dict, Any, List, Tuple, Optional
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer
from bert_score import score as bert_score
from nltk.translate.meteor_score import meteor_score
import google.generativeai as genai
from groq import Groq
from dotenv import load_dotenv

# Download the NLTK resources used by the evaluation metrics
nltk.download('punkt', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('omw-1.4', quiet=True)  # often required alongside WordNet on newer NLTK versions

# Load environment variables
load_dotenv()

# Initialize API clients (with graceful fallback if keys missing)
try:
    GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
    if GEMINI_API_KEY:
        genai.configure(api_key=GEMINI_API_KEY)
    else:
        print("Warning: GEMINI_API_KEY not found in environment variables")
except Exception as e:
    print(f"Error configuring Gemini: {str(e)}")
    GEMINI_API_KEY = None

try:
    GROQ_API_KEY = os.getenv("GROQ_API_KEY")
    if GROQ_API_KEY:
        groq_client = Groq(api_key=GROQ_API_KEY)
    else:
        print("Warning: GROQ_API_KEY not found in environment variables")
        groq_client = None
except Exception as e:
    print(f"Error configuring Groq: {str(e)}")
    groq_client = None
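
# Expected .env contents (illustrative; the values are placeholders):
#   GEMINI_API_KEY=your-gemini-key
#   GROQ_API_KEY=your-groq-key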

# Text cleaning function
def clean_text(text: str) -> str:
    """Clean text by fixing encoding issues and standardizing format"""
    if not isinstance(text, str) or not text.strip():
        return ""
    
    text = ftfy.fix_text(text)  # Fixes encoding artifacts
    text = unicodedata.normalize('NFKD', text)
    # Replace common mojibake forms of smart quotes, dashes and bullets
    # (the bare 'â€' prefix comes last so it cannot clobber the longer sequences)
    replacements = {
        'â€œ': '"', 'â€“': '-', 'â€”': '--',
        'â€¢': '*', 'â€¦': '...', 'â€': '"', 'Â': ''
    }
    for old, new in replacements.items():
        text = text.replace(old, new)
    # Remove non-ASCII characters
    text = re.sub(r'[^\x00-\x7F]+', '', text)
    # Normalize whitespace
    return ' '.join(text.split())
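
# Illustrative use of clean_text (comment only, nothing executed at import time):
#   clean_text("  multiple   spaces\nand\ttabs  ")  ->  "multiple spaces and tabs"
# ftfy plus the replacement table above also repair common mojibake before the
# non-ASCII filter strips anything that remains.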

# LLM Provider classes
class LLMProvider:
    def __init__(self, model_name: str):
        self.model_name = model_name
    
    def generate(self, prompt: str) -> str:
        raise NotImplementedError
    
    def get_model_name(self) -> str:
        return self.model_name

class GeminiProvider(LLMProvider):
    def __init__(self, model_name: str = "gemini-1.5-flash-latest"):
        super().__init__(model_name)
        self.available = bool(GEMINI_API_KEY)
        if self.available:
            try:
                self.model = genai.GenerativeModel(model_name)
            except Exception as e:
                print(f"Error initializing Gemini model: {str(e)}")
                self.available = False
    
    def generate(self, prompt: str) -> str:
        if not self.available:
            return "Error: Gemini API not configured properly. Check your API key."
        
        try:
            response = self.model.generate_content(prompt)
            return response.text
        except Exception as e:
            return f"Error generating with Gemini: {str(e)}"

class GroqProvider(LLMProvider):
    def __init__(self, model_name: str = "llama3-70b-8192"):
        super().__init__(model_name)
        self.available = bool(groq_client)
    
    def generate(self, prompt: str) -> str:
        if not self.available:
            return "Error: Groq API not configured properly. Check your API key."
        
        try:
            chat_completion = groq_client.chat.completions.create(
                messages=[
                    {"role": "user", "content": prompt}
                ],
                model=self.model_name,
                temperature=0.3
            )
            return chat_completion.choices[0].message.content
        except Exception as e:
            return f"Error generating with Groq: {str(e)}"

# Prompt templates
PROMPT_TEMPLATES = {
    "Strategic Narrative Architect": """Role: Strategic Narrative Architect
You are a professional content writer with expertise in creating engaging, well-structured narratives.
Your task is to rewrite the following text in a professional, engaging style while preserving all key facts and information:
{text}
Instructions:
1. Maintain all factual information and key details
2. Improve structure and flow for better readability
3. Enhance engagement through appropriate storytelling techniques
4. Use professional language appropriate for the content domain
5. Ensure the output is concise yet comprehensive
6. Begin directly with the content - do NOT include introductory phrases like "Here's a rewritten version" or "Rewritten content"
7. Write as if this is the final published version, not as a response to a rewrite request

Output:""",
    
    "Precision Storyteller": """Role: Precision Storyteller
You are a professional editor focused on accuracy, clarity, and precision.
Your task is to rewrite the following text with maximum factual accuracy while improving clarity:
{text}
Instructions:
1. Preserve all factual information with absolute precision
2. Correct any grammatical errors or awkward phrasing
3. Ensure logical flow and coherence
4. Use clear, concise language without unnecessary embellishment
5. Maintain professional tone appropriate for the content domain
6. Begin directly with the content - do NOT include introductory phrases like "Here's a rewritten version" or "Rewritten content"
7. Write as if this is the final published version, not as a response to a rewrite request

Output:"""
}
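
# The "{text}" placeholder is filled by a literal str.replace inside evaluate_text(), e.g.
#   PROMPT_TEMPLATES["Precision Storyteller"].replace("{text}", raw_input)
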
# Metric normalization ranges
NORMALIZATION_RANGES = {
    "AnswerRelevancy": (0.0, 1.0),
    "Faithfulness": (0.0, 1.0),
    "GEval": (0.0, 1.0),
    "BERTScore": (0.7, 0.95),
    "ROUGE": (0.0, 0.6),
    "BLEU": (0.0, 0.4),
    "METEOR": (0.0, 0.6)
}
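
# The lexical and embedding metrics rarely span the full [0, 1] interval in practice
# (e.g. BERTScore for English text seldom drops below ~0.7), so these narrower ranges
# spread the normalized scores over a more useful scale.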

# Metric weights
METRIC_WEIGHTS = {
    "AnswerRelevancy": 0.10,
    "Faithfulness": 0.10,
    "GEval": 0.025,
    "BERTScore": 0.20,
    "ROUGE": 0.15,
    "BLEU": 0.025,
    "METEOR": 0.15
}
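
# Note: these weights sum to 0.75 rather than 1.0; calculate_weighted_score() divides by
# the total weight of the metrics actually present, so the final score still lands in [0, 1].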

def normalize_score(metric: str, value: float) -> float:
    """Normalize score to 0-1 scale based on metric's natural range"""
    if metric not in NORMALIZATION_RANGES or not isinstance(value, (int, float)):
        return value
    
    min_val, max_val = NORMALIZATION_RANGES[metric]
    # Handle edge cases
    if max_val <= min_val:
        return 0.5  # Default middle value if range is invalid
    
    # Normalize and clamp to [0,1]
    normalized = (value - min_val) / (max_val - min_val)
    return max(0.0, min(normalized, 1.0))
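
# Worked example (assumed values, for illustration only):
#   normalize_score("BERTScore", 0.85) -> (0.85 - 0.7) / (0.95 - 0.7) = 0.6
#   normalize_score("BLEU", 0.5)       -> 1.25 before clamping, returned as 1.0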

def calculate_weighted_score(scores: Dict[str, float]) -> float:
    """Calculate weighted average of normalized scores"""
    normalized_scores = {m: normalize_score(m, v) for m, v in scores.items()}
    total_weight = 0
    weighted_sum = 0
    
    for metric, weight in METRIC_WEIGHTS.items():
        if metric in normalized_scores:
            weighted_sum += normalized_scores[metric] * weight
            total_weight += weight
    
    return weighted_sum / total_weight if total_weight > 0 else 0
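
# Worked example (illustrative): with only BERTScore=0.85 and ROUGE=0.3 supplied, the
# normalized values are 0.6 and 0.5 and the weights 0.20 and 0.15, so the result is
#   (0.6 * 0.20 + 0.5 * 0.15) / (0.20 + 0.15) = 0.195 / 0.35 ≈ 0.557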

def evaluate_text(raw_input: str, model_provider: LLMProvider, prompt_template: str) -> Dict[str, Any]:
    """Evaluate a single text using the selected model and prompt"""
    # Create clean reference text
    reference_text = clean_text(raw_input)
    
    # Generate candidate using the selected model and prompt
    prompt = prompt_template.replace("{text}", raw_input)
    candidate = model_provider.generate(prompt)
    
    # Clean candidate output for consistent evaluation
    cleaned_candidate = clean_text(candidate)
    
    # Initialize evaluation metrics
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    
    # Calculate traditional metrics
    results = {}
    
    # BLEU Score
    try:
        smooth = SmoothingFunction().method4
        bleu = sentence_bleu(
            [reference_text.split()], 
            cleaned_candidate.split(), 
            smoothing_function=smooth
        )
        results["BLEU"] = bleu
    except Exception as e:
        print(f"BLEU error: {str(e)}")
        results["BLEU"] = 0.0
    
    # ROUGE Score
    try:
        rouge_scores = scorer.score(reference_text, cleaned_candidate)
        rouge = (rouge_scores['rouge1'].fmeasure + 
                 rouge_scores['rouge2'].fmeasure + 
                 rouge_scores['rougeL'].fmeasure) / 3
        results["ROUGE"] = rouge
    except Exception as e:
        print(f"ROUGE error: {str(e)}")
        results["ROUGE"] = 0.0
    
    # METEOR Score
    try:
        meteor = meteor_score(
            [reference_text.split()], 
            cleaned_candidate.split()
        )
        results["METEOR"] = meteor
    except Exception as e:
        print(f"METEOR error: {str(e)}")
        results["METEOR"] = 0.0
    
    # BERTScore
    try:
        P, R, F1 = bert_score(
            [cleaned_candidate], 
            [reference_text], 
            lang="en", 
            verbose=False
        )
        results["BERTScore"] = F1.item()
    except Exception as e:
        print(f"BERTScore error: {str(e)}")
        results["BERTScore"] = 0.7  # Default low value
    
    # LLM-as-judge metrics: a lightweight implementation that prompts Gemini directly instead of relying on DeepEval
    try:
        # Use Gemini as judge if available
        if GEMINI_API_KEY:
            judge_model = GeminiProvider("gemini-1.5-flash-latest")
            
            # Answer Relevancy
            relevancy_prompt = f"""
            On a scale of 0.0 to 1.0, how relevant is the following candidate text to the input?
            
            Input: {raw_input[:500]}{'...' if len(raw_input) > 500 else ''}
            Candidate: {cleaned_candidate[:500]}{'...' if len(cleaned_candidate) > 500 else ''}
            
            Provide only a single number between 0.0 and 1.0 with no explanation.
            """
            relevancy_response = judge_model.generate(relevancy_prompt)
            try:
                relevancy_score = float(relevancy_response.strip())
                results["AnswerRelevancy"] = max(0.0, min(1.0, relevancy_score))
            except (TypeError, ValueError):
                results["AnswerRelevancy"] = 0.5
            
            # Faithfulness
            faithfulness_prompt = f"""
            On a scale of 0.0 to 1.0, how faithful is the candidate text to the original input in terms of factual accuracy?
            
            Input: {raw_input[:500]}{'...' if len(raw_input) > 500 else ''}
            Candidate: {cleaned_candidate[:500]}{'...' if len(cleaned_candidate) > 500 else ''}
            
            Provide only a single number between 0.0 and 1.0 with no explanation.
            """
            faithfulness_response = judge_model.generate(faithfulness_prompt)
            try:
                faithfulness_score = float(faithfulness_response.strip())
                results["Faithfulness"] = max(0.0, min(1.0, faithfulness_score))
            except (TypeError, ValueError):
                results["Faithfulness"] = 0.5
            
            # GEval
            geval_prompt = f"""
            On a scale of 0.0 to 1.0, evaluate the overall quality of the candidate text.
            Consider accuracy, completeness, fluency, and professionalism.
            
            Input: {raw_input[:500]}{'...' if len(raw_input) > 500 else ''}
            Candidate: {cleaned_candidate[:500]}{'...' if len(cleaned_candidate) > 500 else ''}
            
            Provide only a single number between 0.0 and 1.0 with no explanation.
            """
            geval_response = judge_model.generate(geval_prompt)
            try:
                geval_score = float(geval_response.strip())
                results["GEval"] = max(0.0, min(1.0, geval_score))
            except (TypeError, ValueError):
                results["GEval"] = 0.5
        else:
            # Default values if no judge model available
            results["AnswerRelevancy"] = 0.5
            results["Faithfulness"] = 0.5
            results["GEval"] = 0.5
    except Exception as e:
        print(f"LLM-as-judge error: {str(e)}")
        # Default values if the judge model fails
        results["AnswerRelevancy"] = 0.5
        results["Faithfulness"] = 0.5
        results["GEval"] = 0.5
    
    # Calculate normalized and weighted scores
    normalized_scores = {m: normalize_score(m, v) for m, v in results.items()}
    weighted_score = calculate_weighted_score(results)
    
    # Determine interpretation
    if weighted_score >= 0.85:
        interpretation = "Outstanding performance (A) - ready for professional use"
    elif weighted_score >= 0.70:
        interpretation = "Strong performance (B) - good quality with minor improvements"
    elif weighted_score >= 0.50:
        interpretation = "Adequate performance (C) - usable but needs refinement"
    elif weighted_score >= 0.30:
        interpretation = "Weak performance (D) - requires significant revision"
    else:
        interpretation = "Poor performance (F) - likely needs complete rewriting"
    
    return {
        "candidate": cleaned_candidate,
        "metrics": results,
        "normalized": normalized_scores,
        "weighted_score": weighted_score,
        "interpretation": interpretation
    }
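
# Example call (hypothetical; requires a configured provider and network access):
#   result = evaluate_text(raw_text, GeminiProvider(), PROMPT_TEMPLATES["Precision Storyteller"])
#   result["weighted_score"]   # float in [0, 1]
#   result["interpretation"]   # letter-grade style summary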

def process_input(input_text: str, file_upload, model_choice: str, prompt_choice: str, progress=gr.Progress()) -> Tuple[str, List[List[str]], str]:
    """Process either input text or uploaded file with progress tracking"""
    if input_text and file_upload:
        return "Please use either text input or file upload, not both.", [], ""
    
    if not input_text and not file_upload:
        return "Please provide input text or upload a file.", [], ""
    
    # Determine model provider
    if model_choice == "Gemini":
        model_provider = GeminiProvider("gemini-1.5-flash-latest")
    elif model_choice == "Llama-3-70b":
        model_provider = GroqProvider("llama3-70b-8192")
    else:  # Llama-3-8b
        model_provider = GroqProvider("llama3-8b-8192")
    
    # Check if model is available
    if not model_provider.available:
        return f"Error: {model_choice} is not properly configured. Check your API key.", [], ""
    
    # Get prompt template
    prompt_template = PROMPT_TEMPLATES[prompt_choice]
    
    # Process single text input
    if input_text:
        progress(0.1, desc="Starting evaluation...")
        time.sleep(0.2)
        
        progress(0.3, desc="Generating rewritten content...")
        time.sleep(0.2)
        
        progress(0.6, desc="Calculating metrics...")
        result = evaluate_text(input_text, model_provider, prompt_template)
        
        progress(0.9, desc="Finalizing results...")
        time.sleep(0.2)
        
        # Format metrics for display
        metrics_table = [
            ["Metric", "Raw Score", "Normalized"],
            ["AnswerRelevancy", f"{result['metrics']['AnswerRelevancy']:.4f}", f"{result['normalized']['AnswerRelevancy']:.4f}"],
            ["Faithfulness", f"{result['metrics']['Faithfulness']:.4f}", f"{result['normalized']['Faithfulness']:.4f}"],
            ["GEval", f"{result['metrics']['GEval']:.4f}", f"{result['normalized']['GEval']:.4f}"],
            ["BERTScore", f"{result['metrics']['BERTScore']:.4f}", f"{result['normalized']['BERTScore']:.4f}"],
            ["ROUGE", f"{result['metrics']['ROUGE']:.4f}", f"{result['normalized']['ROUGE']:.4f}"],
            ["BLEU", f"{result['metrics']['BLEU']:.4f}", f"{result['normalized']['BLEU']:.4f}"],
            ["METEOR", f"{result['metrics']['METEOR']:.4f}", f"{result['normalized']['METEOR']:.4f}"],
            ["Weighted Score", f"{result['weighted_score']:.4f}", "N/A"]
        ]
        
        return (
            result["candidate"], 
            metrics_table,
            f"Hybrid Score: {result['weighted_score']:.4f} - {result['interpretation']}"
        )
    
    # Process file upload
    if file_upload:
        progress(0.1, desc="Reading file...")
        time.sleep(0.2)
        
        # Read the file (assuming CSV with one column of text)
        try:
            df = pd.read_csv(file_upload.name)
            progress(0.3, desc="Processing entries...")
            time.sleep(0.2)
        except Exception as e:
            return f"Error reading file: {str(e)}", [], ""
        
        # Assuming the first column contains the text
        text_column = df.columns[0]
        results = []
        detailed_results = []
        
        # Process each entry with progress updates
        for i, row in df.iterrows():
            progress((i + 1) / len(df) * 0.6 + 0.3, desc=f"Processing entry {i+1}/{len(df)}")
            text = str(row[text_column])
            
            try:
                result = evaluate_text(text, model_provider, prompt_template)
                
                # Add to results
                results.append(result["weighted_score"])
                
                # Store detailed results
                detailed_results.append({
                    "input_preview": text[:100] + "..." if len(text) > 100 else text,
                    "weighted_score": result["weighted_score"],
                    "interpretation": result["interpretation"],
                    "candidate": result["candidate"]
                })
            except Exception as e:
                print(f"Error processing entry {i}: {str(e)}")
                results.append(0.0)
                detailed_results.append({
                    "input_preview": text[:100] + "..." if len(text) > 100 else text,
                    "weighted_score": 0.0,
                    "interpretation": "Error processing this entry",
                    "candidate": ""
                })
        
        progress(0.9, desc="Generating summary...")
        time.sleep(0.2)
        
        # Create results dataframe
        results_df = pd.DataFrame(detailed_results)
        
        # Generate summary statistics
        valid_scores = [s for s in results if s > 0]
        if valid_scores:
            avg_score = sum(valid_scores) / len(valid_scores)
            min_score = min(valid_scores)
            max_score = max(valid_scores)
            
            if avg_score >= 0.85:
                summary = "Excellent performance across inputs"
            elif avg_score >= 0.70:
                summary = "Good performance with room for minor improvements"
            elif avg_score >= 0.50:
                summary = "Adequate performance but needs refinement"
            else:
                summary = "Significant improvements needed"
            
            # Format summary
            summary_text = (
                f"Processed {len(results)} entries ({len(valid_scores)} successful)\n"
                f"Average Hybrid Score: {avg_score:.4f}\n"
                f"Range: {min_score:.4f} - {max_score:.4f}\n\n"
                f"{summary}"
            )
            
            # Create metrics table for summary
            metrics_table = [
                ["Metric", "Value"],
                ["Entries Processed", f"{len(results)}"],
                ["Successful Entries", f"{len(valid_scores)}"],
                ["Average Score", f"{avg_score:.4f}"],
                ["Best Score", f"{max_score:.4f}"],
                ["Worst Score", f"{min_score:.4f}"],
                ["Overall Assessment", summary]
            ]
            
            return (
                "Batch processing complete. Use the 'Show Details' button to see individual results.",
                metrics_table,
                summary_text
            )
        else:
            return (
                "No successful evaluations. Check your API configuration and input data.",
                [["Error", "All evaluations failed"]],
                "Error: No successful evaluations. Check your API configuration and input data."
            )

def show_detailed_results(input_text, file_upload, model_choice, prompt_choice, progress=gr.Progress()):
    """Show detailed results for batch processing"""
    if not file_upload:
        return pd.DataFrame([{"Message": "No file uploaded for batch processing."}])
    
    progress(0.1, desc="Reading file...")
    time.sleep(0.1)
    
    # Read the file
    df = pd.read_csv(file_upload.name)
    text_column = df.columns[0]
    
    progress(0.3, desc="Determining model provider...")
    time.sleep(0.1)
    
    # Determine model provider
    if model_choice == "Gemini":
        model_provider = GeminiProvider("gemini-1.5-flash-latest")
    elif model_choice == "Llama-3-70b":
        model_provider = GroqProvider("llama3-70b-8192")
    else:  # Llama-3-8b
        model_provider = GroqProvider("llama3-8b-8192")
    
    progress(0.5, desc="Getting prompt template...")
    time.sleep(0.1)
    
    # Get prompt template
    prompt_template = PROMPT_TEMPLATES[prompt_choice]
    
    progress(0.7, desc="Processing entries...")
    time.sleep(0.1)
    
    # Process each entry
    results = []
    for i, (_, row) in enumerate(df.iterrows()):
        text = str(row[text_column])
        try:
            result = evaluate_text(text, model_provider, prompt_template)
            results.append({
                "Input Preview": text[:100] + "..." if len(text) > 100 else text,
                "Weighted Score": f"{result['weighted_score']:.4f}",
                "Interpretation": result['interpretation'],
                "Candidate Text": result['candidate']
            })
        except Exception as e:
            print(f"Error processing entry {i}: {str(e)}")
            results.append({
                "Input Preview": text[:100] + "..." if len(text) > 100 else text,
                "Weighted Score": "Error",
                "Interpretation": "Processing error",
                "Candidate Text": ""
            })
        progress(0.7 + (i + 1) / len(df) * 0.3, desc=f"Processing entry {i+1}/{len(df)}")
    
    progress(1.0, desc="Completed!")
    return gr.Dataframe(value=pd.DataFrame(results))

# Create Gradio interface
with gr.Blocks(title="LLM Evaluation Framework", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# πŸ“Š LLM Evaluation Framework for Professional Content Rewriting")
    gr.Markdown("Evaluate the quality of LLM-generated content using multiple metrics with proper normalization.")
    
    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("### πŸ“₯ Input Options")
            input_text = gr.Textbox(
                label="Input Text", 
                lines=10, 
                placeholder="Enter text to evaluate...",
                elem_id="input-text"
            )
            gr.Markdown("or")
            file_upload = gr.File(
                label="Upload CSV file (single column of text)",
                file_types=[".csv", ".txt"],
                elem_id="file-upload"
            )
            
            gr.Markdown("### βš™οΈ Configuration")
            model_choice = gr.Radio(
                ["Gemini", "Llama-3-70b", "Llama-3-8b"], 
                label="Select Model", 
                value="Gemini",
                elem_id="model-choice"
            )
            
            prompt_choice = gr.Radio(
                ["Strategic Narrative Architect", "Precision Storyteller"],
                label="Select Prompt Template",
                value="Strategic Narrative Architect",
                elem_id="prompt-choice"
            )
            
            submit_btn = gr.Button("Evaluate", variant="primary", size="lg", elem_id="submit-btn")
        
        with gr.Column(scale=2):
            gr.Markdown("### ✍️ Rewritten Content")
            candidate_output = gr.Textbox(
                label="Rewritten Content", 
                lines=15,
                elem_id="candidate-output"
            )
            
            gr.Markdown("### πŸ“ˆ Evaluation Metrics")
            metrics_output = gr.Dataframe(
                label="Evaluation Metrics",
                interactive=False,
                elem_id="metrics-output"
            )
            
            gr.Markdown("### πŸ“Œ Overall Assessment")
            summary_output = gr.Textbox(
                label="Summary",
                elem_id="summary-output"
            )
            
            detailed_results_btn = gr.Button("Show Detailed Results (Batch)", visible=False)
            detailed_results = gr.Dataframe(visible=False)
    
    # Update visibility of detailed results button
    def update_detailed_results_visibility(file_upload, summary):
        has_file = file_upload is not None
        has_batch_results = "Processed" in summary and "entries" in summary
        return gr.update(visible=has_file and has_batch_results)
    
    # Event handlers
    submit_btn.click(
        fn=process_input,
        inputs=[input_text, file_upload, model_choice, prompt_choice],
        outputs=[candidate_output, metrics_output, summary_output]
    ).then(
        fn=update_detailed_results_visibility,
        inputs=[file_upload, summary_output],
        outputs=detailed_results_btn
    )
    
    detailed_results_btn.click(
        fn=show_detailed_results,
        inputs=[input_text, file_upload, model_choice, prompt_choice],
        outputs=detailed_results
    ).then(
        fn=lambda: gr.update(visible=True),
        outputs=detailed_results
    )
    
    # Add interpretation guide in an accordion
    with gr.Accordion("πŸ“š Interpretation Guide", open=False):
        gr.Markdown("""
        ### Hybrid Score Interpretation
        
        The Hybrid Score combines multiple evaluation metrics into a single score with proper normalization:
        
        - **0.85+**: Outstanding performance (A) - ready for professional use
        - **0.70-0.85**: Strong performance (B) - good quality with minor improvements
        - **0.50-0.70**: Adequate performance (C) - usable but needs refinement
        - **0.30-0.50**: Weak performance (D) - requires significant revision
        - **<0.30**: Poor performance (F) - likely needs complete rewriting
        
        ### Key Metrics Explained
        
        | Metric | What It Measures | Why It Matters |
        |--------|------------------|----------------|
        | **AnswerRelevancy** | Is output on-topic with input? | Does the prompt stay focused despite messy input? |
        | **Faithfulness** | Are ALL facts preserved correctly? | Does it maintain accuracy when input has encoding errors? |
        | **GEval** | Overall quality assessment by another AI | How professional does the output appear? |
        | **BERTScore** | Semantic similarity to reference | How well does it capture the meaning of cleaned text? |
        | **ROUGE** | Content overlap with reference | How much key information is preserved? |
        | **BLEU** | Phrasing precision | How closely does wording match human-quality standard? |
        | **METEOR** | Linguistic quality with synonyms | How natural does the cleaned output read? |
        """)

# Launch the app
if __name__ == "__main__":
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=True
    )
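
# To run locally (assuming this module is saved as app.py and the API keys are set in .env):
#   python app.py
# then open http://localhost:7860 in a browser.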