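"""Premium Model Palindrome Benchmark (Hugging Face Space).

Runs six premium text-generation models across five languages, asks each to
produce a palindrome, scores the outputs with two grammar-correction models,
and saves the results to a CSV file exposed through a Gradio interface.
"""
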
import os
import re

import gradio as gr
import numpy as np
import pandas as pd
import torch
from transformers import pipeline, set_seed

# Check GPU availability (for debugging)
print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("GPU Name:", torch.cuda.get_device_name(0))
else:
    print("No GPU detected. Running on CPU.")

# Set seed for reproducibility
set_seed(42)

# Define the six premium generation models:
premium_models = [
    "Qwen/Qwen2.5-Omni-7B",
    "Qwen/Qwen2.5-VL-7B-Instruct",
    "deepseek-ai/Janus-Pro-7B",
    "meta-llama/Llama-2-7b-hf",
    "Alibaba-NLP/gte-Qwen2-7B-instruct",
    "HuggingFaceH4/zephyr-7b-beta"
]
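# Note (assumption): several of these checkpoints are not plain text-generation
# models (e.g. Qwen2.5-Omni and Qwen2.5-VL are multimodal, gte-Qwen2 is an
# embedding model), so they may fail to load with the "text-generation"
# pipeline below; any model that fails to load is skipped by run_benchmark_all().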

# Define five languages: English, German, Spanish, French, Portuguese.
languages = {
    "en": "English",
    "de": "German",
    "es": "Spanish",
    "fr": "French",
    "pt": "Portuguese"
}

# Define two cost-effective grammar evaluation models:
grammar_model_names = [
    "vennify/t5-base-grammar-correction",
    "hassaanik/grammar-correction-model"
]

# Determine device: use GPU (0) if available, otherwise CPU (-1).
device = 0 if torch.cuda.is_available() else -1

# Function to load generation pipelines with the appropriate device setting.
def load_generation_pipeline(model_name):
    try:
        return pipeline("text-generation", model=model_name, device=device)
    except Exception as e:
        print(f"Error loading generation model {model_name}: {e}")
        return None

# Function to load grammar evaluation pipelines with the appropriate device setting.
def load_grammar_pipeline(model_name):
    try:
        return pipeline("text2text-generation", model=model_name, device=device)
    except Exception as e:
        print(f"Error loading grammar model {model_name}: {e}")
        return None

# Pre-load grammar evaluators.
rater_models = []
for model_name in grammar_model_names:
    p = load_grammar_pipeline(model_name)
    if p is not None:
        rater_models.append(p)

def clean_text(text):
    # Lowercase and keep only ASCII letters and digits; note that accented
    # characters (common in the non-English languages above) are stripped
    # rather than normalized.
    return re.sub(r'[^a-zA-Z0-9]', '', text.lower())

def is_palindrome(text):
    cleaned = clean_text(text)
    return cleaned == cleaned[::-1]
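# For example, is_palindrome("A man, a plan, a canal: Panama") is True because
# clean_text() reduces it to "amanaplanacanalpanama".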

# Updated prompt instructs the model to output only the palindrome.
def build_prompt(lang):
    return (
        f"Instruction: Generate a single original palindrome in {lang}.\n"
        "Output only the palindrome. The palindrome should be a continuous text that reads the same forward and backward.\n"
        "Do not output any additional text or commentary.\n"
        "Palindrome: "
    )

def grammar_prompt(pal, lang):
    return (
        f"Rate from 0 to 100 how grammatically correct this palindrome is in {lang}. "
        "Return only a number with no explanation.\n\n"
        f'"{pal}"\n'
    )

def extract_score(text):
    match = re.search(r"\d{1,3}", text)
    if match:
        score = int(match.group())
        return min(max(score, 0), 100)
    return 0
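# For example, extract_score("Score: 85 out of 100") returns 85 (the first
# 1-3 digit number found); values above 100 are clamped to 100.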

# Main benchmark function that runs the tests and saves CSV results.
def run_benchmark_all():
    results = []
    for model_name in premium_models:
        gen_pipeline = load_generation_pipeline(model_name)
        if gen_pipeline is None:
            continue
        for code, lang in languages.items():
            prompt = build_prompt(lang)
            try:
                # return_full_text=False keeps only the newly generated text,
                # so the prompt itself is not counted as part of the palindrome.
                gen_output = gen_pipeline(
                    prompt, max_new_tokens=100, do_sample=True, return_full_text=False
                )[0]['generated_text'].strip()
            except Exception as e:
                gen_output = f"Error generating text: {e}"
            valid = is_palindrome(gen_output)
            cleaned_len = len(clean_text(gen_output))
            scores = []
            for rater in rater_models:
                rprompt = grammar_prompt(gen_output, lang)
                try:
                    rtext = rater(rprompt, max_new_tokens=10)[0]['generated_text']
                    score = extract_score(rtext)
                    scores.append(score)
                except Exception:
                    scores.append(0)
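            # Final score = cleaned length x (average grammar score / 100),
            # halved when the output is not a valid palindrome.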
            avg_score = np.mean(scores) if scores else 0
            penalty = (avg_score / 100) if valid else (avg_score / 100) * 0.5
            final_score = round(cleaned_len * penalty, 2)
            results.append({
                "Model": model_name,
                "Language": lang,
                "Palindrome": gen_output,
                "Valid": "✅" if valid else "❌",
                "Length": cleaned_len,
                "Grammar Score": avg_score,
                "Final Score": final_score
            })
    df = pd.DataFrame(results).sort_values(by="Final Score", ascending=False).reset_index(drop=True)
    csv_path = "benchmark_results.csv"
    df.to_csv(csv_path, index=False)
    print(f"CSV saved to {os.path.abspath(csv_path)}")
    return gr.Dataframe(df), csv_path

# Build the Gradio UI using Blocks for a canvas layout.
with gr.Blocks(title="Premium Model Palindrome Benchmark") as demo:
    gr.Markdown("# Premium Model Palindrome Benchmark")
    gr.Markdown(
        "This benchmark runs automatically over 6 premium text-generation models across 5 languages and saves the results "
        "to a CSV file upon completion."
    )
    with gr.Row():
        run_button = gr.Button("Run All Benchmarks")
    output_table = gr.Dataframe(label="Benchmark Results")
    output_file = gr.File(label="Download CSV Results")
    run_button.click(fn=run_benchmark_all, inputs=[], outputs=[output_table, output_file])

demo.launch()