import gradio as gr
import yaml
import json
from typing import Dict, List, Tuple
from datetime import datetime


class AIEvaluationForm:
    def __init__(self, template_file: str = "questions.yaml"):
        """Initialize the evaluation form with questions from a YAML file."""
        self.template_file = template_file
        self.template = self.load_template()
        self.components = {}

    def load_template(self) -> Dict:
        """Load the evaluation template from the YAML file."""
        try:
            with open(self.template_file, 'r', encoding='utf-8') as f:
                return yaml.safe_load(f)
        except FileNotFoundError:
            raise FileNotFoundError(
                f"Template file '{self.template_file}' not found. Please ensure the file exists."
            )
        except yaml.YAMLError as e:
            raise ValueError(f"Error parsing YAML file: {e}")

    def create_system_info_section(self) -> Tuple[List, Dict]:
        """Create the system information section."""
        components = {}
        with gr.Group():
            gr.Markdown("## 📋 AI System Information")
            gr.Markdown("*Please provide basic information about the AI system being evaluated.*")
            components['name'] = gr.Textbox(
                label="AI System Name",
                placeholder="e.g., GPT-4, BERT, StarCoder2",
                info="The official name of your AI system"
            )
            components['provider'] = gr.Textbox(
                label="Provider/Organization",
                placeholder="e.g., OpenAI, Google, BigCode",
                info="The organization that developed the system"
            )
            components['model_tag'] = gr.Textbox(
                label="Model Tag/Version",
                placeholder="e.g., gpt-4.1-2025-04-14 for GPT-4.1",
                info="Model tag or version set by the provider"
            )
            components['knowledge_cutoff_date'] = gr.Textbox(
                label="Knowledge Cutoff Date",
                placeholder="MM-DD-YYYY",
                info="Model training data cutoff date"
            )
            components['url'] = gr.Textbox(
                label="System URL",
                placeholder="e.g., https://huggingface.co/model-name",
                info="URL to the model, paper, or documentation"
            )
            components['type'] = gr.Dropdown(
                choices=[
                    "Generative Model",
                    "Discriminative Model/Classifier",
                    "Regressor",
                    "(Reinforcement Learning) Agent",
                    "Other"
                ],
                label="System Type",
                value="Generative Model",
                info="Primary category of the AI system"
            )
            components['model_type'] = gr.Radio(
                ["Foundational Model", "Fine-tuned Model", "Doesn't apply"],
                label="Model Type",
                info="Primary category of the model"
            )
            components['input_modalities'] = gr.CheckboxGroup(
                choices=["Text", "Image", "Audio", "Video", "Tabular"],
                label="Input Modalities (select all that apply)",
                value=["Text"],
                info="Input modalities supported by the system"
            )
            components['output_modalities'] = gr.CheckboxGroup(
                choices=["Text", "Image", "Audio", "Video", "Tabular"],
                label="Output Modalities (select all that apply)",
                value=["Text"],
                info="Output modalities supported by the system"
            )
        return list(components.values()), components
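
    # NOTE: the order of list(components.values()) above is the order in which
    # Gradio passes these values to generate_scorecard, so the nine fields here
    # must stay in sync with the unpacking there.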

    def create_evaluation_sections(self) -> Tuple[List, Dict]:
        """Create dynamic evaluation sections from the template."""
        all_components = []
        section_components = {}
        for section_name, section_data in self.template.items():
            with gr.Group():
                gr.Markdown(f"## {section_name}")
                section_components[section_name] = {}
                for subsection_name, subsection_data in section_data.items():
                    with gr.Accordion(subsection_name, open=False):
                        # Explainer text
                        gr.Markdown(f"**Explainer:** {subsection_data['explainer']}")
                        # Overall status
                        status_component = gr.Radio(
                            choices=["Yes", "No", "N/A"],
                            label="Overall Status",
                            value="N/A",
                            info="Does this subsection apply to your system, and have you conducted these evaluations?"
                        )
                        # Sources/Evidence
                        sources_component = gr.Textbox(
                            label="Sources & Evidence",
                            placeholder="Enter sources, papers, benchmarks, or evidence (one per line)\nExample:\nhttps://arxiv.org/abs/2402.19173\nBOLD Bias Benchmark\nInternal evaluation report",
                            lines=4,
                            info="Provide references to evaluations, papers, benchmarks, or internal reports"
                        )
                        # Individual questions
                        gr.Markdown("**Detailed Questions:**")
                        question_components = {}
                        # IMPORTANT: add components in the order generate_scorecard
                        # expects them - status, sources, then questions
                        all_components.extend([status_component, sources_component])
                        for question in subsection_data['questions']:
                            question_component = gr.Checkbox(
                                label=question,
                                value=False,
                                # info="Check if this evaluation has been performed"
                            )
                            question_components[question] = question_component
                            all_components.append(question_component)
                        section_components[section_name][subsection_name] = {
                            'status': status_component,
                            'sources': sources_component,
                            'questions': question_components
                        }
        return all_components, section_components
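
    # The flat list returned above is laid out per subsection as
    #   [status_A, sources_A, qA1, ..., qAn, status_B, sources_B, qB1, ...]
    # which is exactly the order generate_scorecard walks through below.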

    def parse_sources(self, sources_text: str) -> List[Dict]:
        """Parse the sources textbox into a structured list."""
        sources = []
        # Handle the case where sources_text might not be a string
        if not isinstance(sources_text, str):
            return sources
        if not sources_text.strip():
            return sources
        for line in sources_text.strip().split('\n'):
            line = line.strip()
            if not line:
                continue
            # Determine the source type based on content
            if line.startswith('http'):
                source_type = "🔗"
                name = line.split('/')[-1] if '/' in line else line
            elif 'internal' in line.lower() or 'proprietary' in line.lower():
                source_type = "🏢"
                name = line
            else:
                source_type = "📄"
                name = line
            sources.append({
                "type": source_type,
                "detail": line,
                "name": name
            })
        return sources
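
    # For example (illustrative input, assuming the emoji type markers above):
    #   parse_sources("https://arxiv.org/abs/2402.19173\nInternal evaluation report")
    # returns
    #   [{"type": "🔗", "detail": "https://arxiv.org/abs/2402.19173", "name": "2402.19173"},
    #    {"type": "🏢", "detail": "Internal evaluation report", "name": "Internal evaluation report"}]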

    def generate_scorecard(self, *args) -> Tuple[Dict, str]:
        """Generate the scorecard JSON from the form inputs."""
        # Debug: print argument types and counts
        print(f"Total arguments received: {len(args)}")
        for i, arg in enumerate(args[:10]):  # Print the first 10 for debugging
            print(f"Arg {i}: {type(arg)} = {arg}")
        # Extract the system info (the first num_args arguments). This must
        # match the component order in create_system_info_section.
        num_args = 9
        (name, provider, model_tag, knowledge_cutoff_date, url,
         sys_type, model_type, inp_modalities, out_modalities) = args[:num_args]
        remaining_args = list(args[num_args:])
        # Build metadata
        metadata = {
            "Name": name or "Unknown",
            "Provider": provider or "Unknown",
            "Model Tag": model_tag or "",
            "Knowledge Cutoff Date": knowledge_cutoff_date or "",
            "URL": url or "",
            "Type": sys_type or "Unknown",
            "Model Type": model_type or "",
            "Input Modalities": inp_modalities or [],
            "Output Modalities": out_modalities or []
        }
        # Build scores
        scores = {}
        arg_index = 0
        for section_name, section_data in self.template.items():
            scores[section_name] = {}
            for subsection_name, subsection_data in section_data.items():
                # Get the status and sources (the next two arguments)
                if arg_index < len(remaining_args):
                    status = remaining_args[arg_index]
                    print(f"Status for {section_name}/{subsection_name}: {type(status)} = {status}")
                else:
                    status = "N/A"
                if arg_index + 1 < len(remaining_args):
                    sources_text = remaining_args[arg_index + 1]
                    print(f"Sources for {section_name}/{subsection_name}: {type(sources_text)} = {sources_text}")
                else:
                    sources_text = ""
                # Ensure sources_text is a string
                if not isinstance(sources_text, str):
                    sources_text = str(sources_text) if sources_text is not None else ""
                # Parse sources
                sources = self.parse_sources(sources_text)
                # Get the question responses
                questions_dict = {}
                question_start_index = arg_index + 2
                num_questions = len(subsection_data['questions'])
                for i, question in enumerate(subsection_data['questions']):
                    q_index = question_start_index + i
                    if q_index < len(remaining_args):
                        questions_dict[question] = remaining_args[q_index]
                    else:
                        questions_dict[question] = False
                # Store the subsection data
                scores[section_name][subsection_name] = {
                    "status": status,
                    "sources": sources,
                    "questions": questions_dict
                }
                # Move to the next subsection (2 for status/sources + the number of questions)
                arg_index += 2 + num_questions
        # Create the final scorecard
        scorecard = {
            "metadata": metadata,
            "scores": scores
        }
        # Generate a filename
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        safe_name = (name or "ai_system").replace(' ', '_').lower()
        filename = f"{safe_name}_scorecard_{timestamp}.json"
        return scorecard, filename
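
    # The returned scorecard has this rough shape (a sketch, with placeholder
    # section and subsection names):
    #   {
    #     "metadata": {"Name": "...", "Provider": "...", ...},
    #     "scores": {
    #       "<section>": {
    #         "<subsection>": {
    #           "status": "Yes" | "No" | "N/A",
    #           "sources": [{"type": "...", "detail": "...", "name": "..."}],
    #           "questions": {"<question>": true | false}
    #         }
    #       }
    #     }
    #   }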

    def create_interface(self):
        """Create the complete Gradio interface."""
        with gr.Blocks(
            title="AI System Evaluation Scorecard",
            # theme=gr.themes.Soft(),
            css="""
            .gradio-container {
                max-width: 1400px !important;
                margin: 0 auto !important;
                padding: 20px !important;
                width: 95% !important;
            }
            .main {
                max-width: 1400px !important;
                margin: 0 auto !important;
                width: 100% !important;
            }
            .container {
                max-width: 1400px !important;
                margin: 0 auto !important;
                width: 100% !important;
            }
            .accordion-header {
                background-color: #f0f0f0 !important;
            }
            .block {
                width: 100% !important;
            }
            /* Ensure form elements use the full width */
            .form {
                width: 100% !important;
            }
            /* Center the entire app */
            #root {
                display: flex !important;
                justify-content: center !important;
                width: 100% !important;
            }
            """
        ) as demo:
            # Header
            gr.Markdown("""
            # 📋 AI System Evaluation Scorecard

            This comprehensive evaluation form helps you assess AI systems across multiple dimensions, including bias,
            cultural sensitivity, environmental impact, privacy, and more. Complete the sections relevant to your system
            to generate a detailed scorecard.

            ---
            """)
            # System information section
            system_inputs, system_components = self.create_system_info_section()
            # Evaluation sections
            eval_inputs, eval_components = self.create_evaluation_sections()
            self.components = {**system_components, **eval_components}
            # Generate button
            with gr.Group():
                gr.Markdown("## 📊 Generate Scorecard")
                with gr.Row():
                    generate_btn = gr.Button(
                        "📊 Generate Evaluation Scorecard",
                        variant="primary",
                        size="lg",
                        scale=2
                    )
                    clear_btn = gr.Button(
                        "🗑️ Clear Form",
                        variant="secondary",
                        scale=1
                    )

            # Outputs
            with gr.Group():
                gr.Markdown("### 📄 Generated Scorecard")
                with gr.Row():
                    json_output = gr.JSON(
                        label="Scorecard JSON",
                        show_label=True
                    )
                with gr.Row():
                    download_file = gr.File(
                        label="⬇️ Download Scorecard",
                        visible=False,
                    )

            # Event handlers
            all_inputs = system_inputs + eval_inputs

            def generate_with_progress(*args, progress=gr.Progress()):
                """Generate the scorecard with progress indication (Gradio
                injects the tracker via the progress default argument)."""
                progress(0.3, desc="Processing inputs...")
                scorecard, filename = self.generate_scorecard(*args)
                progress(0.7, desc="Generating JSON...")
                json_content = json.dumps(scorecard, indent=2)
                progress(1.0, desc="Complete!")
                # Save the JSON to disk so it can be offered for download
                with open(filename, 'w') as f:
                    f.write(json_content)
                return (
                    scorecard,  # JSON display
                    gr.File(value=filename, visible=True),  # File for download
                )

            def clear_form():
                """Clear all form inputs."""
                return [None] * len(all_inputs)

            # Wire up the events
            generate_btn.click(
                fn=generate_with_progress,
                inputs=all_inputs,
                outputs=[json_output, download_file],
                show_progress="full"
            )
            clear_btn.click(
                fn=clear_form,
                outputs=all_inputs
            )

            # Example data button
            with gr.Group():
                gr.Markdown("## 🚀 Quick Start")
                example_btn = gr.Button("📝 Load Example Data", variant="secondary")

            def load_example():
                """Load example data for a StarCoder2-like system.

                Must supply one value per system-info component, in order.
                """
                example_data = [
                    "StarCoder2",                                     # name
                    "BigCode",                                        # provider
                    "starcoder2-15b",                                 # model tag
                    "",                                               # knowledge cutoff date
                    "https://huggingface.co/bigcode/starcoder2-15b",  # url
                    "Generative Model",                               # type
                    "Foundational Model",                             # model type
                    ["Text"],                                         # input modalities
                    ["Text"],                                         # output modalities
                ]
                # Add default values for the evaluation sections (all N/A initially)
                remaining_defaults = []
                for section_name, section_data in self.template.items():
                    for subsection_name, subsection_data in section_data.items():
                        remaining_defaults.extend([
                            "N/A",  # status
                            "",     # sources
                            *([False] * len(subsection_data['questions']))  # questions
                        ])
                return example_data + remaining_defaults

            example_btn.click(
                fn=load_example,
                outputs=all_inputs
            )

            with gr.Group():
                # TODO: needs implementation
                gr.Markdown("## 📤 Upload & Test Your Eval Card JSON")
                uploaded_file = gr.File(label="Upload JSON File", file_types=[".json"])
                uploaded_preview = gr.JSON(label="Preview of Uploaded Content")
                uploaded_file.change(fn=load_uploaded_json, inputs=uploaded_file, outputs=uploaded_preview)

            with gr.Group():
                # TODO: needs implementation
                gr.Markdown("## 📬 Submit Your Scorecard to the Eval Cards Repository")
                submission_file = gr.File(label="Upload JSON File", file_types=[".json"])
                submission_file.change(fn=load_uploaded_json, inputs=submission_file, outputs=uploaded_preview)
                gr.Markdown("""
                Once downloaded, you can contribute by submitting a pull request to
                [Eval Cards Space](https://huggingface.co/spaces/evaleval/Eval_Cards).
                Place your file in the `submissions/` directory.
                """)

        return demo


def load_uploaded_json(file):
    """Load and return the contents of an uploaded JSON file."""
    if file is None:
        return {}
    try:
        with open(file.name, 'r') as f:
            return json.load(f)
    except Exception as e:
        return {"error": str(e)}


def main():
    """Main function to run the application."""
    try:
        # Create the evaluation form
        eval_form = AIEvaluationForm("questions.yaml")
        # Create and launch the interface
        demo = eval_form.create_interface()
        print("🚀 Launching AI Evaluation Scorecard...")
        print(f"📋 Loading questions from: {eval_form.template_file}")
        print(f"📊 Found {len(eval_form.template)} evaluation categories")
        # Count the total number of questions
        total_questions = sum(
            len(subsection['questions'])
            for section in eval_form.template.values()
            for subsection in section.values()
        )
        print(f"✅ Total evaluation questions: {total_questions}")
        demo.launch(
            ssr_mode=False,
            share=False,
            inbrowser=False,
            show_error=True,
            quiet=False
        )
    except FileNotFoundError as e:
        print(f"❌ Error: {e}")
        print("Please ensure 'questions.yaml' exists in the current directory.")
    except Exception as e:
        print(f"❌ Unexpected error: {e}")


if __name__ == "__main__":
    main()
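
# To run locally (assuming this file is saved as app.py next to questions.yaml):
#   pip install gradio pyyaml
#   python app.py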