"""Gradio leaderboard for Armenian-language LLM benchmarks.

Two tabs are shown, one per benchmark:

* "Armenian Unified Exams" -- scores on a 20-point scale, read from
  ``unified_exam_results.csv``.
* "MMLU-Pro-Hy" -- per-subject accuracies, read from
  ``mmlu_pro_hy_results.csv``.

Each tab has a results table and a horizontal bar chart for a column
selected in a dropdown. The CSV files are re-read on every call, so the
displayed data follows the files on disk.
"""

import gradio as gr
import pandas as pd
import plotly.express as px

# Values accepted as ``exam_type`` by the public functions below.
ARMENIAN_EXAMS = "Armenian Exams"
MMLU_PRO_HY = "MMLU-Pro-Hy"

ARMENIAN_RESULTS_CSV = 'unified_exam_results.csv'
MMLU_RESULTS_CSV = 'mmlu_pro_hy_results.csv'

# Subject columns of the MMLU-Pro-Hy CSV that are averaged into 'Average'.
MMLU_SUBJECT_COLS = [
    'Biology', 'Business', 'Chemistry', 'Computer Science', 'Economics',
    'Engineering', 'Health', 'History', 'Law', 'Math', 'Other',
    'Philosophy', 'Physics', 'Psychology',
]

# Columns selectable for plotting in each tab.
ARMENIAN_PLOT_CHOICES = [
    'Average', 'Armenian language and literature', 'Armenian history',
    'Mathematics',
]
# 'Average' first and the catch-all 'Other' last, mirroring the table order.
MMLU_PLOT_CHOICES = (
    ['Average']
    + [col for col in MMLU_SUBJECT_COLS if col != 'Other']
    + ['Other']
)

# Bar colours per exam outcome; "No score" covers rows with a missing value.
TEST_RESULT_COLORS = {
    "Fail": "#ff5f56",
    "Pass": "#ffbd2e",
    "Distinction": "#27c93f",
    "No score": "#9e9e9e",
}

ARMENIAN_EXAMS_INTRO_HTML = """
This benchmark contains results of various Language Models on Armenian Unified Test Exams for Armenian language and literature, Armenian history and mathematics. The scoring system is a 20-point scale, where 0-8 is a Fail, 8-18 is a Pass, and 18-20 is a Distinction.
"""

MMLU_PRO_HY_INTRO_HTML = """
This benchmark contains results of various Language Models on the MMLU-Pro benchmark, translated into Armenian. MMLU-Pro is a massive multi-task test in MCQA format. The scores represent accuracy.
"""


def _unknown_exam_type(exam_type):
    """Build the error raised when ``exam_type`` is not a known benchmark."""
    return ValueError(
        f"Unknown exam type {exam_type!r}; "
        f"expected {ARMENIAN_EXAMS!r} or {MMLU_PRO_HY!r}"
    )


def _load_mmlu_results():
    """Read the MMLU-Pro-Hy CSV and add the per-row mean of the subjects."""
    df = pd.read_csv(MMLU_RESULTS_CSV)
    df['Average'] = df[MMLU_SUBJECT_COLS].mean(axis=1)
    return df


def _test_result_label(score):
    """Map a 20-point exam score to its outcome label.

    Scores below 8 fail, scores from 8 to 18 inclusive pass, and anything
    above 18 is a distinction. A missing score is reported as "No score":
    every comparison with NaN is False, so without this check a missing
    value would fall through to "Distinction".
    """
    if pd.isna(score):
        return "No score"
    if score < 8:
        return "Fail"
    if score <= 18:
        return "Pass"
    return "Distinction"


def _apply_bar_layout(fig, title, x_range_max):
    """Apply the layout shared by both charts and return the figure."""
    fig.update_layout(
        xaxis=dict(range=[0, x_range_max]),
        title=dict(text=title, font=dict(size=16)),
        xaxis_title=dict(font=dict(size=12)),
        yaxis_title=dict(font=dict(size=12)),
        # Reversed so the first (best-scoring) row is drawn at the top.
        yaxis=dict(autorange="reversed"),
        autosize=True,
    )
    return fig


def display_table(exam_type: str) -> pd.DataFrame:
    """Return the results table for a benchmark, best average first.

    Args:
        exam_type: ``"Armenian Exams"`` or ``"MMLU-Pro-Hy"``.

    Returns:
        The benchmark's results sorted by 'Average' in descending order,
        with 'Average' moved to the second column and values rounded to
        four decimals. For MMLU-Pro-Hy the 'Average' is computed from the
        subject columns, 'Accuracy' is dropped and 'Other' is moved last.

    Raises:
        ValueError: If ``exam_type`` is not one of the two known values.
    """
    if exam_type == ARMENIAN_EXAMS:
        df = pd.read_csv(ARMENIAN_RESULTS_CSV)
        df = df.sort_values(by='Average', ascending=False)
        cols = df.columns.tolist()
        cols.insert(1, cols.pop(cols.index('Average')))
        # The line break keeps the long column header narrow in the table.
        df = df[cols].rename(columns={
            'Armenian language and literature':
                'Armenian language\nand literature',
        })
    elif exam_type == MMLU_PRO_HY:
        df = _load_mmlu_results()
        df = df.sort_values(by='Average', ascending=False)
        cols = df.columns.tolist()
        cols.remove('Accuracy')
        cols.insert(1, cols.pop(cols.index('Average')))
        cols.append(cols.pop(cols.index('Other')))
        df = df[cols]
    else:
        raise _unknown_exam_type(exam_type)
    return df.round(4)


def create_bar_chart(exam_type: str, plot_column: str):
    """Build a horizontal bar chart of one results column per model.

    Args:
        exam_type: ``"Armenian Exams"`` or ``"MMLU-Pro-Hy"``.
        plot_column: Name of the CSV column to plot on the x axis.

    Returns:
        A plotly figure with one bar per model, highest value at the top.
        Armenian exam bars are coloured by Fail/Pass/Distinction on a 0-20
        axis; MMLU-Pro-Hy bars use a continuous Viridis scale on a 0-1 axis.

    Raises:
        ValueError: If ``exam_type`` is not one of the two known values.
    """
    title = f'{plot_column}'

    if exam_type == ARMENIAN_EXAMS:
        df = pd.read_csv(ARMENIAN_RESULTS_CSV)
        # Ties on the score are broken alphabetically by model name.
        df = df.sort_values(
            by=[plot_column, 'Model'], ascending=[False, True]
        ).reset_index(drop=True)
        df['Test Result'] = df[plot_column].apply(_test_result_label)
        fig = px.bar(
            df,
            x=plot_column,
            y='Model',
            color='Test Result',
            color_discrete_map=TEST_RESULT_COLORS,
            labels={plot_column: 'Score', 'Model': 'Model'},
            title=title,
            orientation='h',
        )
        return _apply_bar_layout(fig, title, 20)

    if exam_type == MMLU_PRO_HY:
        df = _load_mmlu_results()
        df = df.sort_values(
            by=plot_column, ascending=False
        ).reset_index(drop=True)
        df = df.drop(columns=['Accuracy'])
        fig = px.bar(
            df,
            x=plot_column,
            y='Model',
            color=plot_column,
            color_continuous_scale='Viridis',
            labels={plot_column: 'Accuracy', 'Model': 'Model'},
            title=title,
            orientation='h',
            # Fixed colour range so colours are comparable across subjects.
            range_color=[0, 1],
        )
        return _apply_bar_layout(fig, title, 1.0)

    raise _unknown_exam_type(exam_type)


with gr.Blocks() as app:
    with gr.Tabs():
        with gr.TabItem("Armenian Unified Exams"):
            gr.Markdown("# Armenian Unified Test Exams")
            gr.HTML(ARMENIAN_EXAMS_INTRO_HTML)
            # Callables are passed as values so the data is computed by
            # Gradio rather than once at import time.
            table_output_armenian = gr.DataFrame(
                value=lambda: display_table(ARMENIAN_EXAMS)
            )
            plot_column_dropdown = gr.Dropdown(
                choices=ARMENIAN_PLOT_CHOICES,
                value='Average',
                label='Select Column to Plot',
            )
            plot_output_armenian = gr.Plot(
                lambda column: create_bar_chart(ARMENIAN_EXAMS, column),
                inputs=plot_column_dropdown,
            )
        with gr.TabItem("MMLU-Pro-Hy"):
            gr.Markdown("# MMLU-Pro Translated to Armenian (MMLU-Pro-Hy)")
            gr.HTML(MMLU_PRO_HY_INTRO_HTML)
            table_output_mmlu = gr.DataFrame(
                value=lambda: display_table(MMLU_PRO_HY)
            )
            subject_cols = list(MMLU_PLOT_CHOICES)
            plot_column_dropdown_mmlu = gr.Dropdown(
                choices=subject_cols,
                value='Average',
                label='Select Column to Plot',
            )
            plot_output_mmlu = gr.Plot(
                lambda column: create_bar_chart(MMLU_PRO_HY, column),
                inputs=plot_column_dropdown_mmlu,
            )

# Guarded so importing this module (e.g. for tests or reload tooling) builds
# the UI without starting a server; running it as a script launches as before.
if __name__ == "__main__":
    app.launch(share=True, debug=True)