Spaces:

kyleledbetter
/

responsibleGPT

Build error

App Files Files Community

kyleledbetter committed on Apr 6, 2023

Commit

a5ba058

1 Parent(s): 6784da7

feat(app): gpt, dashboard, and dark mode

Browse files

Files changed (1) hide show

app.py +168 -57

app.py CHANGED Viewed

@@ -1,7 +1,7 @@
 import gradio as gr
 import requests
 import json
-from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModelForTokenClassification, AutoModelForQuestionAnswering
 from datasets import load_dataset
 import datasets
@@ -70,6 +70,7 @@ def generate_label_map(dataset):
       label_map = {i: label for i, label in enumerate(set(dataset["label"]))}
   return label_map
 def calculate_fairness_score(results, label_map):
   true_labels = [r[1] for r in results]
   pred_labels = [r[2] for r in results]
@@ -88,7 +89,7 @@ def calculate_fairness_score(results, label_map):
       cm = confusion_matrix(true_group_labels, pred_group_labels, labels=list(group_names))
       group_cms[group] = cm
-  # Calculate fairness score
   score = 0
   for i, group1 in enumerate(group_names):
       for j, group2 in enumerate(group_names):
@@ -100,6 +101,7 @@ def calculate_fairness_score(results, label_map):
   return accuracy, score
 def calculate_per_class_metrics(true_labels, pred_labels, label_map, metric='accuracy'):
     unique_labels = sorted(label_map.values())
     metrics = []
@@ -119,12 +121,31 @@ def calculate_per_class_metrics(true_labels, pred_labels, label_map, metric='acc
     return metrics
-def generate_visualization(visualization_type, results, label_map):
     true_labels = [r[1] for r in results]
     pred_labels = [r[2] for r in results]
     if visualization_type == "confusion_matrix":
-        return generate_report_card(results, label_map)["fig"]
     elif visualization_type == "per_class_accuracy":
         per_class_accuracy = calculate_per_class_metrics(
             true_labels, pred_labels, label_map, metric='accuracy')
@@ -139,8 +160,17 @@ def generate_visualization(visualization_type, results, label_map):
                 marker_color=colors[i % len(colors)]
             ))
-        fig.update_layout(title='Per-Class Accuracy',
-                        xaxis_title='Class', yaxis_title='Accuracy')
         return fig
     elif visualization_type == "per_class_f1":
         per_class_f1 = calculate_per_class_metrics(
@@ -156,35 +186,107 @@ def generate_visualization(visualization_type, results, label_map):
                 marker_color=colors[i % len(colors)]
             ))
-        fig.update_layout(title='Per-Class F1-Score',
-                        xaxis_title='Class', yaxis_title='F1-Score')
         return fig
     else:
         raise ValueError(f"Invalid visualization type: {visualization_type}")
-def generate_report_card(results, label_map):
   true_labels = [r[1] for r in results]
   pred_labels = [r[2] for r in results]
-  cm = confusion_matrix(true_labels, pred_labels,
-                        labels=list(label_map.values()))
-  # Create the plotly figure
-  fig = make_subplots(rows=1, cols=1)
-  fig.add_trace(go.Heatmap(
-      z=cm,
-      x=list(label_map.values()),
-      y=list(label_map.values()),
-      colorscale='RdYlGn',
-      colorbar=dict(title='# of Samples')
-  ))
   fig.update_layout(
       height=500, width=600,
       title='Confusion Matrix',
       xaxis=dict(title='Predicted Labels'),
-      yaxis=dict(title='True Labels', autorange='reversed')
   )
   # Create the text output
@@ -197,31 +299,6 @@ def generate_report_card(results, label_map):
   per_class_f1 = calculate_per_class_metrics(
         true_labels, pred_labels, label_map, metric='f1')
-  text_output = html.Div(children=[
-      html.H2('Performance Metrics'),
-      html.Div(children=[
-          html.Div(children=[
-              html.H3('Accuracy'),
-              html.H4(f'{accuracy}')
-          ], className='metric'),
-          html.Div(children=[
-              html.H3('Fairness Score'),
-              # html.H4(f'{fairness_score}')
-              html.H4(
-                  f'Accuracy: {fairness_score[0]:.2f}, Score: {fairness_score[1]:.2f}')
-          ], className='metric'),
-      ], className='metric-container'),
-  ], className='text-output')
-  # Combine the plot and text output into a Dash container
-  # report_card = html.Div([
-  #    dcc.Graph(figure=fig),
-  #    text_output,
-  # ])
-  # return report_card
   report_card = {
       "fig": fig,
       "accuracy": accuracy,
@@ -232,9 +309,26 @@ def generate_report_card(results, label_map):
   return report_card
   # return fig, text_output
-def app(model_type: str, model_name_or_path: str, dataset_name: str, config_name: str, dataset_split: str, num_samples: int, visualization_type: str):
   tokenizer, model = load_model(
       model_type, model_name_or_path, dataset_name, config_name)
@@ -277,17 +371,33 @@ def app(model_type: str, model_name_or_path: str, dataset_name: str, config_name
   # return fig, text_output
-  report_card = generate_report_card(results, label_map)
-  visualization = generate_visualization(visualization_type, results, label_map)
   per_class_metrics_str = "\n".join([f"{label}: Acc {acc:.2f}, F1 {f1:.2f}" for label, acc, f1 in zip(
       label_map.values(), report_card['per_class_accuracy'], report_card['per_class_f1'])])
   # return report_card["fig"], f"Accuracy: {report_card['accuracy']}, Fairness Score: {report_card['fairness_score'][1]:.2f}"
   # return f"Accuracy: {report_card['accuracy']}, Fairness Score: {report_card['fairness_score'][1]:.2f}", report_card["fig"]
-  return (f"Accuracy: {report_card['accuracy']}, Fairness Score: {report_card['fairness_score'][1]:.2f}\n\n"
-        f"Per-Class Metrics:\n{per_class_metrics_str}"), visualization
 interface = gr.Interface(
     fn=app,
@@ -304,8 +414,9 @@ interface = gr.Interface(
             choices=["train", "validation", "test"], label="Dataset Split", default="validation"),
         gr.inputs.Number(default=100, label="Number of Samples"),
         gr.inputs.Dropdown(
-            choices=["confusion_matrix", "per_class_accuracy", "per_class_f1"], label="Visualization Type", default="confusion_matrix"
         ),
     ],
     # outputs=gr.Plot(),
     # outputs=gr.outputs.HTML(),

 import gradio as gr
 import requests
 import json
+from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification, AutoModelForTokenClassification, AutoModelForQuestionAnswering
 from datasets import load_dataset
 import datasets
       label_map = {i: label for i, label in enumerate(set(dataset["label"]))}
   return label_map
+# Explain fairness score: https://arxiv.org/pdf/1908.09635.pdf
 def calculate_fairness_score(results, label_map):
   true_labels = [r[1] for r in results]
   pred_labels = [r[2] for r in results]
       cm = confusion_matrix(true_group_labels, pred_group_labels, labels=list(group_names))
       group_cms[group] = cm
+  # Calculate the fairness score, i.e. the average difference between the per-group confusion matrices
   score = 0
   for i, group1 in enumerate(group_names):
       for j, group2 in enumerate(group_names):
   return accuracy, score
+# Per-class metrics are the metrics computed separately for each class; the classes are defined by label_map
 def calculate_per_class_metrics(true_labels, pred_labels, label_map, metric='accuracy'):
     unique_labels = sorted(label_map.values())
     metrics = []
     return metrics
def generate_fairness_statement(accuracy, fairness_score):
    """Build a one-sentence, human-readable fairness assessment.

    The sentence is chosen solely from the fairness score, which is bucketed
    into three levels: ``<= 0.15`` is "low", ``<= 0.3`` is "moderate" and
    anything else is "high". Accuracy never influences which sentence is
    picked; it is only reported inside the sentence.

    Args:
        accuracy: Overall accuracy as a fraction; it is multiplied by 100
            and shown as a percentage with two decimals.
        fairness_score: Numeric fairness score, shown with two decimals.

    Returns:
        A string that starts with ``"Assessment: "`` followed by the
        sentence for the matching fairness level.
    """
    # Both numbers appear in every variant, so format them once.
    score_text = f"{fairness_score:.2f}"
    accuracy_text = f"{accuracy * 100:.2f}%"

    if fairness_score <= 0.15:
        verdict = (
            f"The low fairness score ({score_text}) and accuracy ({accuracy_text}) "
            "indicate that the model is relatively fair and does not exhibit "
            "significant bias across different groups."
        )
    elif fairness_score <= 0.3:
        verdict = (
            f"The moderate fairness score ({score_text}) and accuracy ({accuracy_text}) "
            "suggest that the model may have some bias across different groups, "
            "and further investigation is needed to ensure it does not "
            "disproportionately affect certain groups."
        )
    else:
        verdict = (
            f"The high fairness score ({score_text}) and accuracy ({accuracy_text}) "
            "indicate that the model exhibits significant bias across different "
            "groups, and it's recommended to address this issue to ensure fair "
            "predictions for all groups."
        )
    return "Assessment: " + verdict
+def generate_visualization(visualization_type, results, label_map, chart_mode):
     true_labels = [r[1] for r in results]
     pred_labels = [r[2] for r in results]
+    background_color = "white" if chart_mode == "Light" else "black"
+    text_color = "black" if chart_mode == "Light" else "white"
     if visualization_type == "confusion_matrix":
+        return generate_report_card(results, label_map, chart_mode)["fig"]
     elif visualization_type == "per_class_accuracy":
         per_class_accuracy = calculate_per_class_metrics(
             true_labels, pred_labels, label_map, metric='accuracy')
                 marker_color=colors[i % len(colors)]
             ))
+        fig.update_xaxes(showgrid=True, gridwidth=1,
+                         gridcolor='LightGray', linecolor='black', linewidth=1)
+        fig.update_yaxes(showgrid=True, gridwidth=1,
+                         gridcolor='LightGray', linecolor='black', linewidth=1)
+        fig.update_layout(plot_bgcolor=background_color,
+                          paper_bgcolor=background_color,
+                          font=dict(color=text_color),
+                          title='Per-Class Accuracy',
+                          xaxis_title='Class', yaxis_title='Accuracy'
+                          )
         return fig
     elif visualization_type == "per_class_f1":
         per_class_f1 = calculate_per_class_metrics(
                 marker_color=colors[i % len(colors)]
             ))
+        fig.update_xaxes(showgrid=True, gridwidth=1,
+                         gridcolor='LightGray', linecolor='black', linewidth=1)
+        fig.update_yaxes(showgrid=True, gridwidth=1,
+                         gridcolor='LightGray', linecolor='black', linewidth=1)
+        fig.update_layout(plot_bgcolor=background_color,
+                          paper_bgcolor=background_color,
+                          font=dict(color=text_color),
+                          title='Per-Class F1-Score',
+                          xaxis_title='Class', yaxis_title='F1-Score'
+                          )
         return fig
+    elif visualization_type == "interactive_dashboard":
+        return generate_interactive_dashboard(results, label_map, chart_mode)
     else:
         raise ValueError(f"Invalid visualization type: {visualization_type}")
def generate_interactive_dashboard(results, label_map, chart_mode):
    """Assemble the combined "Fairness Report" figure.

    The figure has a 2x2 grid: the confusion-matrix heatmap spans the whole
    top row, the per-class accuracy bars sit bottom-left and the per-class
    F1 bars bottom-right.

    Args:
        results: Sequence of records; index 1 of each record is read as the
            true label and index 2 as the predicted label.
        label_map: Mapping whose values are used as the x categories of both
            bar charts; it is also forwarded to the metric helpers.
        chart_mode: ``"Light"`` selects a white background with black text;
            any other value selects a black background with white text.

    Returns:
        The figure created by ``make_subplots`` with the three traces added
        and the layout applied.
    """
    if chart_mode == "Light":
        background_color, text_color = "white", "black"
    else:
        background_color, text_color = "black", "white"

    palette = ['#EF553B', '#00CC96', '#636EFA',   '#AB63FA', '#FFA15A',
               '#19D3F3', '#FF6692', '#B6E880', '#FF97FF', '#FECB52']

    actual = [record[1] for record in results]
    predicted = [record[2] for record in results]

    # Reuse the heatmap trace of the stand-alone confusion-matrix figure.
    confusion_figure = generate_report_card(results, label_map, chart_mode)["fig"]

    def per_class_bar(metric_name):
        # One bar per label_map value, coloured from the shared palette.
        scores = calculate_per_class_metrics(
            actual, predicted, label_map, metric=metric_name)
        return go.Bar(x=list(label_map.values()), y=scores,
                      marker=dict(color=palette))

    accuracy_bar = per_class_bar('accuracy')
    f1_bar = per_class_bar('f1')

    # Top row: a single cell spanning both columns; bottom row: two cells.
    dashboard = make_subplots(
        rows=2, cols=2, shared_xaxes=True,
        specs=[[{"colspan": 2}, None],
               [{}, {}]],
        print_grid=True,
        subplot_titles=(
            "Confusion Matrix", "Per-Class Accuracy", "Per-Class F1-Score"),
    )

    placements = (
        (confusion_figure['data'][0], 1, 1),
        (accuracy_bar, 2, 1),
        (f1_bar, 2, 2),
    )
    for trace, grid_row, grid_col in placements:
        dashboard.add_trace(trace, row=grid_row, col=grid_col)

    # Identical grid/axis-line styling on both axes of every subplot.
    axis_style = dict(showgrid=True, gridwidth=1,
                      gridcolor='LightGray', linecolor='black', linewidth=1)
    dashboard.update_xaxes(**axis_style)
    dashboard.update_yaxes(**axis_style)

    dashboard.update_layout(
        height=700, width=650,
        plot_bgcolor=background_color,
        paper_bgcolor=background_color,
        font=dict(color=text_color),
        title="Fairness Report", showlegend=False,
    )
    return dashboard
+def generate_report_card(results, label_map, chart_mode):
   true_labels = [r[1] for r in results]
   pred_labels = [r[2] for r in results]
+  background_color = "white" if chart_mode == "Light" else "black"
+  text_color = "black" if chart_mode == "Light" else "white"
+  cm = confusion_matrix(true_labels, pred_labels)
+  # Normalize the confusion matrix
+  cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
+  # Create a custom color scale
+  custom_color_scale = np.zeros(cm_normalized.shape, dtype='str')
+  for i in range(cm_normalized.shape[0]):
+        for j in range(cm_normalized.shape[1]):
+            custom_color_scale[i, j] = '#EF553B' if i == j else '#00CC96'
+  fig = go.Figure(go.Heatmap(z=cm_normalized,
+                            x=list(label_map.values()),
+                            y=list(label_map.values()),
+                            text=cm,
+                            hovertemplate='%{text}',
+                             colorscale=[[0, '#EF553B'], [
+                                 1, '#00CC96']],
+                            showscale=False,
+                            zmin=0, zmax=1,
+                            customdata=custom_color_scale))
+  fig.update_xaxes(showgrid=True, gridwidth=1,
+                   gridcolor='LightGray', linecolor='black', linewidth=1)
+  fig.update_yaxes(showgrid=True, gridwidth=1,
+                    gridcolor='LightGray', linecolor='black', linewidth=1)
   fig.update_layout(
+      plot_bgcolor=background_color,
+      paper_bgcolor=background_color,
+      font=dict(color=text_color),
       height=500, width=600,
       title='Confusion Matrix',
       xaxis=dict(title='Predicted Labels'),
+      yaxis=dict(title='True Labels')
   )
   # Create the text output
   per_class_f1 = calculate_per_class_metrics(
         true_labels, pred_labels, label_map, metric='f1')
   report_card = {
       "fig": fig,
       "accuracy": accuracy,
   return report_card
   # return fig, text_output
def _has_metric_values(metric_values):
    """Return True when *metric_values* holds at least one entry.

    Sized containers are checked by length, so array-like objects whose
    truth value is ambiguous do not raise; unsized values fall back to
    ordinary truthiness.
    """
    if metric_values is None:
        return False
    try:
        return len(metric_values) > 0
    except TypeError:
        return bool(metric_values)


def generate_insights(custom_text, model_name, dataset_name, accuracy, fairness_score, report_card, generator):
    """Ask a text generator for commentary on the evaluation results.

    A prompt is built from the supplied summary values and passed to
    ``generator``; the generated text of the first returned candidate is
    returned.

    Args:
        custom_text: Text placed at the very start of the prompt.
        model_name: Name of the evaluated model, quoted in the prompt.
        dataset_name: Name of the evaluation dataset, quoted in the prompt.
        accuracy: Overall accuracy as a fraction; shown as a percentage.
        fairness_score: Numeric fairness score; shown with two decimals.
        report_card: Dict that may contain ``'per_class_accuracy'`` and
            ``'per_class_f1'``. When either is missing or empty the prompt
            states that per-class metrics could not be calculated.
        generator: Callable invoked as ``generator(prompt, max_length=600,
            do_sample=True, temperature=0.7)``; it must return a sequence
            whose first item is a mapping with a ``'generated_text'`` key.

    Returns:
        ``generator(...)[0]['generated_text']``.
    """
    per_class_metrics = {
        'accuracy': report_card.get('per_class_accuracy', []),
        'f1': report_card.get('per_class_f1', [])
    }

    # The opening of the prompt is the same whether or not per-class metrics exist.
    summary = (
        f"{custom_text} The model {model_name} has been evaluated on the "
        f"{dataset_name} dataset. It has an overall accuracy of "
        f"{accuracy * 100:.2f}%. The fairness score is {fairness_score:.2f}. "
    )

    if all(_has_metric_values(values) for values in per_class_metrics.values()):
        input_text = (
            f"{summary}The per-class metrics are: {per_class_metrics}. "
            "Please provide some interesting insights about the fairness, "
            "bias, and per-class performance."
        )
    else:
        input_text = (
            f"{summary}Per-class metrics could not be calculated. "
            "Please provide some interesting insights about the fairness "
            "and bias of the model."
        )

    # NOTE(review): if `generator` is a Hugging Face text-generation pipeline,
    # max_length presumably counts the prompt tokens as well - confirm that 600
    # leaves room for the completion when there are many classes.
    insights = generator(input_text, max_length=600,
                         do_sample=True, temperature=0.7)
    return insights[0]['generated_text']
+def app(model_type: str, model_name_or_path: str, dataset_name: str, config_name: str, dataset_split: str, num_samples: int, visualization_type: str, chart_mode: str):
   tokenizer, model = load_model(
       model_type, model_name_or_path, dataset_name, config_name)
   # return fig, text_output
+  report_card = generate_report_card(results, label_map, chart_mode)
+  visualization = generate_visualization(visualization_type, results, label_map, chart_mode)
   per_class_metrics_str = "\n".join([f"{label}: Acc {acc:.2f}, F1 {f1:.2f}" for label, acc, f1 in zip(
       label_map.values(), report_card['per_class_accuracy'], report_card['per_class_f1'])])
+  accuracy, fairness_score = calculate_fairness_score(results, label_map)
+  fairness_statement = generate_fairness_statement(accuracy, fairness_score)
+  # device=-1 runs the pipeline on the CPU; pass a GPU index (e.g. device=0) to use a GPU if one is available.
+  generator = pipeline(
+      'text-generation', model='gpt2', device=-1)  # Use EleutherAI/gpt-neo-1.3B or EleutherAI/GPT-J-6B for GPT3 for distilgpt2 for GPT2
+  per_class_metrics = {
+      'accuracy': report_card['per_class_accuracy'],
+      'f1': report_card['per_class_f1']
+  }
+  custom_text = fairness_statement
+  insights = generate_insights(custom_text, model_name_or_path,
+                               dataset_name, accuracy, fairness_score, report_card, generator)
   # return report_card["fig"], f"Accuracy: {report_card['accuracy']}, Fairness Score: {report_card['fairness_score'][1]:.2f}"
   # return f"Accuracy: {report_card['accuracy']}, Fairness Score: {report_card['fairness_score'][1]:.2f}", report_card["fig"]
+  return (f"{insights}\n\n"
+          f"Accuracy: {report_card['accuracy']}, Fairness Score: {report_card['fairness_score'][1]: .2f}\n\n"
+          f"Per-Class Metrics:\n{per_class_metrics_str}"), visualization
 interface = gr.Interface(
     fn=app,
             choices=["train", "validation", "test"], label="Dataset Split", default="validation"),
         gr.inputs.Number(default=100, label="Number of Samples"),
         gr.inputs.Dropdown(
+            choices=["interactive_dashboard", "confusion_matrix", "per_class_accuracy", "per_class_f1"], label="Visualization Type", default="interactive_dashboard"
         ),
+        gr.inputs.Radio(["Light", "Dark"], label="Chart Mode", default="Light"),
     ],
     # outputs=gr.Plot(),
     # outputs=gr.outputs.HTML(),