DataVerse

Running

App Files Files Community

evijit HF Staff committed on Jun 19

Commit

e65f153

verified ·

1 Parent(s): addb03f

Update app.py

Browse files

Files changed (1) hide show

app.py +39 -29

app.py CHANGED Viewed

@@ -1,3 +1,5 @@
 import gradio as gr
 import pandas as pd
 import plotly.express as px
@@ -27,11 +29,12 @@ def load_datasets_data():
         print(err_msg)
         return pd.DataFrame(), False, err_msg
-# --- MODIFIED: Core logic to create the "Other" category ---
 def make_treemap_data(df, count_by, top_k=25, tag_filter=None, skip_cats=None):
     """
-    Filter data and prepare it for the treemap, grouping smaller organizations
-    into an "Other" category.
     """
     if df is None or df.empty:
         return pd.DataFrame()
@@ -55,40 +58,46 @@ def make_treemap_data(df, count_by, top_k=25, tag_filter=None, skip_cats=None):
         filtered_df[count_by] = 0.0
     filtered_df[count_by] = pd.to_numeric(filtered_df[count_by], errors='coerce').fillna(0.0)
-    # 1. Calculate totals for ALL organizations
     all_org_totals = filtered_df.groupby("organization")[count_by].sum()
-    # 2. Identify the Top N organizations
-    top_org_totals = all_org_totals.nlargest(top_k, keep='first')
-    # 3. Calculate the sum for the "Other" category
-    other_total = all_org_totals.sum() - top_org_totals.sum()
-    # 4. Create the aggregated DataFrame for the plot
-    treemap_data = top_org_totals.reset_index()
-    # 5. Add the "Other" row if its value is greater than zero
     if other_total > 0:
-        other_row = pd.DataFrame([{'organization': 'Other', count_by: other_total}])
-        treemap_data = pd.concat([treemap_data, other_row], ignore_index=True)
-    # 6. Apply the skip filter at the end (e.g., to hide the "Other" category)
     if skip_cats and len(skip_cats) > 0:
-        treemap_data = treemap_data[~treemap_data["organization"].isin(skip_cats)]
-    treemap_data["root"] = "datasets"
-    return treemap_data
-# --- MODIFIED: Simplified path for the treemap ---
 def create_treemap(treemap_data, count_by, title=None):
-    """Generate the Plotly treemap figure from aggregated data."""
-    if treemap_data.empty or treemap_data[count_by].sum() == 0:
         fig = px.treemap(names=["No data matches filters"], parents=[""], values=[1])
         fig.update_layout(title="No data matches the selected filters", margin=dict(t=50, l=25, r=25, b=25))
         return fig
-    # The path is now simpler as we are not showing individual dataset IDs
-    fig = px.treemap(treemap_data, path=["root", "organization"], values=count_by,
                      title=title, color_discrete_sequence=px.colors.qualitative.Plotly)
     fig.update_layout(margin=dict(t=50, l=25, r=25, b=25))
     fig.update_traces(
@@ -97,7 +106,7 @@ def create_treemap(treemap_data, count_by, title=None):
     )
     return fig
-# --- Gradio UI Blocks ---
 with gr.Blocks(title="🤗 Dataverse Explorer", fill_width=True) as demo:
     datasets_data_state = gr.State(pd.DataFrame())
     loading_complete_state = gr.State(False)
@@ -125,7 +134,6 @@ with gr.Blocks(title="🤗 Dataverse Explorer", fill_width=True) as demo:
                 value=25
             )
-            # --- MODIFIED: UI updated to reflect the new functionality ---
             skip_cats_textbox = gr.Textbox(
                 label="Categories to Skip (e.g., Other)",
                 value="Other"
@@ -174,7 +182,7 @@ with gr.Blocks(title="🤗 Dataverse Explorer", fill_width=True) as demo:
         return current_df, load_success_flag, data_info_text, status_msg_ui
-    # --- MODIFIED: Updated controller to handle new logic and stats ---
     def ui_generate_plot_controller(metric_choice, tag_choice, k_orgs,
                                    skip_cats_input, df_current_datasets, progress=gr.Progress()):
         if df_current_datasets is None or df_current_datasets.empty:
@@ -190,19 +198,21 @@ with gr.Blocks(title="🤗 Dataverse Explorer", fill_width=True) as demo:
         chart_title = f"HuggingFace Datasets - {title_labels.get(metric_choice, metric_choice)} by Organization"
         plotly_fig = create_treemap(treemap_df, metric_choice, chart_title)
-        # Update plot statistics to be more accurate for the new view
         if treemap_df.empty:
             plot_stats_md = "No data matches the selected filters. Please try different options."
         else:
             total_value_in_plot = treemap_df[metric_choice].sum()
             plot_stats_md = (
-                f"## Plot Statistics\n- **Top Categories Shown**: {len(treemap_df):,}\n"
                 f"- **Total {metric_choice} in plot**: {int(total_value_in_plot):,}"
             )
         return plotly_fig, plot_stats_md
-    # --- Event Wiring (no changes needed here) ---
     demo.load(
         fn=ui_load_data_controller,
         inputs=[],

+# --- app.py (Dataverse Explorer - Corrected with drill-down) ---
 import gradio as gr
 import pandas as pd
 import plotly.express as px
         print(err_msg)
         return pd.DataFrame(), False, err_msg
+# --- CORRECTED: This function now preserves individual datasets for top orgs ---
 def make_treemap_data(df, count_by, top_k=25, tag_filter=None, skip_cats=None):
     """
+    Filter data and prepare it for a multi-level treemap.
+    - Preserves individual datasets for the top K organizations.
+    - Groups all other organizations into a single "Other" category.
     """
     if df is None or df.empty:
         return pd.DataFrame()
         filtered_df[count_by] = 0.0
     filtered_df[count_by] = pd.to_numeric(filtered_df[count_by], errors='coerce').fillna(0.0)
+    # 1. Get total for every organization to determine the top K
     all_org_totals = filtered_df.groupby("organization")[count_by].sum()
+    top_org_names = all_org_totals.nlargest(top_k, keep='first').index.tolist()
+    # 2. Get the full data for the individual datasets belonging to the top organizations
+    top_orgs_df = filtered_df[filtered_df['organization'].isin(top_org_names)].copy()
+    # 3. Calculate the total for the "Other" category
+    other_total = all_org_totals[~all_org_totals.index.isin(top_org_names)].sum()
+    # 4. Create the final DataFrame for the plot
+    final_df_for_plot = top_orgs_df
+    # 5. Add the "Other" row as a single entry if its value is greater than zero
     if other_total > 0:
+        other_row = pd.DataFrame([{
+            'organization': 'Other',
+            'id': 'Other',  # The 'id' for the "Other" category must be defined for the path
+            count_by: other_total
+        }])
+        final_df_for_plot = pd.concat([final_df_for_plot, other_row], ignore_index=True)
+    # 6. Apply the skip filter to the organization/category level
     if skip_cats and len(skip_cats) > 0:
+        final_df_for_plot = final_df_for_plot[~final_df_for_plot['organization'].isin(skip_cats)]
+    final_df_for_plot["root"] = "datasets"
+    return final_df_for_plot
+# --- CORRECTED: The path is now restored to allow drill-down ---
 def create_treemap(treemap_data, count_by, title=None):
+    """Generate the Plotly treemap figure from the prepared data."""
+    if treemap_data.empty or treemap_data[count_by].sum() <= 0:
         fig = px.treemap(names=["No data matches filters"], parents=[""], values=[1])
         fig.update_layout(title="No data matches the selected filters", margin=dict(t=50, l=25, r=25, b=25))
         return fig
+    # The path is restored to `["root", "organization", "id"]` to enable drill-down.
+    # The "Other" row with id='Other' will correctly be displayed as a single block.
+    fig = px.treemap(treemap_data, path=["root", "organization", "id"], values=count_by,
                      title=title, color_discrete_sequence=px.colors.qualitative.Plotly)
     fig.update_layout(margin=dict(t=50, l=25, r=25, b=25))
     fig.update_traces(
     )
     return fig
+# --- Gradio UI Blocks (no changes needed here) ---
 with gr.Blocks(title="🤗 Dataverse Explorer", fill_width=True) as demo:
     datasets_data_state = gr.State(pd.DataFrame())
     loading_complete_state = gr.State(False)
                 value=25
             )
             skip_cats_textbox = gr.Textbox(
                 label="Categories to Skip (e.g., Other)",
                 value="Other"
         return current_df, load_success_flag, data_info_text, status_msg_ui
+    # --- CORRECTED: Updated stats to reflect the new plot structure ---
     def ui_generate_plot_controller(metric_choice, tag_choice, k_orgs,
                                    skip_cats_input, df_current_datasets, progress=gr.Progress()):
         if df_current_datasets is None or df_current_datasets.empty:
         chart_title = f"HuggingFace Datasets - {title_labels.get(metric_choice, metric_choice)} by Organization"
         plotly_fig = create_treemap(treemap_df, metric_choice, chart_title)
         if treemap_df.empty:
             plot_stats_md = "No data matches the selected filters. Please try different options."
         else:
             total_value_in_plot = treemap_df[metric_choice].sum()
+            # Count datasets, excluding our placeholder "Other" id
+            total_datasets_in_plot = treemap_df[treemap_df['id'] != 'Other']['id'].nunique()
             plot_stats_md = (
+                f"## Plot Statistics\n- **Organizations/Categories Shown**: {treemap_df['organization'].nunique():,}\n"
+                f"- **Individual Datasets Shown**: {total_datasets_in_plot:,}\n"
                 f"- **Total {metric_choice} in plot**: {int(total_value_in_plot):,}"
             )
         return plotly_fig, plot_stats_md
+    # --- Event Wiring (no changes needed) ---
     demo.load(
         fn=ui_load_data_controller,
         inputs=[],