DataVerse

Running

App Files Community

evijit HF Staff committed on Jun 19

Commit

addb03f

verified ·

1 Parent(s): 9d43fb0

Update app.py

Browse files

Files changed (1) hide show

app.py +49 -53

app.py CHANGED Viewed

@@ -1,5 +1,3 @@
-# --- app.py (Dataverse Explorer) ---
 import gradio as gr
 import pandas as pd
 import plotly.express as px
@@ -8,7 +6,7 @@ from datasets import load_dataset
 # --- Constants ---
 TOP_K_CHOICES = list(range(5, 51, 5))
-HF_DATASET_ID = "evijit/dataverse_daily_data" # <-- Changed to the new dataset repo
 TAG_FILTER_CHOICES = [
     "None", "Audio & Speech", "Time series", "Robotics", "Music",
     "Video", "Images", "Text", "Biomedical", "Sciences"
@@ -19,12 +17,8 @@ def load_datasets_data():
     start_time = time.time()
     print(f"Attempting to load dataset from Hugging Face Hub: {HF_DATASET_ID}")
     try:
-        # Load the dataset from the Hub
         dataset_dict = load_dataset(HF_DATASET_ID)
-        # Convert the first split (usually 'train') to a pandas DataFrame
         df = dataset_dict[list(dataset_dict.keys())[0]].to_pandas()
-        # No parameter processing needed for datasets
         msg = f"Successfully loaded dataset in {time.time() - start_time:.2f}s."
         print(msg)
         return df, True, msg
@@ -33,56 +27,68 @@ def load_datasets_data():
         print(err_msg)
         return pd.DataFrame(), False, err_msg
-def make_treemap_data(df, count_by, top_k=25, tag_filter=None, skip_orgs=None):
-    """Filter and prepare data for the treemap visualization."""
     if df is None or df.empty:
         return pd.DataFrame()
     filtered_df = df.copy()
-    # Map UI-friendly tag names to the boolean columns in the dataframe
     col_map = {
         "Audio & Speech": "is_audio_speech", "Music": "has_music", "Robotics": "has_robot",
         "Biomedical": "is_biomed", "Time series": "has_series", "Sciences": "has_science",
         "Video": "has_video", "Images": "has_image", "Text": "has_text"
     }
-    # Apply tag filter if a valid one is selected
     if tag_filter and tag_filter != "None" and tag_filter in col_map:
         if col_map[tag_filter] in filtered_df.columns:
             filtered_df = filtered_df[filtered_df[col_map[tag_filter]]]
-    # Skip specified organizations if any are provided
-    if skip_orgs and len(skip_orgs) > 0 and "organization" in filtered_df.columns:
-        filtered_df = filtered_df[~filtered_df["organization"].isin(skip_orgs)]
     if filtered_df.empty:
         return pd.DataFrame()
-    # Ensure the metric column is numeric
     if count_by not in filtered_df.columns:
         filtered_df[count_by] = 0.0
     filtered_df[count_by] = pd.to_numeric(filtered_df[count_by], errors='coerce').fillna(0.0)
-    # Group by organization and find the top K based on the selected metric
-    org_totals = filtered_df.groupby("organization")[count_by].sum().nlargest(top_k, keep='first')
-    top_orgs_list = org_totals.index.tolist()
-    # Prepare the final data structure for the treemap
-    treemap_data = filtered_df[filtered_df["organization"].isin(top_orgs_list)][["id", "organization", count_by]].copy()
-    treemap_data["root"] = "datasets" # Set the root node for the treemap
     return treemap_data
 def create_treemap(treemap_data, count_by, title=None):
-    """Generate the Plotly treemap figure."""
-    if treemap_data.empty:
-        # Create a placeholder figure if no data matches the filters
         fig = px.treemap(names=["No data matches filters"], parents=[""], values=[1])
         fig.update_layout(title="No data matches the selected filters", margin=dict(t=50, l=25, r=25, b=25))
         return fig
-    # Create the main treemap
-    fig = px.treemap(treemap_data, path=["root", "organization", "id"], values=count_by,
                      title=title, color_discrete_sequence=px.colors.qualitative.Plotly)
     fig.update_layout(margin=dict(t=50, l=25, r=25, b=25))
     fig.update_traces(
@@ -101,7 +107,6 @@ with gr.Blocks(title="🤗 Dataverse Explorer", fill_width=True) as demo:
     with gr.Row():
         with gr.Column(scale=1):
-            # --- Control Panel ---
             count_by_dropdown = gr.Dropdown(
                 label="Metric",
                 choices=[("Downloads (last 30 days)", "downloads"), ("Downloads (All Time)", "downloadsAllTime"), ("Likes", "likes")],
@@ -120,9 +125,10 @@ with gr.Blocks(title="🤗 Dataverse Explorer", fill_width=True) as demo:
                 value=25
             )
-            skip_orgs_textbox = gr.Textbox(
-                label="Organizations to Skip (comma-separated)",
-                value="huggingface,google,facebook,microsoft,amazon"
             )
             generate_plot_button = gr.Button(
@@ -132,30 +138,24 @@ with gr.Blocks(title="🤗 Dataverse Explorer", fill_width=True) as demo:
             )
         with gr.Column(scale=3):
-            # --- Output Area ---
             plot_output = gr.Plot()
             status_message_md = gr.Markdown("Initializing...")
             data_info_md = gr.Markdown("")
-    # --- Controller Functions ---
     def _update_button_interactivity(is_loaded_flag):
-        """Enable the generate button once data is loaded."""
         return gr.update(interactive=is_loaded_flag)
     def ui_load_data_controller(progress=gr.Progress()):
-        """Handles the initial data loading and updates the UI with status."""
         progress(0, desc=f"Loading dataset '{HF_DATASET_ID}'...")
         try:
             current_df, load_success_flag, status_msg_from_load = load_datasets_data()
             if load_success_flag:
                 progress(0.9, desc="Processing data...")
-                # Format the timestamp for display
                 date_display = "Pre-processed (date unavailable)"
                 if 'data_download_timestamp' in current_df.columns and pd.notna(current_df['data_download_timestamp'].iloc[0]):
                     ts = pd.to_datetime(current_df['data_download_timestamp'].iloc[0], utc=True)
                     date_display = ts.strftime('%B %d, %Y, %H:%M:%S %Z')
-                # Create the data information summary
                 data_info_text = (
                     f"### Data Information\n- Source: `{HF_DATASET_ID}`\n"
                     f"- Status: {status_msg_from_load}\n"
@@ -174,55 +174,51 @@ with gr.Blocks(title="🤗 Dataverse Explorer", fill_width=True) as demo:
         return current_df, load_success_flag, data_info_text, status_msg_ui
     def ui_generate_plot_controller(metric_choice, tag_choice, k_orgs,
-                                   skip_orgs_input, df_current_datasets, progress=gr.Progress()):
-        """Handles the plot generation based on user inputs."""
         if df_current_datasets is None or df_current_datasets.empty:
             return create_treemap(pd.DataFrame(), metric_choice), "Dataset data is not loaded."
-        progress(0.1, desc="Preparing data...")
-        orgs_to_skip = [org.strip() for org in skip_orgs_input.split(',') if org.strip()]
-        # Prepare data for the treemap
-        treemap_df = make_treemap_data(df_current_datasets, metric_choice, k_orgs, tag_choice, orgs_to_skip)
         progress(0.7, desc="Generating plot...")
-        # Create a user-friendly title for the chart
         title_labels = {"downloads": "Downloads (last 30 days)", "downloadsAllTime": "Downloads (All Time)", "likes": "Likes"}
         chart_title = f"HuggingFace Datasets - {title_labels.get(metric_choice, metric_choice)} by Organization"
         plotly_fig = create_treemap(treemap_df, metric_choice, chart_title)
-        # Generate summary statistics for the plot
         if treemap_df.empty:
             plot_stats_md = "No data matches the selected filters. Please try different options."
         else:
-            total_items_in_plot = len(treemap_df['id'].unique())
             total_value_in_plot = treemap_df[metric_choice].sum()
-            plot_stats_md = f"## Plot Statistics\n- **Datasets shown**: {total_items_in_plot:,}\n- **Total {metric_choice}**: {int(total_value_in_plot):,}"
         return plotly_fig, plot_stats_md
-    # --- Event Wiring ---
-    # When the app loads, trigger the data fetching process
     demo.load(
         fn=ui_load_data_controller,
         inputs=[],
         outputs=[datasets_data_state, loading_complete_state, data_info_md, status_message_md]
     )
-    # When the loading is complete, enable the "Generate Plot" button
     loading_complete_state.change(
         fn=_update_button_interactivity,
         inputs=loading_complete_state,
         outputs=generate_plot_button
     )
-    # When the "Generate Plot" button is clicked, trigger the plot generation
     generate_plot_button.click(
         fn=ui_generate_plot_controller,
         inputs=[count_by_dropdown, tag_filter_dropdown, top_k_dropdown,
-                skip_orgs_textbox, datasets_data_state],
         outputs=[plot_output, status_message_md]
     )

 import gradio as gr
 import pandas as pd
 import plotly.express as px
 # --- Constants ---
 TOP_K_CHOICES = list(range(5, 51, 5))
+HF_DATASET_ID = "evijit/dataverse_daily_data"
 TAG_FILTER_CHOICES = [
     "None", "Audio & Speech", "Time series", "Robotics", "Music",
     "Video", "Images", "Text", "Biomedical", "Sciences"
     start_time = time.time()
     print(f"Attempting to load dataset from Hugging Face Hub: {HF_DATASET_ID}")
     try:
         dataset_dict = load_dataset(HF_DATASET_ID)
         df = dataset_dict[list(dataset_dict.keys())[0]].to_pandas()
         msg = f"Successfully loaded dataset in {time.time() - start_time:.2f}s."
         print(msg)
         return df, True, msg
         print(err_msg)
         return pd.DataFrame(), False, err_msg
+# --- MODIFIED: Core logic to create the "Other" category ---
+def make_treemap_data(df, count_by, top_k=25, tag_filter=None, skip_cats=None):
+    """
+    Filter data and prepare it for the treemap, grouping smaller organizations
+    into an "Other" category.
+    """
     if df is None or df.empty:
         return pd.DataFrame()
     filtered_df = df.copy()
     col_map = {
         "Audio & Speech": "is_audio_speech", "Music": "has_music", "Robotics": "has_robot",
         "Biomedical": "is_biomed", "Time series": "has_series", "Sciences": "has_science",
         "Video": "has_video", "Images": "has_image", "Text": "has_text"
     }
     if tag_filter and tag_filter != "None" and tag_filter in col_map:
         if col_map[tag_filter] in filtered_df.columns:
             filtered_df = filtered_df[filtered_df[col_map[tag_filter]]]
     if filtered_df.empty:
         return pd.DataFrame()
     if count_by not in filtered_df.columns:
         filtered_df[count_by] = 0.0
     filtered_df[count_by] = pd.to_numeric(filtered_df[count_by], errors='coerce').fillna(0.0)
+    # 1. Calculate totals for ALL organizations
+    all_org_totals = filtered_df.groupby("organization")[count_by].sum()
+    # 2. Identify the Top N organizations
+    top_org_totals = all_org_totals.nlargest(top_k, keep='first')
+    # 3. Calculate the sum for the "Other" category
+    other_total = all_org_totals.sum() - top_org_totals.sum()
+    # 4. Create the aggregated DataFrame for the plot
+    treemap_data = top_org_totals.reset_index()
+    # 5. Add the "Other" row if its value is greater than zero
+    if other_total > 0:
+        other_row = pd.DataFrame([{'organization': 'Other', count_by: other_total}])
+        treemap_data = pd.concat([treemap_data, other_row], ignore_index=True)
+    # 6. Apply the skip filter at the end (e.g., to hide the "Other" category)
+    if skip_cats and len(skip_cats) > 0:
+        treemap_data = treemap_data[~treemap_data["organization"].isin(skip_cats)]
+    treemap_data["root"] = "datasets"
     return treemap_data
+# --- MODIFIED: Simplified path for the treemap ---
 def create_treemap(treemap_data, count_by, title=None):
+    """Generate the Plotly treemap figure from aggregated data."""
+    if treemap_data.empty or treemap_data[count_by].sum() == 0:
         fig = px.treemap(names=["No data matches filters"], parents=[""], values=[1])
         fig.update_layout(title="No data matches the selected filters", margin=dict(t=50, l=25, r=25, b=25))
         return fig
+    # The path is now simpler as we are not showing individual dataset IDs
+    fig = px.treemap(treemap_data, path=["root", "organization"], values=count_by,
                      title=title, color_discrete_sequence=px.colors.qualitative.Plotly)
     fig.update_layout(margin=dict(t=50, l=25, r=25, b=25))
     fig.update_traces(
     with gr.Row():
         with gr.Column(scale=1):
             count_by_dropdown = gr.Dropdown(
                 label="Metric",
                 choices=[("Downloads (last 30 days)", "downloads"), ("Downloads (All Time)", "downloadsAllTime"), ("Likes", "likes")],
                 value=25
             )
+            # --- MODIFIED: UI updated to reflect the new functionality ---
+            skip_cats_textbox = gr.Textbox(
+                label="Categories to Skip (e.g., Other)",
+                value="Other"
             )
             generate_plot_button = gr.Button(
             )
         with gr.Column(scale=3):
             plot_output = gr.Plot()
             status_message_md = gr.Markdown("Initializing...")
             data_info_md = gr.Markdown("")
     def _update_button_interactivity(is_loaded_flag):
         return gr.update(interactive=is_loaded_flag)
     def ui_load_data_controller(progress=gr.Progress()):
         progress(0, desc=f"Loading dataset '{HF_DATASET_ID}'...")
         try:
             current_df, load_success_flag, status_msg_from_load = load_datasets_data()
             if load_success_flag:
                 progress(0.9, desc="Processing data...")
                 date_display = "Pre-processed (date unavailable)"
                 if 'data_download_timestamp' in current_df.columns and pd.notna(current_df['data_download_timestamp'].iloc[0]):
                     ts = pd.to_datetime(current_df['data_download_timestamp'].iloc[0], utc=True)
                     date_display = ts.strftime('%B %d, %Y, %H:%M:%S %Z')
                 data_info_text = (
                     f"### Data Information\n- Source: `{HF_DATASET_ID}`\n"
                     f"- Status: {status_msg_from_load}\n"
         return current_df, load_success_flag, data_info_text, status_msg_ui
+    # --- MODIFIED: Updated controller to handle new logic and stats ---
     def ui_generate_plot_controller(metric_choice, tag_choice, k_orgs,
+                                   skip_cats_input, df_current_datasets, progress=gr.Progress()):
         if df_current_datasets is None or df_current_datasets.empty:
             return create_treemap(pd.DataFrame(), metric_choice), "Dataset data is not loaded."
+        progress(0.1, desc="Aggregating data...")
+        cats_to_skip = [cat.strip() for cat in skip_cats_input.split(',') if cat.strip()]
+        treemap_df = make_treemap_data(df_current_datasets, metric_choice, k_orgs, tag_choice, cats_to_skip)
         progress(0.7, desc="Generating plot...")
         title_labels = {"downloads": "Downloads (last 30 days)", "downloadsAllTime": "Downloads (All Time)", "likes": "Likes"}
         chart_title = f"HuggingFace Datasets - {title_labels.get(metric_choice, metric_choice)} by Organization"
         plotly_fig = create_treemap(treemap_df, metric_choice, chart_title)
+        # Update plot statistics to be more accurate for the new view
         if treemap_df.empty:
             plot_stats_md = "No data matches the selected filters. Please try different options."
         else:
             total_value_in_plot = treemap_df[metric_choice].sum()
+            plot_stats_md = (
+                f"## Plot Statistics\n- **Top Categories Shown**: {len(treemap_df):,}\n"
+                f"- **Total {metric_choice} in plot**: {int(total_value_in_plot):,}"
+            )
         return plotly_fig, plot_stats_md
+    # --- Event Wiring (no changes needed here) ---
     demo.load(
         fn=ui_load_data_controller,
         inputs=[],
         outputs=[datasets_data_state, loading_complete_state, data_info_md, status_message_md]
     )
     loading_complete_state.change(
         fn=_update_button_interactivity,
         inputs=loading_complete_state,
         outputs=generate_plot_button
     )
     generate_plot_button.click(
         fn=ui_generate_plot_controller,
         inputs=[count_by_dropdown, tag_filter_dropdown, top_k_dropdown,
+                skip_cats_textbox, datasets_data_state],
         outputs=[plot_output, status_message_md]
     )