evijit HF Staff committed on
Commit
addb03f
·
verified ·
1 Parent(s): 9d43fb0

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +49 -53
app.py CHANGED
@@ -1,5 +1,3 @@
1
- # --- app.py (Dataverse Explorer) ---
2
-
3
  import gradio as gr
4
  import pandas as pd
5
  import plotly.express as px
@@ -8,7 +6,7 @@ from datasets import load_dataset
8
 
9
  # --- Constants ---
10
  TOP_K_CHOICES = list(range(5, 51, 5))
11
- HF_DATASET_ID = "evijit/dataverse_daily_data" # <-- Changed to the new dataset repo
12
  TAG_FILTER_CHOICES = [
13
  "None", "Audio & Speech", "Time series", "Robotics", "Music",
14
  "Video", "Images", "Text", "Biomedical", "Sciences"
@@ -19,12 +17,8 @@ def load_datasets_data():
19
  start_time = time.time()
20
  print(f"Attempting to load dataset from Hugging Face Hub: {HF_DATASET_ID}")
21
  try:
22
- # Load the dataset from the Hub
23
  dataset_dict = load_dataset(HF_DATASET_ID)
24
- # Convert the first split (usually 'train') to a pandas DataFrame
25
  df = dataset_dict[list(dataset_dict.keys())[0]].to_pandas()
26
-
27
- # No parameter processing needed for datasets
28
  msg = f"Successfully loaded dataset in {time.time() - start_time:.2f}s."
29
  print(msg)
30
  return df, True, msg
@@ -33,56 +27,68 @@ def load_datasets_data():
33
  print(err_msg)
34
  return pd.DataFrame(), False, err_msg
35
 
36
- def make_treemap_data(df, count_by, top_k=25, tag_filter=None, skip_orgs=None):
37
- """Filter and prepare data for the treemap visualization."""
 
 
 
 
38
  if df is None or df.empty:
39
  return pd.DataFrame()
40
 
41
  filtered_df = df.copy()
42
 
43
- # Map UI-friendly tag names to the boolean columns in the dataframe
44
  col_map = {
45
  "Audio & Speech": "is_audio_speech", "Music": "has_music", "Robotics": "has_robot",
46
  "Biomedical": "is_biomed", "Time series": "has_series", "Sciences": "has_science",
47
  "Video": "has_video", "Images": "has_image", "Text": "has_text"
48
  }
49
 
50
- # Apply tag filter if a valid one is selected
51
  if tag_filter and tag_filter != "None" and tag_filter in col_map:
52
  if col_map[tag_filter] in filtered_df.columns:
53
  filtered_df = filtered_df[filtered_df[col_map[tag_filter]]]
54
-
55
- # Skip specified organizations if any are provided
56
- if skip_orgs and len(skip_orgs) > 0 and "organization" in filtered_df.columns:
57
- filtered_df = filtered_df[~filtered_df["organization"].isin(skip_orgs)]
58
 
59
  if filtered_df.empty:
60
  return pd.DataFrame()
61
 
62
- # Ensure the metric column is numeric
63
  if count_by not in filtered_df.columns:
64
  filtered_df[count_by] = 0.0
65
  filtered_df[count_by] = pd.to_numeric(filtered_df[count_by], errors='coerce').fillna(0.0)
66
 
67
- # Group by organization and find the top K based on the selected metric
68
- org_totals = filtered_df.groupby("organization")[count_by].sum().nlargest(top_k, keep='first')
69
- top_orgs_list = org_totals.index.tolist()
 
 
 
 
 
70
 
71
- # Prepare the final data structure for the treemap
72
- treemap_data = filtered_df[filtered_df["organization"].isin(top_orgs_list)][["id", "organization", count_by]].copy()
73
- treemap_data["root"] = "datasets" # Set the root node for the treemap
 
 
 
 
 
 
 
 
 
 
74
  return treemap_data
75
 
 
76
  def create_treemap(treemap_data, count_by, title=None):
77
- """Generate the Plotly treemap figure."""
78
- if treemap_data.empty:
79
- # Create a placeholder figure if no data matches the filters
80
  fig = px.treemap(names=["No data matches filters"], parents=[""], values=[1])
81
  fig.update_layout(title="No data matches the selected filters", margin=dict(t=50, l=25, r=25, b=25))
82
  return fig
83
 
84
- # Create the main treemap
85
- fig = px.treemap(treemap_data, path=["root", "organization", "id"], values=count_by,
86
  title=title, color_discrete_sequence=px.colors.qualitative.Plotly)
87
  fig.update_layout(margin=dict(t=50, l=25, r=25, b=25))
88
  fig.update_traces(
@@ -101,7 +107,6 @@ with gr.Blocks(title="🤗 Dataverse Explorer", fill_width=True) as demo:
101
 
102
  with gr.Row():
103
  with gr.Column(scale=1):
104
- # --- Control Panel ---
105
  count_by_dropdown = gr.Dropdown(
106
  label="Metric",
107
  choices=[("Downloads (last 30 days)", "downloads"), ("Downloads (All Time)", "downloadsAllTime"), ("Likes", "likes")],
@@ -120,9 +125,10 @@ with gr.Blocks(title="🤗 Dataverse Explorer", fill_width=True) as demo:
120
  value=25
121
  )
122
 
123
- skip_orgs_textbox = gr.Textbox(
124
- label="Organizations to Skip (comma-separated)",
125
- value="huggingface,google,facebook,microsoft,amazon"
 
126
  )
127
 
128
  generate_plot_button = gr.Button(
@@ -132,30 +138,24 @@ with gr.Blocks(title="🤗 Dataverse Explorer", fill_width=True) as demo:
132
  )
133
 
134
  with gr.Column(scale=3):
135
- # --- Output Area ---
136
  plot_output = gr.Plot()
137
  status_message_md = gr.Markdown("Initializing...")
138
  data_info_md = gr.Markdown("")
139
 
140
- # --- Controller Functions ---
141
  def _update_button_interactivity(is_loaded_flag):
142
- """Enable the generate button once data is loaded."""
143
  return gr.update(interactive=is_loaded_flag)
144
 
145
  def ui_load_data_controller(progress=gr.Progress()):
146
- """Handles the initial data loading and updates the UI with status."""
147
  progress(0, desc=f"Loading dataset '{HF_DATASET_ID}'...")
148
  try:
149
  current_df, load_success_flag, status_msg_from_load = load_datasets_data()
150
  if load_success_flag:
151
  progress(0.9, desc="Processing data...")
152
- # Format the timestamp for display
153
  date_display = "Pre-processed (date unavailable)"
154
  if 'data_download_timestamp' in current_df.columns and pd.notna(current_df['data_download_timestamp'].iloc[0]):
155
  ts = pd.to_datetime(current_df['data_download_timestamp'].iloc[0], utc=True)
156
  date_display = ts.strftime('%B %d, %Y, %H:%M:%S %Z')
157
 
158
- # Create the data information summary
159
  data_info_text = (
160
  f"### Data Information\n- Source: `{HF_DATASET_ID}`\n"
161
  f"- Status: {status_msg_from_load}\n"
@@ -174,55 +174,51 @@ with gr.Blocks(title="🤗 Dataverse Explorer", fill_width=True) as demo:
174
 
175
  return current_df, load_success_flag, data_info_text, status_msg_ui
176
 
 
177
  def ui_generate_plot_controller(metric_choice, tag_choice, k_orgs,
178
- skip_orgs_input, df_current_datasets, progress=gr.Progress()):
179
- """Handles the plot generation based on user inputs."""
180
  if df_current_datasets is None or df_current_datasets.empty:
181
  return create_treemap(pd.DataFrame(), metric_choice), "Dataset data is not loaded."
182
 
183
- progress(0.1, desc="Preparing data...")
184
- orgs_to_skip = [org.strip() for org in skip_orgs_input.split(',') if org.strip()]
185
 
186
- # Prepare data for the treemap
187
- treemap_df = make_treemap_data(df_current_datasets, metric_choice, k_orgs, tag_choice, orgs_to_skip)
188
 
189
  progress(0.7, desc="Generating plot...")
190
- # Create a user-friendly title for the chart
191
  title_labels = {"downloads": "Downloads (last 30 days)", "downloadsAllTime": "Downloads (All Time)", "likes": "Likes"}
192
  chart_title = f"HuggingFace Datasets - {title_labels.get(metric_choice, metric_choice)} by Organization"
193
  plotly_fig = create_treemap(treemap_df, metric_choice, chart_title)
194
 
195
- # Generate summary statistics for the plot
196
  if treemap_df.empty:
197
  plot_stats_md = "No data matches the selected filters. Please try different options."
198
  else:
199
- total_items_in_plot = len(treemap_df['id'].unique())
200
  total_value_in_plot = treemap_df[metric_choice].sum()
201
- plot_stats_md = f"## Plot Statistics\n- **Datasets shown**: {total_items_in_plot:,}\n- **Total {metric_choice}**: {int(total_value_in_plot):,}"
 
 
 
202
 
203
  return plotly_fig, plot_stats_md
204
 
205
- # --- Event Wiring ---
206
-
207
- # When the app loads, trigger the data fetching process
208
  demo.load(
209
  fn=ui_load_data_controller,
210
  inputs=[],
211
  outputs=[datasets_data_state, loading_complete_state, data_info_md, status_message_md]
212
  )
213
 
214
- # When the loading is complete, enable the "Generate Plot" button
215
  loading_complete_state.change(
216
  fn=_update_button_interactivity,
217
  inputs=loading_complete_state,
218
  outputs=generate_plot_button
219
  )
220
 
221
- # When the "Generate Plot" button is clicked, trigger the plot generation
222
  generate_plot_button.click(
223
  fn=ui_generate_plot_controller,
224
  inputs=[count_by_dropdown, tag_filter_dropdown, top_k_dropdown,
225
- skip_orgs_textbox, datasets_data_state],
226
  outputs=[plot_output, status_message_md]
227
  )
228
 
 
 
 
1
  import gradio as gr
2
  import pandas as pd
3
  import plotly.express as px
 
6
 
7
  # --- Constants ---
8
  TOP_K_CHOICES = list(range(5, 51, 5))
9
+ HF_DATASET_ID = "evijit/dataverse_daily_data"
10
  TAG_FILTER_CHOICES = [
11
  "None", "Audio & Speech", "Time series", "Robotics", "Music",
12
  "Video", "Images", "Text", "Biomedical", "Sciences"
 
17
  start_time = time.time()
18
  print(f"Attempting to load dataset from Hugging Face Hub: {HF_DATASET_ID}")
19
  try:
 
20
  dataset_dict = load_dataset(HF_DATASET_ID)
 
21
  df = dataset_dict[list(dataset_dict.keys())[0]].to_pandas()
 
 
22
  msg = f"Successfully loaded dataset in {time.time() - start_time:.2f}s."
23
  print(msg)
24
  return df, True, msg
 
27
  print(err_msg)
28
  return pd.DataFrame(), False, err_msg
29
 
30
+ # --- MODIFIED: Core logic to create the "Other" category ---
31
def make_treemap_data(df, count_by, top_k=25, tag_filter=None, skip_cats=None):
    """Aggregate per-organization totals for the treemap.

    Rows are optionally restricted to a single tag, the ``count_by`` metric is
    summed per organization, the ``top_k`` largest organizations are kept, and
    all remaining organizations are folded into one "Other" row.

    Args:
        df: Source DataFrame. Must contain an "organization" column; the
            metric column and the boolean tag columns are optional.
        count_by: Name of the metric column to sum. Non-numeric or missing
            values count as 0; an absent column is treated as all zeros.
        top_k: Number of organizations to keep before grouping the rest.
        tag_filter: UI tag label (e.g. "Text"). ``None`` or "None" disables
            tag filtering, as does a label whose column is not in ``df``.
        skip_cats: Category name(s) to drop from the aggregated result,
            e.g. ``["Other"]``. A single string is accepted as one name.

    Returns:
        A DataFrame with the columns "organization", ``count_by`` and "root",
        or an empty DataFrame when there is nothing to aggregate. The input
        frame is never modified.
    """
    # Without an organization column there is nothing to group by.
    if df is None or df.empty or "organization" not in df.columns:
        return pd.DataFrame()

    # UI-friendly tag labels -> boolean flag columns in the dataframe.
    col_map = {
        "Audio & Speech": "is_audio_speech", "Music": "has_music", "Robotics": "has_robot",
        "Biomedical": "is_biomed", "Time series": "has_series", "Sciences": "has_science",
        "Video": "has_video", "Images": "has_image", "Text": "has_text"
    }

    filtered_df = df
    tag_column = col_map.get(tag_filter) if tag_filter else None
    if tag_column is not None and tag_column in filtered_df.columns:
        # .eq(True) builds a usable mask even when the flag column is
        # object-typed or holds missing values, which raw boolean indexing
        # rejects.
        filtered_df = filtered_df[filtered_df[tag_column].eq(True)]

    if filtered_df.empty:
        return pd.DataFrame()

    # Work on a standalone metric Series instead of copying the whole frame.
    if count_by in filtered_df.columns:
        metric = pd.to_numeric(filtered_df[count_by], errors="coerce").fillna(0.0)
    else:
        metric = pd.Series(0.0, index=filtered_df.index)
    metric = metric.rename(count_by)

    # 1. Totals for ALL organizations.
    all_org_totals = metric.groupby(filtered_df["organization"]).sum()

    # 2. The top-k organizations by total.
    top_org_totals = all_org_totals.nlargest(int(top_k), keep="first")

    # 3. "Other" is the sum of the complement; summing the leftovers directly
    #    avoids float round-off that subtracting two totals can introduce.
    other_total = all_org_totals.drop(top_org_totals.index).sum()

    # 4. Aggregated frame for the plot, plus the "Other" row when non-zero.
    treemap_data = top_org_totals.reset_index()
    if other_total > 0:
        other_row = pd.DataFrame([{"organization": "Other", count_by: other_total}])
        treemap_data = pd.concat([treemap_data, other_row], ignore_index=True)

    # 5. Apply the skip filter last so it can also hide the "Other" category.
    if isinstance(skip_cats, str):
        skip_cats = [skip_cats]
    if skip_cats:
        keep_mask = ~treemap_data["organization"].isin(skip_cats)
        treemap_data = treemap_data[keep_mask].reset_index(drop=True)

    # assign() returns a new frame, so no write happens on a filtered slice.
    return treemap_data.assign(root="datasets")
81
 
82
+ # --- MODIFIED: Simplified path for the treemap ---
83
  def create_treemap(treemap_data, count_by, title=None):
84
+ """Generate the Plotly treemap figure from aggregated data."""
85
+ if treemap_data.empty or treemap_data[count_by].sum() == 0:
 
86
  fig = px.treemap(names=["No data matches filters"], parents=[""], values=[1])
87
  fig.update_layout(title="No data matches the selected filters", margin=dict(t=50, l=25, r=25, b=25))
88
  return fig
89
 
90
+ # The path is now simpler as we are not showing individual dataset IDs
91
+ fig = px.treemap(treemap_data, path=["root", "organization"], values=count_by,
92
  title=title, color_discrete_sequence=px.colors.qualitative.Plotly)
93
  fig.update_layout(margin=dict(t=50, l=25, r=25, b=25))
94
  fig.update_traces(
 
107
 
108
  with gr.Row():
109
  with gr.Column(scale=1):
 
110
  count_by_dropdown = gr.Dropdown(
111
  label="Metric",
112
  choices=[("Downloads (last 30 days)", "downloads"), ("Downloads (All Time)", "downloadsAllTime"), ("Likes", "likes")],
 
125
  value=25
126
  )
127
 
128
+ # --- MODIFIED: UI updated to reflect the new functionality ---
129
+ skip_cats_textbox = gr.Textbox(
130
+ label="Categories to Skip (e.g., Other)",
131
+ value="Other"
132
  )
133
 
134
  generate_plot_button = gr.Button(
 
138
  )
139
 
140
  with gr.Column(scale=3):
 
141
  plot_output = gr.Plot()
142
  status_message_md = gr.Markdown("Initializing...")
143
  data_info_md = gr.Markdown("")
144
 
 
145
  def _update_button_interactivity(is_loaded_flag):
 
146
  return gr.update(interactive=is_loaded_flag)
147
 
148
  def ui_load_data_controller(progress=gr.Progress()):
 
149
  progress(0, desc=f"Loading dataset '{HF_DATASET_ID}'...")
150
  try:
151
  current_df, load_success_flag, status_msg_from_load = load_datasets_data()
152
  if load_success_flag:
153
  progress(0.9, desc="Processing data...")
 
154
  date_display = "Pre-processed (date unavailable)"
155
  if 'data_download_timestamp' in current_df.columns and pd.notna(current_df['data_download_timestamp'].iloc[0]):
156
  ts = pd.to_datetime(current_df['data_download_timestamp'].iloc[0], utc=True)
157
  date_display = ts.strftime('%B %d, %Y, %H:%M:%S %Z')
158
 
 
159
  data_info_text = (
160
  f"### Data Information\n- Source: `{HF_DATASET_ID}`\n"
161
  f"- Status: {status_msg_from_load}\n"
 
174
 
175
  return current_df, load_success_flag, data_info_text, status_msg_ui
176
 
177
+ # --- MODIFIED: Updated controller to handle new logic and stats ---
178
  def ui_generate_plot_controller(metric_choice, tag_choice, k_orgs,
179
+ skip_cats_input, df_current_datasets, progress=gr.Progress()):
 
180
  if df_current_datasets is None or df_current_datasets.empty:
181
  return create_treemap(pd.DataFrame(), metric_choice), "Dataset data is not loaded."
182
 
183
+ progress(0.1, desc="Aggregating data...")
184
+ cats_to_skip = [cat.strip() for cat in skip_cats_input.split(',') if cat.strip()]
185
 
186
+ treemap_df = make_treemap_data(df_current_datasets, metric_choice, k_orgs, tag_choice, cats_to_skip)
 
187
 
188
  progress(0.7, desc="Generating plot...")
 
189
  title_labels = {"downloads": "Downloads (last 30 days)", "downloadsAllTime": "Downloads (All Time)", "likes": "Likes"}
190
  chart_title = f"HuggingFace Datasets - {title_labels.get(metric_choice, metric_choice)} by Organization"
191
  plotly_fig = create_treemap(treemap_df, metric_choice, chart_title)
192
 
193
+ # Update plot statistics to be more accurate for the new view
194
  if treemap_df.empty:
195
  plot_stats_md = "No data matches the selected filters. Please try different options."
196
  else:
 
197
  total_value_in_plot = treemap_df[metric_choice].sum()
198
+ plot_stats_md = (
199
+ f"## Plot Statistics\n- **Top Categories Shown**: {len(treemap_df):,}\n"
200
+ f"- **Total {metric_choice} in plot**: {int(total_value_in_plot):,}"
201
+ )
202
 
203
  return plotly_fig, plot_stats_md
204
 
205
+ # --- Event Wiring (no changes needed here) ---
 
 
206
  demo.load(
207
  fn=ui_load_data_controller,
208
  inputs=[],
209
  outputs=[datasets_data_state, loading_complete_state, data_info_md, status_message_md]
210
  )
211
 
 
212
  loading_complete_state.change(
213
  fn=_update_button_interactivity,
214
  inputs=loading_complete_state,
215
  outputs=generate_plot_button
216
  )
217
 
 
218
  generate_plot_button.click(
219
  fn=ui_generate_plot_controller,
220
  inputs=[count_by_dropdown, tag_filter_dropdown, top_k_dropdown,
221
+ skip_cats_textbox, datasets_data_state],
222
  outputs=[plot_output, status_message_md]
223
  )
224