Spaces:
Runtime error
Runtime error
| ## LIBRARIES ### | |
| ## Data | |
| import pandas as pd | |
| pd.options.display.float_format = '${:,.2f}'.format | |
| # Analysis | |
| # App & Visualization | |
| import streamlit as st | |
| from bokeh.models import CustomJS, ColumnDataSource, TextInput, DataTable, TableColumn | |
| from bokeh.plotting import figure | |
| from bokeh.transform import factor_cmap | |
| from bokeh.palettes import Category20c_20 | |
| from bokeh.layouts import column, row | |
| # utils | |
def datasets_explorer_viz(df: pd.DataFrame) -> None:
    """Render the interactive datasets scatter plot with search and selection table.

    The layout consists of a search box stacked above a scatter plot, with a
    table on the right. Points are coloured by ``task``. Box-selecting points
    lists them in the table; typing in the search box overlays every point
    whose ``dataset_id`` or ``task`` contains the query in red.

    Args:
        df: One row per dataset. The columns ``x``, ``y``, ``task`` and
            ``dataset_id`` are read by the plot and its callbacks.
    """
    plot_source = ColumnDataSource(df)

    # Start both auxiliary sources with the (empty) columns that the table and
    # the highlight glyph reference, so they are well-formed before any
    # callback has run.
    table_source = ColumnDataSource(data=dict(task=[], dataset_id=[]))
    selection_source = ColumnDataSource(
        data=dict(x=[], y=[], dataset_id=[], task=[])
    )

    tooltips = [("dataset_id", "@dataset_id"), ("task", "@task")]

    # NOTE(review): the palette holds 20 colours; with more than 20 distinct
    # tasks the extra ones presumably fall back to the mapper's nan colour.
    color = factor_cmap(
        'task',
        palette=Category20c_20,
        factors=df['task'].unique().tolist(),
    )

    # `width`/`height` replace `plot_width`/`plot_height`, which were removed
    # in Bokeh 3.0.
    p = figure(
        width=1000,
        height=800,
        tools="hover,wheel_zoom,pan,box_select",
        tooltips=tooltips,
        toolbar_location="above",
    )
    p.scatter(
        'x',
        'y',
        size=5,
        source=plot_source,
        alpha=0.8,
        marker='circle',
        fill_color=color,
        line_color=color,
        legend_field='task',
    )
    p.legend.location = "bottom_right"
    p.legend.click_policy = "mute"
    p.legend.label_text_font_size = "8pt"

    # Red overlay for search hits; added after the main layer so it is drawn
    # on top of it.
    p.scatter('x', 'y', source=selection_source, size=5, color='red')

    columns = [
        TableColumn(field="task", title="Task"),
        TableColumn(field="dataset_id", title="Dataset ID"),
    ]
    data_table = DataTable(source=table_source, columns=columns, width=350)

    # Copy the task / dataset_id of every selected point into the table.
    selection_to_table_js = """
        const inds = cb_obj.indices;
        const umapData = umap_source.data;
        // Assign a fresh object (rather than mutating in place) so the
        // change is picked up and the table redraws.
        table_source.data = {
            task: inds.map((i) => umapData['task'][i]),
            dataset_id: inds.map((i) => umapData['dataset_id'][i]),
        };
        table_source.change.emit();
    """
    plot_source.selected.js_on_change(
        'indices',
        CustomJS(
            args=dict(umap_source=plot_source, table_source=table_source),
            code=selection_to_table_js,
        ),
    )

    # Highlight every point whose dataset_id or task contains the query.
    search_js = """
        const plotData = plot_source.data;
        // Treat null/undefined cells as empty text so `.includes` never throws.
        const norm = (v) => (v == null ? '' : String(v).toLowerCase());
        const query = norm(cb_obj.value).trim();
        const hits = {x: [], y: [], dataset_id: [], task: []};
        // An empty query would match every row; clear the highlight instead.
        if (query.length > 0) {
            const ids = plotData['dataset_id'];
            const tasks = plotData['task'];
            for (let i = 0; i < ids.length; i++) {
                if (norm(ids[i]).includes(query) || norm(tasks[i]).includes(query)) {
                    hits.x.push(plotData['x'][i]);
                    hits.y.push(plotData['y'][i]);
                    hits.dataset_id.push(ids[i]);
                    hits.task.push(tasks[i]);
                }
            }
        }
        selection_source.data = hits;
        selection_source.change.emit();
    """
    text_input = TextInput(value="", title="Search")
    text_input.js_on_change(
        'value',
        CustomJS(
            args=dict(plot_source=plot_source, selection_source=selection_source),
            code=search_js,
        ),
    )

    st.bokeh_chart(row(column(text_input, p), data_table))
if __name__ == "__main__":
    ### STREAMLIT APP CONFIG ###
    st.set_page_config(layout="wide", page_title="Datasets Explorer")
    st.title('Interactive Datasets Explorer')

    # Collapsible usage notes shown above the plot.
    with st.expander("How to interact with the plot:"):
        st.markdown("* Each point in the plot represents a HF hub dataset categorized by their `task_id`.")
        st.markdown("* Every dataset is embedded using the [SPECTER](https://github.com/allenai/specter#advanced-training-your-own-model) embedding of its corresponding paper abstract.")
        st.markdown("* You can either search for a dataset or drag and select to peek into the cluster content.")

    ### LOAD DATA ###
    # The path is relative to the working directory the app is launched from.
    datasets_df = pd.read_parquet('./assets/data/datasets_df.parquet')

    # The emoji was mis-decoded ("π€") in the original text; restored here.
    st.warning("Hugging Face 🤗 Datasets Explorer")
    datasets_explorer_viz(datasets_df)