import json
from functools import partial

import duckdb
import gradio as gr
import pandas as pd
import pyarrow as pa
import pyarrow.compute as pc
import requests
from huggingface_hub import HfApi
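
# Loading codes from these reader functions carry parquet glob patterns
# (per subset/split) that DuckDB can query from hf:// paths directly.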
READ_PARQUET_FUNCTIONS = ("dd.read_parquet", "pd.read_parquet")
EMPTY_TABLE = pa.Table.from_pylist([{str(i): "" for i in range(4)}] * 10)
EMPTY_DF: pd.DataFrame = EMPTY_TABLE.to_pandas()
NUM_ROWS = 20
MAX_NUM_COLUMNS = 20
NUM_TRENDING_DATASETS = 10
NUM_USER_DATASETS = 10

css = """
.transparent-dropdown, .transparent-dropdown .container .wrap, .transparent-accordion {
    background: var(--body-background-fill);
}
input {
    -webkit-user-select: none;
    -moz-user-select: none;
    -ms-user-select: none;
    user-select: none;
}
.cell-menu-button {
    z-index: -1;
}
thead {
    display: none;
}
.secondary-wrap:has(input[aria-expanded="true"]) {
    background: var(--table-odd-background-fill);
}
.secondary-wrap:has(input[aria-expanded="true"])::after {
    content: '↵';
    margin-right: var(--size-10);
    border-width: 1px;
    border-color: var(--block-border-color);
    border-radius: .23rem;
    background-color: #141c2e;
    padding-left: 2px;
    font-size: .75rem;
    color: var(--block-title-text-color);
}
.gradio-container {
    padding: var(--size-4) 0 !important;
    max-width: 98% !important;
}
"""

js = """
function load() {
    // Make the DataFrame cells read-only
    const MutationObserver = window.MutationObserver || window.WebKitMutationObserver;
    const observer = new MutationObserver(function(mutations, observer) {
        // fired whenever the DOM mutates
        document.querySelectorAll('.readonly-dataframe div .table-wrap button svelte-virtual-table-viewport table tbody tr td .cell-wrap input').forEach(i => i.setAttribute("readonly", "true"));
    });
    // observe the whole document for added/removed nodes
    observer.observe(document, {
        subtree: true,
        childList: true
    });
    // Run the query when Enter is pressed in a transform dropdown
    document.querySelectorAll("input").forEach(i => {
        if (i.closest(".transform_dropdown")) {
            i.onkeydown = (event) => {
                if (event.code === "Enter") {
                    document.getElementById("run_button").click();
                }
            }
        }
    })
}
"""
text_functions_df = pd.read_csv("text_functions.tsv", delimiter="\t")
date_functions_df = pd.read_csv("date_functions.tsv", delimiter="\t")
list_functions_df = pd.read_csv("list_functions.tsv", delimiter="\t")
numeric_functions_df = pd.read_csv("numeric_functions.tsv", delimiter="\t")
time_functions_df = pd.read_csv("time_functions.tsv", delimiter="\t")
timestamp_functions_df = pd.read_csv("timestamp_functions.tsv", delimiter="\t")
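
# Thin wrapper around duckdb.sql(); DuckDB resolves hf://datasets/... paths on its
# own (via its httpfs support), so no explicit download step is needed.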
def duckdb_sql(query: str) -> duckdb.DuckDBPyRelation:
    return duckdb.sql(query)

def prepare_function(func: str, placeholders: list[str], column_name: str) -> str | None:
    """Substitute a placeholder argument in a function signature with a column name.

    E.g. prepare_function("lower(string)", ["string"], "title") returns "lower(title)".
    Returns None if the signature contains none of the placeholders.
    """
    prepared_func = func.split("(", 1)
    for placeholder in placeholders:
        if placeholder in prepared_func[-1]:
            prepared_func[-1] = prepared_func[-1].replace(placeholder, column_name, 1)
            return "(".join(prepared_func)
    return None
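
# Gradio's DataFrame renders plain strings most predictably, so every cell is
# stringified before display (nested lists/structs included).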
def prettify_df(df: pd.DataFrame):
    return df.apply(lambda s: s.apply(str))
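
# Map each column to a list of ready-to-use DuckDB expressions based on its Arrow type:
# numeric/text/date/list/time/timestamp columns get the matching function family,
# JSON-looking text columns additionally get a CAST to the parsed type, and struct
# columns get their subfield accessors.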
def get_prepared_functions_from_table(table: pa.Table) -> dict[str, list[str]]:
    prepared_functions = {}
    for field in table.schema:
        if pa.types.is_integer(field.type) or pa.types.is_floating(field.type):
            prepared_functions[field.name] = [prepare_function(numeric_func, ["x"], field.name) for numeric_func in numeric_functions_df.Name]
        elif pa.types.is_string(field.type):
            prepared_functions[field.name] = [prepare_function(text_func, ["string"], field.name) for text_func in text_functions_df.Name]
            # try parsing JSON-looking columns
            if pc.all(pc.starts_with(table[field.name], "{")).as_py() or pc.all(pc.starts_with(table[field.name], "[")).as_py():
                try:
                    json_parsed_table = pa.Table.from_pylist([{field.name: json.loads(row)} for row in table[field.name].to_pylist()])
                    parsed_type = str(duckdb.from_arrow(json_parsed_table).dtypes[0])
                    prepared_functions[field.name] = [f"CAST({field.name} as {parsed_type})"] + prepared_functions[field.name]
                except Exception:
                    pass
        elif pa.types.is_date(field.type):
            prepared_functions[field.name] = [prepare_function(date_func, ["startdate", "date"], field.name) for date_func in date_functions_df.Name]
        elif pa.types.is_list(field.type):
            prepared_functions[field.name] = [prepare_function(list_func, ["list"], field.name) for list_func in list_functions_df.Name]
        elif pa.types.is_time(field.type):
            prepared_functions[field.name] = [prepare_function(time_func, ["starttime", "time"], field.name) for time_func in time_functions_df.Name]
        elif pa.types.is_timestamp(field.type):
            prepared_functions[field.name] = [prepare_function(timestamp_func, ["startdate", "timestamp"], field.name) for timestamp_func in timestamp_functions_df.Name]
        elif pa.types.is_struct(field.type):
            prepared_functions[field.name] = [f"{field.name}.{subfield.name}" for subfield in field.type.fields]
        else:
            prepared_functions[field.name] = []
        prepared_functions[field.name] = [prepared_function for prepared_function in prepared_functions[field.name] if prepared_function]
    return prepared_functions
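
# For example, on a hypothetical table with columns (title: string, score: int64), this
# yields suggestions like {"title": ["upper(title)", ...], "score": ["abs(score)", ...]},
# assuming signatures like "upper(string)" and "abs(x)" appear in the bundled TSVs.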
| with gr.Blocks(css=css, js=js) as demo: | |
| loading_codes_json = gr.JSON(visible=False) | |
| dataset_subset_split_textbox = gr.Textbox(visible=False) | |
| input_table_state = gr.State() | |
| run_button = gr.Button(visible=False, elem_id="run_button") | |
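    # run_button stays hidden: the JS snippet above clicks it whenever Enter is
    # pressed inside a transform dropdown.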
    with gr.Row():
        with gr.Column():
| gr.Markdown("# <p style='text-align:center;'>π₯ DuckDB Spreadsheets π</p>\n\n<p style='text-align:center;'>Edit any dataset on Hugging Face (full list <a href='https://huggingface.co/datasets' target='_blank'>here</a>) using DuckDB functions (documentation <a href='https://duckdb.org/docs/sql/functions/overview' target='_blank'>here</a>)</p>") | |
            with gr.Group():
                with gr.Row():
                    dataset_dropdown = gr.Dropdown(label="Dataset", allow_custom_value=True, scale=10)
                    subset_dropdown = gr.Dropdown(info="Subset", allow_custom_value=True, show_label=False, visible=False, elem_classes="transparent-dropdown")
                    split_dropdown = gr.Dropdown(info="Split", allow_custom_value=True, show_label=False, visible=False, elem_classes="transparent-dropdown")
                    gr.LoginButton()
                with gr.Row():
                    transform_dropdowns = [gr.Dropdown(choices=[column_name] + [prepare_function(text_func, ["string"], column_name) for text_func in text_functions_df.Name if "string" in text_func], value=column_name, container=False, interactive=True, allow_custom_value=True, visible=True, elem_classes="transform_dropdown") for column_name in EMPTY_DF.columns]
                    transform_dropdowns += [gr.Dropdown(choices=[None], value=None, container=False, interactive=True, allow_custom_value=True, visible=False, elem_classes="transform_dropdown") for _ in range(MAX_NUM_COLUMNS - len(transform_dropdowns))]
                dataframe = gr.DataFrame(EMPTY_DF, column_widths=[f"{1/len(EMPTY_DF.columns):.0%}"] * len(EMPTY_DF.columns), interactive=True, elem_classes="readonly-dataframe")
            with gr.Accordion("Show DuckDB SQL command", open=False, elem_classes="transparent-accordion"):
                code_markdown = gr.Markdown()
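
    # The dataset viewer API lists, per library, "loading codes" with a config_name and
    # a split -> parquet glob pattern mapping; those patterns are what gets queried below.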
    def show_subset_dropdown(dataset: str):
        if not dataset or "/" not in dataset.strip().strip("/"):
            # Not a valid "user/dataset" id: empty dropdown spec and no loading codes
            return dict(choices=[], value="", visible=False), []
        resp = requests.get(f"https://datasets-server.huggingface.co/compatible-libraries?dataset={dataset}", timeout=3).json()
        loading_codes = ([lib["loading_codes"] for lib in resp.get("libraries", []) if lib["function"] in READ_PARQUET_FUNCTIONS] or [[]])[0] or []
        subsets = [loading_code["config_name"] for loading_code in loading_codes]
        subset = (subsets or [""])[0]
        return dict(choices=subsets, value=subset, visible=len(subsets) > 1, key=hash(str(loading_codes))), loading_codes

    def show_split_dropdown(subset: str, loading_codes: list[dict]):
        splits = ([list(loading_code["arguments"]["splits"]) for loading_code in loading_codes if loading_code["config_name"] == subset] or [[]])[0]
        split = (splits or [""])[0]
        return dict(choices=splits, value=split, visible=len(splits) > 1, key=hash(str(loading_codes) + subset))

    def show_input_dataframe(dataset: str, subset: str, split: str, loading_codes: list[dict]):
        pattern = ([loading_code["arguments"]["splits"][split] for loading_code in loading_codes if loading_code["config_name"] == subset] or [None])[0]
        if dataset and subset and split and pattern:
            table = duckdb_sql(f"SELECT * FROM 'hf://datasets/{dataset}/{pattern}' LIMIT {NUM_ROWS}").arrow()
        else:
            table = EMPTY_TABLE
        prepared_functions = get_prepared_functions_from_table(table)
        new_transform_dropdowns = [dict(choices=[column_name] + prepared_functions[column_name], value=column_name, container=False, interactive=True, allow_custom_value=True, visible=True) for column_name in table.column_names]
        new_transform_dropdowns += [dict(choices=[None], value=None, container=False, interactive=True, allow_custom_value=True, visible=False) for _ in range(MAX_NUM_COLUMNS - len(new_transform_dropdowns))]
        df = table.to_pandas()
        return [table, dict(value=prettify_df(df), column_widths=[f"{1/len(df.columns):.0%}"] * len(df.columns))] + new_transform_dropdowns

    def set_dataframe(dataset: str, subset: str, split: str, loading_codes: list[dict], input_table: pa.Table, df: pd.DataFrame, *transforms, show_warning=True):
        try:
            # DuckDB's replacement scan resolves `input_table` to the local Arrow table
            table = duckdb.sql(f"SELECT {', '.join(transform for transform in transforms if transform)} FROM input_table;").arrow()
        except Exception as e:
            if show_warning:
                gr.Warning(f"{type(e).__name__}: {e}")
            return {
                dataframe: df
            }
        prepared_functions = get_prepared_functions_from_table(table)
        new_transform_dropdowns = [dict(choices=list({original_column_name: None, column_name: None}) + prepared_functions[column_name], value=column_name, container=False, interactive=True, allow_custom_value=True, visible=True) for original_column_name, column_name in zip(input_table.column_names, table.column_names)]
        new_transform_dropdowns += [dict(choices=[None], value=None, container=False, interactive=True, allow_custom_value=True, visible=False) for _ in range(MAX_NUM_COLUMNS - len(new_transform_dropdowns))]
        pattern = ([loading_code["arguments"]["splits"][split] for loading_code in loading_codes if loading_code["config_name"] == subset] or [None])[0]
        return {
            dataframe: prettify_df(table.to_pandas()),
            **dict(zip(transform_dropdowns, [gr.Dropdown(**new_transform_dropdown) for new_transform_dropdown in new_transform_dropdowns])),
            code_markdown: (
                "```sql\n"
                + f"SELECT {', '.join(new_transform_dropdown['value'] for new_transform_dropdown in new_transform_dropdowns if new_transform_dropdown['value'])} "
                + f"FROM 'hf://datasets/{dataset}/{pattern}' LIMIT {NUM_ROWS};"
                + "\n```"
            ) if pattern else "",
        }

    for transform_dropdown in transform_dropdowns:
        transform_dropdown.select(partial(set_dataframe, show_warning=False), inputs=[dataset_dropdown, subset_dropdown, split_dropdown, loading_codes_json, input_table_state, dataframe] + transform_dropdowns, outputs=[dataframe, code_markdown] + transform_dropdowns)
    run_button.click(set_dataframe, inputs=[dataset_dropdown, subset_dropdown, split_dropdown, loading_codes_json, input_table_state, dataframe] + transform_dropdowns, outputs=[dataframe, code_markdown] + transform_dropdowns)
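
    # The gr.Request and gr.OAuthToken parameters below are injected by Gradio at
    # call time based on their type annotations, so no explicit inputs are needed.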
    def _fetch_datasets(request: gr.Request, oauth_token: gr.OAuthToken | None):
        api = HfApi(token=oauth_token.token if oauth_token else None)
        datasets = list(api.list_datasets(limit=NUM_TRENDING_DATASETS, sort="trendingScore", direction=-1, filter=["format:parquet"]))
        if oauth_token and (user := api.whoami().get("name")):
            datasets += list(api.list_datasets(limit=NUM_USER_DATASETS, sort="trendingScore", direction=-1, filter=["format:parquet"], author=user))
        dataset = request.query_params.get("dataset") or datasets[0].id
        subsets, loading_codes = show_subset_dropdown(dataset)
        splits = show_split_dropdown(subsets["value"], loading_codes)
        input_table, input_dataframe, *new_transform_dropdowns = show_input_dataframe(dataset, subsets["value"], splits["value"], loading_codes)
        pattern = ([loading_code["arguments"]["splits"][splits["value"]] for loading_code in loading_codes if loading_code["config_name"] == subsets["value"]] or [None])[0]
        return {
            dataset_dropdown: gr.Dropdown(choices=[d.id for d in datasets], value=dataset),
            loading_codes_json: loading_codes,
            subset_dropdown: gr.Dropdown(**subsets),
            split_dropdown: gr.Dropdown(**splits),
            input_table_state: input_table,
            dataframe: gr.DataFrame(**input_dataframe),
            **dict(zip(transform_dropdowns, [gr.Dropdown(**new_transform_dropdown) for new_transform_dropdown in new_transform_dropdowns])),
            code_markdown: (
                "```sql\n"
                + f"SELECT {', '.join(new_transform_dropdown['value'] for new_transform_dropdown in new_transform_dropdowns if new_transform_dropdown['value'])} "
                + f"FROM 'hf://datasets/{dataset}/{pattern}' LIMIT {NUM_ROWS};"
                + "\n```"
            ) if pattern else "",
        }

    def _show_subset_dropdown(dataset: str):
        subsets, loading_codes = show_subset_dropdown(dataset)
        splits = show_split_dropdown(subsets["value"], loading_codes)
        input_table, input_dataframe, *new_transform_dropdowns = show_input_dataframe(dataset, subsets["value"], splits["value"], loading_codes)
        pattern = ([loading_code["arguments"]["splits"][splits["value"]] for loading_code in loading_codes if loading_code["config_name"] == subsets["value"]] or [None])[0]
        return {
            loading_codes_json: loading_codes,
            subset_dropdown: gr.Dropdown(**subsets),
            split_dropdown: gr.Dropdown(**splits),
            input_table_state: input_table,
            dataframe: gr.DataFrame(**input_dataframe),
            **dict(zip(transform_dropdowns, [gr.Dropdown(**new_transform_dropdown) for new_transform_dropdown in new_transform_dropdowns])),
            code_markdown: (
                "```sql\n"
                + f"SELECT {', '.join(new_transform_dropdown['value'] for new_transform_dropdown in new_transform_dropdowns if new_transform_dropdown['value'])} "
                + f"FROM 'hf://datasets/{dataset}/{pattern}' LIMIT {NUM_ROWS};"
                + "\n```"
            ) if pattern else "",
        }

    def _show_split_dropdown(dataset: str, subset: str, loading_codes: list[dict]):
        splits = show_split_dropdown(subset, loading_codes)
        input_table, input_dataframe, *new_transform_dropdowns = show_input_dataframe(dataset, subset, splits["value"], loading_codes)
        pattern = ([loading_code["arguments"]["splits"][splits["value"]] for loading_code in loading_codes if loading_code["config_name"] == subset] or [None])[0]
        return {
            split_dropdown: gr.Dropdown(**splits),
            input_table_state: input_table,
            dataframe: gr.DataFrame(**input_dataframe),
            **dict(zip(transform_dropdowns, [gr.Dropdown(**new_transform_dropdown) for new_transform_dropdown in new_transform_dropdowns])),
            code_markdown: (
                "```sql\n"
                + f"SELECT {', '.join(new_transform_dropdown['value'] for new_transform_dropdown in new_transform_dropdowns if new_transform_dropdown['value'])} "
                + f"FROM 'hf://datasets/{dataset}/{pattern}' LIMIT {NUM_ROWS};"
                + "\n```"
            ) if pattern else "",
        }

    def _show_input_dataframe(dataset: str, subset: str, split: str, loading_codes: list[dict]) -> dict:
        input_table, input_dataframe, *new_transform_dropdowns = show_input_dataframe(dataset, subset, split, loading_codes)
        pattern = ([loading_code["arguments"]["splits"][split] for loading_code in loading_codes if loading_code["config_name"] == subset] or [None])[0]
        return {
            input_table_state: input_table,
            dataframe: gr.DataFrame(**input_dataframe),
            **dict(zip(transform_dropdowns, [gr.Dropdown(**new_transform_dropdown) for new_transform_dropdown in new_transform_dropdowns])),
            code_markdown: (
                "```sql\n"
                + f"SELECT {', '.join(new_transform_dropdown['value'] for new_transform_dropdown in new_transform_dropdowns if new_transform_dropdown['value'])} "
                + f"FROM 'hf://datasets/{dataset}/{pattern}' LIMIT {NUM_ROWS};"
                + "\n```"
            ) if pattern else "",
        }
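
    # NOTE: the snippet as recovered never wires these handlers to the UI, so nothing
    # would load without it. A minimal sketch of the assumed wiring follows (the
    # output lists are inferred from each handler's return dict):
    demo.load(
        _fetch_datasets,
        inputs=None,
        outputs=[dataset_dropdown, loading_codes_json, subset_dropdown, split_dropdown, input_table_state, dataframe, code_markdown] + transform_dropdowns,
    )
    dataset_dropdown.select(
        _show_subset_dropdown,
        inputs=[dataset_dropdown],
        outputs=[loading_codes_json, subset_dropdown, split_dropdown, input_table_state, dataframe, code_markdown] + transform_dropdowns,
    )
    subset_dropdown.select(
        _show_split_dropdown,
        inputs=[dataset_dropdown, subset_dropdown, loading_codes_json],
        outputs=[split_dropdown, input_table_state, dataframe, code_markdown] + transform_dropdowns,
    )
    split_dropdown.select(
        _show_input_dataframe,
        inputs=[dataset_dropdown, subset_dropdown, split_dropdown, loading_codes_json],
        outputs=[input_table_state, dataframe, code_markdown] + transform_dropdowns,
    )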

if __name__ == "__main__":
    demo.launch()