import datasets
import polars as pl

BASE_REPO_ID = "ai-conferences/NeurIPS2025"
PAPER_PAGE_REPO_ID = "hysts-bot-data/paper-pages-slim"
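# Data sources: BASE_REPO_ID holds the NeurIPS 2025 paper list; PAPER_PAGE_REPO_ID is
# assumed to hold hf.co/papers metadata (upvotes, comments, linked repos and artifacts)
# keyed by arxiv_id.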


def format_author_claim_ratio(row: dict) -> str:
    n_linked_authors = row["n_linked_authors"]
    n_authors = row["n_authors"]
    if n_linked_authors is None or n_authors is None:
        return ""
    author_linked = "✅" if n_linked_authors > 0 else ""
    return f"{n_linked_authors}/{n_authors} {author_linked}".strip()
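# Example: {"n_linked_authors": 2, "n_authors": 12} -> "2/12 ✅";
# {"n_linked_authors": 0, "n_authors": 12} -> "0/12"; missing counts -> "".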


df_orig = datasets.load_dataset(BASE_REPO_ID, split="train").to_polars()
df_paper_page = (
    datasets.load_dataset(PAPER_PAGE_REPO_ID, split="train")
    .to_polars()
    .drop(["summary", "author_names", "ai_keywords"])
)
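# Attach paper-page metadata to the conference list and prefer the GitHub URL from the
# paper page ("github_2") when it exists, falling back to the conference metadata.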
df_orig = (
    df_orig.join(df_paper_page, on="arxiv_id", how="left", suffix="_2")
    .with_columns(
        [
            pl.when(pl.col("github_2").is_not_null())
            .then(pl.col("github_2"))
            .otherwise(pl.col("github"))
            .alias("github")
        ]
    )
    .drop(["github_2"])
)

# format authors
df_orig = df_orig.with_columns(pl.col("authors").list.join(", ").alias("authors_str"))

# format links
df_orig = df_orig.with_columns(
    [pl.format("[link]({})", pl.col(col)).fill_null("").alias(f"{col}_md") for col in ["project_page", "github"]]
)
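# pl.format propagates nulls, so papers without a project page or GitHub repo end up
# with an empty string rather than a broken "[link]()" cell.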
# format paper page link
df_orig = df_orig.with_columns(
    (pl.lit("https://huggingface.co/papers/") + pl.col("arxiv_id")).alias("paper_page")
).with_columns(pl.format("[{}]({})", pl.col("arxiv_id"), pl.col("paper_page")).fill_null("").alias("paper_page_md"))

# count authors
df_orig = df_orig.with_columns(pl.col("authors").list.len().alias("n_authors"))
df_orig = df_orig.with_columns(
    pl.col("author_usernames")
    .map_elements(lambda lst: sum(x is not None for x in lst) if lst is not None else None, return_dtype=pl.Int64)
    .alias("n_linked_authors")
)
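# Combine the two counts into a human-readable claim ratio, e.g. "3/7 ✅".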
df_orig = df_orig.with_columns(
    pl.struct(["n_linked_authors", "n_authors"])
    .map_elements(format_author_claim_ratio, return_dtype=pl.Utf8)
    .alias("claimed")
)

# TODO: Fix this once https://github.com/gradio-app/gradio/issues/10916 is fixed  # noqa: FIX002, TD002
# format numbers as strings
df_orig = df_orig.with_columns(
    [pl.col(col).cast(pl.Utf8).fill_null("").alias(col) for col in ["upvotes", "num_comments"]]
)
# format spaces, models, datasets
for repo_id_col, markdown_col, base_url in [
    ("space_ids", "Spaces", "https://huggingface.co/spaces/"),
    ("model_ids", "Models", "https://huggingface.co/"),
    ("dataset_ids", "Datasets", "https://huggingface.co/datasets/"),
]:
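    # `base_url` is captured by reference, but `with_columns` evaluates eagerly within each
    # iteration, so the lambda sees the current value (hence the `noqa: B023` below).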
    df_orig = df_orig.with_columns(
        pl.col(repo_id_col)
        .map_elements(
            lambda lst: "\n".join([f"[{x}]({base_url}{x})" for x in lst]) if lst is not None else None,  # noqa: B023
            return_dtype=pl.Utf8,
        )
        .fill_null("")
        .alias(markdown_col)
    )
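
# A minimal display sketch (assumption, not part of the original script): the markdown
# columns built above are presumably rendered by a Gradio table with per-column
# datatypes, e.g.
#
#   import gradio as gr
#   gr.Dataframe(
#       value=df_orig.select(["paper_page_md", "authors_str", "claimed", "Spaces"]).to_pandas(),
#       datatype=["markdown", "str", "str", "markdown"],
#   )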