hysts (HF Staff) committed
Commit 6f1aa4e · 1 Parent(s): 95c535e
Files changed (14)
  1. .gitignore +162 -0
  2. .pre-commit-config.yaml +33 -0
  3. .python-version +1 -0
  4. .vscode/extensions.json +8 -0
  5. .vscode/settings.json +17 -0
  6. README.md +3 -3
  7. app.py +204 -0
  8. app_mcp.py +127 -0
  9. pyproject.toml +56 -0
  10. requirements.txt +365 -0
  11. search.py +30 -0
  12. style.css +4 -0
  13. table.py +82 -0
  14. uv.lock +0 -0
.gitignore ADDED
@@ -0,0 +1,162 @@
+ .gradio/
+
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[cod]
+ *$py.class
+
+ # C extensions
+ *.so
+
+ # Distribution / packaging
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ share/python-wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .nox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ *.py,cover
+ .hypothesis/
+ .pytest_cache/
+ cover/
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Django stuff:
+ *.log
+ local_settings.py
+ db.sqlite3
+ db.sqlite3-journal
+
+ # Flask stuff:
+ instance/
+ .webassets-cache
+
+ # Scrapy stuff:
+ .scrapy
+
+ # Sphinx documentation
+ docs/_build/
+
+ # PyBuilder
+ .pybuilder/
+ target/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # IPython
+ profile_default/
+ ipython_config.py
+
+ # pyenv
+ # For a library or package, you might want to ignore these files since the code is
+ # intended to run in multiple environments; otherwise, check them in:
+ # .python-version
+
+ # pipenv
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
+ # install all needed dependencies.
+ #Pipfile.lock
+
+ # poetry
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
+ # commonly ignored for libraries.
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+ #poetry.lock
+
+ # pdm
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+ #pdm.lock
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+ # in version control.
+ # https://pdm.fming.dev/#use-with-ide
+ .pdm.toml
+
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+ __pypackages__/
+
+ # Celery stuff
+ celerybeat-schedule
+ celerybeat.pid
+
+ # SageMath parsed files
+ *.sage.py
+
+ # Environments
+ .env
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # Spyder project settings
+ .spyderproject
+ .spyproject
+
+ # Rope project settings
+ .ropeproject
+
+ # mkdocs documentation
+ /site
+
+ # mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # Pyre type checker
+ .pyre/
+
+ # pytype static type analyzer
+ .pytype/
+
+ # Cython debug symbols
+ cython_debug/
+
+ # PyCharm
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
+ #.idea/
.pre-commit-config.yaml ADDED
@@ -0,0 +1,33 @@
+ repos:
+   - repo: https://github.com/pre-commit/pre-commit-hooks
+     rev: v6.0.0
+     hooks:
+       - id: check-executables-have-shebangs
+       - id: check-json
+       - id: check-merge-conflict
+       - id: check-shebang-scripts-are-executable
+       - id: check-toml
+       - id: check-yaml
+       - id: end-of-file-fixer
+       - id: mixed-line-ending
+         args: ["--fix=lf"]
+       - id: requirements-txt-fixer
+       - id: trailing-whitespace
+   - repo: https://github.com/astral-sh/ruff-pre-commit
+     rev: v0.14.2
+     hooks:
+       - id: ruff-check
+         args: ["--fix"]
+       - id: ruff-format
+   - repo: https://github.com/pre-commit/mirrors-mypy
+     rev: v1.18.2
+     hooks:
+       - id: mypy
+         args: ["--ignore-missing-imports"]
+         additional_dependencies:
+           [
+             "types-python-slugify",
+             "types-pytz",
+             "types-PyYAML",
+             "types-requests",
+           ]
.python-version ADDED
@@ -0,0 +1 @@
+ 3.10
.vscode/extensions.json ADDED
@@ -0,0 +1,8 @@
+ {
+   "recommendations": [
+     "ms-python.python",
+     "charliermarsh.ruff",
+     "streetsidesoftware.code-spell-checker",
+     "tamasfe.even-better-toml"
+   ]
+ }
.vscode/settings.json ADDED
@@ -0,0 +1,17 @@
+ {
+   "editor.formatOnSave": true,
+   "files.insertFinalNewline": false,
+   "[python]": {
+     "editor.defaultFormatter": "charliermarsh.ruff",
+     "editor.formatOnType": true,
+     "editor.codeActionsOnSave": {
+       "source.fixAll.ruff": "explicit",
+       "source.organizeImports": "explicit"
+     }
+   },
+   "[jupyter]": {
+     "files.insertFinalNewline": false
+   },
+   "notebook.output.scrolling": true,
+   "notebook.formatOnSave.enabled": true
+ }
README.md CHANGED
@@ -1,8 +1,8 @@
  ---
- title: NeurIPS2025
+ title: NeurIPS 2025
- emoji: 🐠
+ emoji:
  colorFrom: red
- colorTo: gray
+ colorTo: purple
  sdk: gradio
  sdk_version: 5.49.1
  app_file: app.py
app.py ADDED
@@ -0,0 +1,204 @@
+ #!/usr/bin/env python
+
+ import gradio as gr
+ import polars as pl
+ from gradio_modal import Modal
+
+ from app_mcp import demo as demo_mcp
+ from search import search
+ from table import df_orig
+
+ DESCRIPTION = "# NeurIPS 2025"
+
+ df_main = df_orig.select(
+     "title",
+     "authors_str",
+     "paper_page_md",
+     "upvotes",
+     "num_comments",
+     "project_page_md",
+     "github_md",
+     "Spaces",
+     "Models",
+     "Datasets",
+     "claimed",
+     "abstract",
+     "paper_id",
+ )
+
+ # TODO: remove this once https://github.com/gradio-app/gradio/issues/10916 https://github.com/gradio-app/gradio/issues/11001 https://github.com/gradio-app/gradio/issues/11002 are fixed # noqa: TD002, FIX002
+ df_main = df_main.with_columns(
+     [
+         pl.when(pl.col(col) == "").then(None).otherwise(pl.col(col)).cast(pl.Int64).fill_null(0).alias(col)
+         for col in ["upvotes", "num_comments"]
+     ]
+ )
+
+ df_main = df_main.rename(
+     {
+         "title": "Title",
+         "authors_str": "Authors",
+         "paper_page_md": "Paper page",
+         "upvotes": "👍",
+         "num_comments": "💬",
+         "project_page_md": "Project page",
+         "github_md": "GitHub",
+     }
+ )
+
+ COLUMN_INFO = {
+     "Title": ("str", "40%"),
+     "Authors": ("str", "20%"),
+     "Paper page": ("markdown", "135px"),
+     "👍": ("number", "50px"),
+     "💬": ("number", "50px"),
+     "Project page": ("markdown", None),
+     "GitHub": ("markdown", None),
+     "Spaces": ("markdown", None),
+     "Models": ("markdown", None),
+     "Datasets": ("markdown", None),
+     "claimed": ("markdown", None),
+ }
+
+
+ DEFAULT_COLUMNS = [
+     "Title",
+     "Paper page",
+     "👍",
+     "💬",
+     "Project page",
+     "GitHub",
+     "Spaces",
+     "Models",
+     "Datasets",
+ ]
+
+
+ def update_num_papers(df: pl.DataFrame) -> str:
+     if "claimed" in df.columns:
+         return f"{len(df)} / {len(df_main)} ({df.select(pl.col('claimed').str.contains('✅').sum()).item()} claimed)"
+     return f"{len(df)} / {len(df_main)}"
+
+
+ def update_df(
+     search_query: str,
+     candidate_pool_size: int,
+     num_results: int,
+     column_names: list[str],
+ ) -> gr.Dataframe:
+     if num_results > candidate_pool_size:
+         raise gr.Error("Number of results must be less than or equal to candidate pool size", print_exception=False)
+
+     df = df_main.clone()
+     column_names = ["Title", *column_names]
+
+     if search_query:
+         results = search(search_query, candidate_pool_size, num_results)
+         if not results:
+             df = df.head(0)
+         else:
+             df = pl.DataFrame(results).join(df, on="paper_id", how="inner")
+             df = df.sort("ce_score", descending=True).drop("ce_score")
+
+     sorted_column_names = [col for col in COLUMN_INFO if col in column_names]
+     df = df.select(sorted_column_names)
+     return gr.Dataframe(
+         value=df,
+         datatype=[COLUMN_INFO[col][0] for col in sorted_column_names],
+         column_widths=[COLUMN_INFO[col][1] for col in sorted_column_names],
+     )
+
+
+ def df_row_selected(
+     evt: gr.SelectData,
+ ) -> tuple[
+     Modal,
+     gr.Textbox,  # title
+     gr.Textbox,  # abstract
+ ]:
+     if evt.index[1] != 0:
+         return Modal(), gr.Textbox(), gr.Textbox()
+
+     title = evt.row_value[0]
+     row = df_main.filter(pl.col("Title") == title)
+     return (
+         Modal(visible=True),
+         gr.Textbox(value=row["Title"].item()),  # title
+         gr.Textbox(value=row["abstract"].item()),  # abstract
+     )
+
+
+ with gr.Blocks(css_paths="style.css") as demo:
+     gr.Markdown(DESCRIPTION)
+     search_query = gr.Textbox(label="Search", submit_btn=True, show_label=False, placeholder="Search...")
+     with gr.Accordion(label="Advanced Search Options", open=False) as advanced_search_options:
+         with gr.Row():
+             candidate_pool_size = gr.Slider(label="Candidate Pool Size", minimum=1, maximum=600, step=1, value=200)
+             num_results = gr.Slider(label="Number of Results", minimum=1, maximum=400, step=1, value=100)
+
+     column_names = gr.CheckboxGroup(
+         label="Columns",
+         choices=[col for col in COLUMN_INFO if col != "Title"],
+         value=[col for col in DEFAULT_COLUMNS if col != "Title"],
+     )
+
+     num_papers = gr.Textbox(label="Number of papers", value=update_num_papers(df_orig), interactive=False)
+
+     df = gr.Dataframe(
+         value=df_main,
+         datatype=list(COLUMN_INFO.values()),
+         type="polars",
+         row_count=(0, "dynamic"),
+         show_row_numbers=True,
+         interactive=False,
+         max_height=1000,
+         elem_id="table",
+         column_widths=[COLUMN_INFO[col][1] for col in COLUMN_INFO],
+     )
+     with Modal(visible=False, elem_id="abstract-modal") as abstract_modal:
+         title = gr.Textbox(label="Title")
+         abstract = gr.Textbox(label="Abstract")
+
+     df.select(fn=df_row_selected, outputs=[abstract_modal, title, abstract])
+
+     inputs = [
+         search_query,
+         candidate_pool_size,
+         num_results,
+         column_names,
+     ]
+     gr.on(
+         triggers=[
+             search_query.submit,
+             column_names.input,
+         ],
+         fn=update_df,
+         inputs=inputs,
+         outputs=df,
+         api_name=False,
+     ).then(
+         fn=update_num_papers,
+         inputs=df,
+         outputs=num_papers,
+         queue=False,
+         api_name=False,
+     )
+     demo.load(
+         fn=update_df,
+         inputs=inputs,
+         outputs=df,
+         api_name=False,
+     ).then(
+         fn=update_num_papers,
+         inputs=df,
+         outputs=num_papers,
+         queue=False,
+         api_name=False,
+     )
+
+     with gr.Row(visible=False):
+         demo_mcp.render()
+
+
+ if __name__ == "__main__":
+     demo.launch(mcp_server=True)
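The TODO workaround near the top of app.py (coercing the string-typed upvotes and num_comments columns back to integers) can be sanity-checked on a toy frame. A minimal sketch, separate from the committed code; the column values are illustrative:

import polars as pl

# Empty strings stand in for missing counts, as produced by table.py.
toy = pl.DataFrame({"upvotes": ["3", "", "12"]})
toy = toy.with_columns(
    pl.when(pl.col("upvotes") == "")
    .then(None)
    .otherwise(pl.col("upvotes"))
    .cast(pl.Int64)
    .fill_null(0)
    .alias("upvotes")
)
print(toy["upvotes"].to_list())  # [3, 0, 12]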
app_mcp.py ADDED
@@ -0,0 +1,127 @@
+ import gradio as gr
+ import polars as pl
+
+ from search import search
+ from table import df_orig
+
+ COLUMNS_MCP = [
+     "title",
+     "authors",
+     "abstract",
+     "arxiv_id",
+     "paper_page",
+     "space_ids",
+     "model_ids",
+     "dataset_ids",
+     "upvotes",
+     "num_comments",
+     "project_page",
+     "github",
+     "row_index",
+ ]
+ DEFAULT_COLUMNS_MCP = [
+     "title",
+     "authors",
+     "abstract",
+     "arxiv_id",
+     "project_page",
+     "github",
+     "row_index",
+ ]
+
+ df_mcp = df_orig.rename({"paper_id": "row_index"}).select(COLUMNS_MCP)
+
+
+ def search_papers(
+     search_query: str,
+     candidate_pool_size: int,
+     num_results: int,
+     columns: list[str],
+ ) -> list[dict]:
+     """Searches NeurIPS 2025 papers relevant to a user query in English.
+
+     This function performs a semantic search over NeurIPS 2025 papers.
+     It uses a dual-stage retrieval process:
+     - First, it retrieves `candidate_pool_size` papers using dense vector similarity.
+     - Then, it re-ranks them with a cross-encoder model to select the top `num_results` most relevant papers.
+     - The search results are returned as a list of dictionaries.
+
+     Note:
+         The search query must be written in English. Queries in other languages are not supported.
+
+     Args:
+         search_query (str): The natural language query input by the user. Must be in English.
+         candidate_pool_size (int): Number of candidate papers to retrieve using the dense vector model.
+         num_results (int): Final number of top-ranked papers to return after re-ranking.
+         columns (list[str]): The columns to select from the DataFrame.
+
+     Returns:
+         list[dict]: A list of dictionaries of the top-ranked papers matching the query, sorted by relevance.
+     """
+     if not search_query:
+         raise ValueError("Search query cannot be empty")
+     if num_results > candidate_pool_size:
+         raise ValueError("Number of results must be less than or equal to candidate pool size")
+
+     df = df_mcp.clone()
+     results = search(search_query, candidate_pool_size, num_results)
+     df = pl.DataFrame(results).rename({"paper_id": "row_index"}).join(df, on="row_index", how="inner")
+     df = df.sort("ce_score", descending=True)
+     return df.select(columns).to_dicts()
+
+
+ def get_metadata(row_index: int) -> dict:
+     """Returns a dictionary of metadata for a NeurIPS 2025 paper at the given table row index.
+
+     Args:
+         row_index (int): The index of the paper in the internal paper list table.
+
+     Returns:
+         dict: A dictionary containing metadata for the corresponding paper.
+     """
+     return df_mcp.filter(pl.col("row_index") == row_index).to_dicts()[0]
+
+
+ def get_table(columns: list[str]) -> list[dict]:
+     """Returns a list of dictionaries of all NeurIPS 2025 papers.
+
+     Args:
+         columns (list[str]): The columns to select from the DataFrame.
+
+     Returns:
+         list[dict]: A list of dictionaries of all NeurIPS 2025 papers.
+     """
+     return df_mcp.select(columns).to_dicts()
+
+
+ with gr.Blocks() as demo:
+     search_query = gr.Textbox(label="Search", submit_btn=True)
+     candidate_pool_size = gr.Slider(label="Candidate Pool Size", minimum=1, maximum=500, step=1, value=200)
+     num_results = gr.Slider(label="Number of Results", minimum=1, maximum=400, step=1, value=100)
+     column_names = gr.CheckboxGroup(label="Columns", choices=COLUMNS_MCP, value=DEFAULT_COLUMNS_MCP)
+     row_index = gr.Slider(label="Row Index", minimum=0, maximum=len(df_mcp) - 1, step=1, value=0)
+
+     out = gr.JSON()
+
+     search_papers_btn = gr.Button("Search Papers")
+     get_metadata_btn = gr.Button("Get Metadata")
+     get_table_btn = gr.Button("Get Table")
+
+     search_papers_btn.click(
+         fn=search_papers,
+         inputs=[search_query, candidate_pool_size, num_results, column_names],
+         outputs=out,
+     )
+     get_metadata_btn.click(
+         fn=get_metadata,
+         inputs=row_index,
+         outputs=out,
+     )
+     get_table_btn.click(
+         fn=get_table,
+         inputs=column_names,
+         outputs=out,
+     )
+
+ if __name__ == "__main__":
+     demo.launch(mcp_server=True)
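Because search_papers, get_metadata, and get_table are plain functions, they can also be exercised directly without launching the Gradio UI or an MCP client. A minimal sketch (the query string is illustrative; the call triggers the dataset and model loading done in search.py):

from app_mcp import DEFAULT_COLUMNS_MCP, get_metadata, search_papers

# Dense retrieval over 200 candidates, re-ranked down to the top 5.
papers = search_papers(
    search_query="test-time scaling of large language models",
    candidate_pool_size=200,
    num_results=5,
    columns=DEFAULT_COLUMNS_MCP,
)
print(papers[0]["title"])

# Fetch the full metadata row for the top hit via its row_index.
print(get_metadata(papers[0]["row_index"]))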
pyproject.toml ADDED
@@ -0,0 +1,56 @@
+ [project]
+ name = "neurips2025"
+ version = "0.1.0"
+ description = "Add your description here"
+ readme = "README.md"
+ requires-python = ">=3.10"
+ dependencies = [
+     "datasets>=4.3.0",
+     "gradio[mcp]>=5.49.1",
+     "gradio-modal>=0.0.4",
+     "polars>=1.34.0",
+     "sentence-transformers>=5.1.2",
+     "spaces>=0.42.1",
+     "torch==2.8.0",
+     "faiss-cpu>=1.12.0",
+ ]
+
+ [tool.ruff]
+ line-length = 119
+
+ [tool.ruff.lint]
+ select = ["ALL"]
+ ignore = [
+     "COM812", # missing-trailing-comma
+     "D203", # one-blank-line-before-class
+     "D213", # multi-line-summary-second-line
+     "E501", # line-too-long
+     "SIM117", # multiple-with-statements
+     #
+     "D100", # undocumented-public-module
+     "D101", # undocumented-public-class
+     "D102", # undocumented-public-method
+     "D103", # undocumented-public-function
+     "D104", # undocumented-public-package
+     "D105", # undocumented-magic-method
+     "D107", # undocumented-public-init
+     "EM101", # raw-string-in-exception
+     "FBT001", # boolean-type-hint-positional-argument
+     "FBT002", # boolean-default-value-positional-argument
+     "PGH003", # blanket-type-ignore
+     "PLR0913", # too-many-arguments
+     "PLR0915", # too-many-statements
+     "TRY003", # raise-vanilla-args
+ ]
+ unfixable = [
+     "F401", # unused-import
+ ]
+
+ [tool.ruff.lint.pydocstyle]
+ convention = "google"
+
+ [tool.ruff.lint.per-file-ignores]
+ "*.ipynb" = ["T201", "T203"]
+
+ [tool.ruff.format]
+ docstring-code-format = true
requirements.txt ADDED
@@ -0,0 +1,365 @@
+ # This file was autogenerated by uv via the following command:
+ #    uv pip compile pyproject.toml -o requirements.txt
+ aiofiles==24.1.0
+     # via gradio
+ aiohappyeyeballs==2.6.1
+     # via aiohttp
+ aiohttp==3.13.1
+     # via fsspec
+ aiosignal==1.4.0
+     # via aiohttp
+ annotated-doc==0.0.2
+     # via fastapi
+ annotated-types==0.7.0
+     # via pydantic
+ anyio==4.11.0
+     # via
+     #   gradio
+     #   httpx
+     #   mcp
+     #   sse-starlette
+     #   starlette
+ async-timeout==5.0.1
+     # via aiohttp
+ attrs==25.4.0
+     # via
+     #   aiohttp
+     #   jsonschema
+     #   referencing
+ brotli==1.1.0
+     # via gradio
+ certifi==2025.10.5
+     # via
+     #   httpcore
+     #   httpx
+     #   requests
+ charset-normalizer==3.4.4
+     # via requests
+ click==8.3.0
+     # via
+     #   typer
+     #   uvicorn
+ datasets==4.3.0
+     # via neurips2025 (pyproject.toml)
+ dill==0.4.0
+     # via
+     #   datasets
+     #   multiprocess
+ exceptiongroup==1.3.0
+     # via anyio
+ faiss-cpu==1.12.0
+     # via neurips2025 (pyproject.toml)
+ fastapi==0.120.0
+     # via gradio
+ ffmpy==0.6.4
+     # via gradio
+ filelock==3.20.0
+     # via
+     #   datasets
+     #   huggingface-hub
+     #   torch
+     #   transformers
+ frozenlist==1.8.0
+     # via
+     #   aiohttp
+     #   aiosignal
+ fsspec==2025.9.0
+     # via
+     #   datasets
+     #   gradio-client
+     #   huggingface-hub
+     #   torch
+ gradio==5.49.1
+     # via
+     #   neurips2025 (pyproject.toml)
+     #   gradio-modal
+     #   spaces
+ gradio-client==1.13.3
+     # via gradio
+ gradio-modal==0.0.4
+     # via neurips2025 (pyproject.toml)
+ groovy==0.1.2
+     # via gradio
+ h11==0.16.0
+     # via
+     #   httpcore
+     #   uvicorn
+ hf-xet==1.1.10
+     # via huggingface-hub
+ httpcore==1.0.9
+     # via httpx
+ httpx==0.28.1
+     # via
+     #   datasets
+     #   gradio
+     #   gradio-client
+     #   mcp
+     #   safehttpx
+     #   spaces
+ httpx-sse==0.4.3
+     # via mcp
+ huggingface-hub==0.36.0
+     # via
+     #   datasets
+     #   gradio
+     #   gradio-client
+     #   sentence-transformers
+     #   tokenizers
+     #   transformers
+ idna==3.11
+     # via
+     #   anyio
+     #   httpx
+     #   requests
+     #   yarl
+ jinja2==3.1.6
+     # via
+     #   gradio
+     #   torch
+ joblib==1.5.2
+     # via scikit-learn
+ jsonschema==4.25.1
+     # via mcp
+ jsonschema-specifications==2025.9.1
+     # via jsonschema
+ markdown-it-py==4.0.0
+     # via rich
+ markupsafe==3.0.3
+     # via
+     #   gradio
+     #   jinja2
+ mcp==1.10.1
+     # via gradio
+ mdurl==0.1.2
+     # via markdown-it-py
+ mpmath==1.3.0
+     # via sympy
+ multidict==6.7.0
+     # via
+     #   aiohttp
+     #   yarl
+ multiprocess==0.70.16
+     # via datasets
+ networkx==3.4.2
+     # via torch
+ numpy==2.2.6
+     # via
+     #   datasets
+     #   faiss-cpu
+     #   gradio
+     #   pandas
+     #   scikit-learn
+     #   scipy
+     #   transformers
+ nvidia-cublas-cu12==12.8.4.1
+     # via
+     #   nvidia-cudnn-cu12
+     #   nvidia-cusolver-cu12
+     #   torch
+ nvidia-cuda-cupti-cu12==12.8.90
+     # via torch
+ nvidia-cuda-nvrtc-cu12==12.8.93
+     # via torch
+ nvidia-cuda-runtime-cu12==12.8.90
+     # via torch
+ nvidia-cudnn-cu12==9.10.2.21
+     # via torch
+ nvidia-cufft-cu12==11.3.3.83
+     # via torch
+ nvidia-cufile-cu12==1.13.1.3
+     # via torch
+ nvidia-curand-cu12==10.3.9.90
+     # via torch
+ nvidia-cusolver-cu12==11.7.3.90
+     # via torch
+ nvidia-cusparse-cu12==12.5.8.93
+     # via
+     #   nvidia-cusolver-cu12
+     #   torch
+ nvidia-cusparselt-cu12==0.7.1
+     # via torch
+ nvidia-nccl-cu12==2.27.3
+     # via torch
+ nvidia-nvjitlink-cu12==12.8.93
+     # via
+     #   nvidia-cufft-cu12
+     #   nvidia-cusolver-cu12
+     #   nvidia-cusparse-cu12
+     #   torch
+ nvidia-nvtx-cu12==12.8.90
+     # via torch
+ orjson==3.11.3
+     # via gradio
+ packaging==25.0
+     # via
+     #   datasets
+     #   faiss-cpu
+     #   gradio
+     #   gradio-client
+     #   huggingface-hub
+     #   spaces
+     #   transformers
+ pandas==2.3.3
+     # via
+     #   datasets
+     #   gradio
+ pillow==11.3.0
+     # via
+     #   gradio
+     #   sentence-transformers
+ polars==1.34.0
+     # via neurips2025 (pyproject.toml)
+ polars-runtime-32==1.34.0
+     # via polars
+ propcache==0.4.1
+     # via
+     #   aiohttp
+     #   yarl
+ psutil==5.9.8
+     # via spaces
+ pyarrow==21.0.0
+     # via datasets
+ pydantic==2.11.10
+     # via
+     #   fastapi
+     #   gradio
+     #   mcp
+     #   pydantic-settings
+     #   spaces
+ pydantic-core==2.33.2
+     # via pydantic
+ pydantic-settings==2.11.0
+     # via mcp
+ pydub==0.25.1
+     # via gradio
+ pygments==2.19.2
+     # via rich
+ python-dateutil==2.9.0.post0
+     # via pandas
+ python-dotenv==1.1.1
+     # via pydantic-settings
+ python-multipart==0.0.20
+     # via
+     #   gradio
+     #   mcp
+ pytz==2025.2
+     # via pandas
+ pyyaml==6.0.3
+     # via
+     #   datasets
+     #   gradio
+     #   huggingface-hub
+     #   transformers
+ referencing==0.37.0
+     # via
+     #   jsonschema
+     #   jsonschema-specifications
+ regex==2025.10.23
+     # via transformers
+ requests==2.32.5
+     # via
+     #   datasets
+     #   huggingface-hub
+     #   spaces
+     #   transformers
+ rich==14.2.0
+     # via typer
+ rpds-py==0.28.0
+     # via
+     #   jsonschema
+     #   referencing
+ ruff==0.14.2
+     # via gradio
+ safehttpx==0.1.6
+     # via gradio
+ safetensors==0.6.2
+     # via transformers
+ scikit-learn==1.7.2
+     # via sentence-transformers
+ scipy==1.15.3
+     # via
+     #   scikit-learn
+     #   sentence-transformers
+ semantic-version==2.10.0
+     # via gradio
+ sentence-transformers==5.1.2
+     # via neurips2025 (pyproject.toml)
+ setuptools==80.9.0
+     # via triton
+ shellingham==1.5.4
+     # via typer
+ six==1.17.0
+     # via python-dateutil
+ sniffio==1.3.1
+     # via anyio
+ spaces==0.42.1
+     # via neurips2025 (pyproject.toml)
+ sse-starlette==3.0.2
+     # via mcp
+ starlette==0.48.0
+     # via
+     #   fastapi
+     #   gradio
+     #   mcp
+ sympy==1.14.0
+     # via torch
+ threadpoolctl==3.6.0
+     # via scikit-learn
+ tokenizers==0.22.1
+     # via transformers
+ tomlkit==0.13.3
+     # via gradio
+ torch==2.8.0
+     # via
+     #   neurips2025 (pyproject.toml)
+     #   sentence-transformers
+ tqdm==4.67.1
+     # via
+     #   datasets
+     #   huggingface-hub
+     #   sentence-transformers
+     #   transformers
+ transformers==4.57.1
+     # via sentence-transformers
+ triton==3.4.0
+     # via torch
+ typer==0.20.0
+     # via gradio
+ typing-extensions==4.15.0
+     # via
+     #   aiosignal
+     #   anyio
+     #   exceptiongroup
+     #   fastapi
+     #   gradio
+     #   gradio-client
+     #   huggingface-hub
+     #   multidict
+     #   pydantic
+     #   pydantic-core
+     #   referencing
+     #   sentence-transformers
+     #   spaces
+     #   starlette
+     #   torch
+     #   typer
+     #   typing-inspection
+     #   uvicorn
+ typing-inspection==0.4.2
+     # via
+     #   pydantic
+     #   pydantic-settings
+ tzdata==2025.2
+     # via pandas
+ urllib3==2.5.0
+     # via requests
+ uvicorn==0.38.0
+     # via
+     #   gradio
+     #   mcp
+ websockets==15.0.1
+     # via gradio-client
+ xxhash==3.6.0
+     # via datasets
+ yarl==1.22.0
+     # via aiohttp
search.py ADDED
@@ -0,0 +1,30 @@
+ import datasets
+ import numpy as np
+ import spaces
+ from sentence_transformers import CrossEncoder, SentenceTransformer
+
+ from table import BASE_REPO_ID
+
+ ds = datasets.load_dataset(BASE_REPO_ID, split="train")
+ ds.add_faiss_index(column="embedding")
+
+ bi_model = SentenceTransformer("BAAI/bge-base-en-v1.5")
+ ce_model = CrossEncoder("BAAI/bge-reranker-base")
+
+
+ @spaces.GPU(duration=10)
+ def search(query: str, candidate_pool_size: int = 100, retrieval_k: int = 50) -> list[dict]:
+     prefix = "Represent this sentence for searching relevant passages: "
+     q_vec = bi_model.encode(prefix + query, normalize_embeddings=True)
+
+     _, retrieved_ds = ds.get_nearest_examples("embedding", q_vec, k=candidate_pool_size)
+
+     ce_inputs = [
+         (query, f"{retrieved_ds['title'][i]} {retrieved_ds['abstract'][i]}") for i in range(len(retrieved_ds["title"]))
+     ]
+     ce_scores = ce_model.predict(ce_inputs, batch_size=16)
+
+     sorted_idx = np.argsort(ce_scores)[::-1]
+     return [
+         {"paper_id": retrieved_ds["paper_id"][i], "ce_score": float(ce_scores[i])} for i in sorted_idx[:retrieval_k]
+     ]
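For reference, a rough sketch of how search() is meant to be called, mirroring its use in app.py and app_mcp.py (the query is illustrative; the FAISS index and both models are loaded at import time):

from search import search

# Dense retrieval of 100 candidates, then cross-encoder re-ranking down to the top 10.
results = search("efficient attention for long-context transformers", candidate_pool_size=100, retrieval_k=10)
for hit in results:
    print(hit["paper_id"], round(hit["ce_score"], 3))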
style.css ADDED
@@ -0,0 +1,4 @@
+ h1 {
+   text-align: center;
+   display: block;
+ }
table.py ADDED
@@ -0,0 +1,82 @@
+ import datasets
+ import polars as pl
+
+ BASE_REPO_ID = "ai-conferences/NeurIPS2025"
+ PAPER_PAGE_REPO_ID = "hysts-bot-data/paper-pages-slim"
+
+
+ def format_author_claim_ratio(row: dict) -> str:
+     n_linked_authors = row["n_linked_authors"]
+     n_authors = row["n_authors"]
+
+     if n_linked_authors is None or n_authors is None:
+         return ""
+
+     author_linked = "✅" if n_linked_authors > 0 else ""
+     return f"{n_linked_authors}/{n_authors} {author_linked}".strip()
+
+
+ df_orig = datasets.load_dataset(BASE_REPO_ID, split="train").to_polars()
+ df_paper_page = (
+     datasets.load_dataset(PAPER_PAGE_REPO_ID, split="train")
+     .to_polars()
+     .drop(["summary", "author_names", "ai_keywords"])
+ )
+ df_orig = (
+     df_orig.join(df_paper_page, on="arxiv_id", how="left", suffix="_2")
+     .with_columns(
+         [
+             pl.when(pl.col("github_2").is_not_null())
+             .then(pl.col("github_2"))
+             .otherwise(pl.col("github"))
+             .alias("github")
+         ]
+     )
+     .drop(["github_2"])
+ )
+
+ # format authors
+ df_orig = df_orig.with_columns(pl.col("authors").list.join(", ").alias("authors_str"))
+ # format links
+ df_orig = df_orig.with_columns(
+     [pl.format("[link]({})", pl.col(col)).fill_null("").alias(f"{col}_md") for col in ["project_page", "github"]]
+ )
+ # format paper page link
+ df_orig = df_orig.with_columns(
+     (pl.lit("https://huggingface.co/papers/") + pl.col("arxiv_id")).alias("paper_page")
+ ).with_columns(pl.format("[{}]({})", pl.col("arxiv_id"), pl.col("paper_page")).fill_null("").alias("paper_page_md"))
+
+ # count authors
+ df_orig = df_orig.with_columns(pl.col("authors").list.len().alias("n_authors"))
+ df_orig = df_orig.with_columns(
+     pl.col("author_usernames")
+     .map_elements(lambda lst: sum(x is not None for x in lst) if lst is not None else None, return_dtype=pl.Int64)
+     .alias("n_linked_authors")
+ )
+ df_orig = df_orig.with_columns(
+     pl.struct(["n_linked_authors", "n_authors"])
+     .map_elements(format_author_claim_ratio, return_dtype=pl.Utf8)
+     .alias("claimed")
+ )
+
+ # TODO: Fix this once https://github.com/gradio-app/gradio/issues/10916 is fixed # noqa: FIX002, TD002
+ # format numbers as strings
+ df_orig = df_orig.with_columns(
+     [pl.col(col).cast(pl.Utf8).fill_null("").alias(col) for col in ["upvotes", "num_comments"]]
+ )
+
+ # format spaces, models, datasets
+ for repo_id_col, markdown_col, base_url in [
+     ("space_ids", "Spaces", "https://huggingface.co/spaces/"),
+     ("model_ids", "Models", "https://huggingface.co/"),
+     ("dataset_ids", "Datasets", "https://huggingface.co/datasets/"),
+ ]:
+     df_orig = df_orig.with_columns(
+         pl.col(repo_id_col)
+         .map_elements(
+             lambda lst: "\n".join([f"[{x}]({base_url}{x})" for x in lst]) if lst is not None else None,  # noqa: B023
+             return_dtype=pl.Utf8,
+         )
+         .fill_null("")
+         .alias(markdown_col)
+     )
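The claimed-authors formatting can be checked in isolation. A small sketch of format_author_claim_ratio with made-up counts (note that importing table also triggers the dataset loading at module level):

from table import format_author_claim_ratio

# A paper with 2 of 5 authors linked to HF accounts gets a checkmark; 0 linked authors gets none.
print(format_author_claim_ratio({"n_linked_authors": 2, "n_authors": 5}))  # "2/5 ✅"
print(format_author_claim_ratio({"n_linked_authors": 0, "n_authors": 5}))  # "0/5"
print(format_author_claim_ratio({"n_linked_authors": None, "n_authors": None}))  # ""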
uv.lock ADDED
The diff for this file is too large to render. See raw diff