hysts (HF Staff) committed
Commit 6f1aa4e · 1 Parent(s): 95c535e
Files changed (14)
  1. .gitignore +162 -0
  2. .pre-commit-config.yaml +33 -0
  3. .python-version +1 -0
  4. .vscode/extensions.json +8 -0
  5. .vscode/settings.json +17 -0
  6. README.md +3 -3
  7. app.py +204 -0
  8. app_mcp.py +127 -0
  9. pyproject.toml +56 -0
  10. requirements.txt +365 -0
  11. search.py +30 -0
  12. style.css +4 -0
  13. table.py +82 -0
  14. uv.lock +0 -0
.gitignore ADDED
@@ -0,0 +1,162 @@
+ .gradio/
+
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[cod]
+ *$py.class
+
+ # C extensions
+ *.so
+
+ # Distribution / packaging
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ share/python-wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .nox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ *.py,cover
+ .hypothesis/
+ .pytest_cache/
+ cover/
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Django stuff:
+ *.log
+ local_settings.py
+ db.sqlite3
+ db.sqlite3-journal
+
+ # Flask stuff:
+ instance/
+ .webassets-cache
+
+ # Scrapy stuff:
+ .scrapy
+
+ # Sphinx documentation
+ docs/_build/
+
+ # PyBuilder
+ .pybuilder/
+ target/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # IPython
+ profile_default/
+ ipython_config.py
+
+ # pyenv
+ # For a library or package, you might want to ignore these files since the code is
+ # intended to run in multiple environments; otherwise, check them in:
+ # .python-version
+
+ # pipenv
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
+ # install all needed dependencies.
+ #Pipfile.lock
+
+ # poetry
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
+ # commonly ignored for libraries.
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+ #poetry.lock
+
+ # pdm
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+ #pdm.lock
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+ # in version control.
+ # https://pdm.fming.dev/#use-with-ide
+ .pdm.toml
+
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+ __pypackages__/
+
+ # Celery stuff
+ celerybeat-schedule
+ celerybeat.pid
+
+ # SageMath parsed files
+ *.sage.py
+
+ # Environments
+ .env
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # Spyder project settings
+ .spyderproject
+ .spyproject
+
+ # Rope project settings
+ .ropeproject
+
+ # mkdocs documentation
+ /site
+
+ # mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # Pyre type checker
+ .pyre/
+
+ # pytype static type analyzer
+ .pytype/
+
+ # Cython debug symbols
+ cython_debug/
+
+ # PyCharm
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
+ #.idea/
.pre-commit-config.yaml ADDED
@@ -0,0 +1,33 @@
+ repos:
+   - repo: https://github.com/pre-commit/pre-commit-hooks
+     rev: v6.0.0
+     hooks:
+       - id: check-executables-have-shebangs
+       - id: check-json
+       - id: check-merge-conflict
+       - id: check-shebang-scripts-are-executable
+       - id: check-toml
+       - id: check-yaml
+       - id: end-of-file-fixer
+       - id: mixed-line-ending
+         args: ["--fix=lf"]
+       - id: requirements-txt-fixer
+       - id: trailing-whitespace
+   - repo: https://github.com/astral-sh/ruff-pre-commit
+     rev: v0.14.2
+     hooks:
+       - id: ruff-check
+         args: ["--fix"]
+       - id: ruff-format
+   - repo: https://github.com/pre-commit/mirrors-mypy
+     rev: v1.18.2
+     hooks:
+       - id: mypy
+         args: ["--ignore-missing-imports"]
+         additional_dependencies:
+           [
+             "types-python-slugify",
+             "types-pytz",
+             "types-PyYAML",
+             "types-requests",
+           ]
.python-version ADDED
@@ -0,0 +1 @@
+ 3.10
.vscode/extensions.json ADDED
@@ -0,0 +1,8 @@
+ {
+   "recommendations": [
+     "ms-python.python",
+     "charliermarsh.ruff",
+     "streetsidesoftware.code-spell-checker",
+     "tamasfe.even-better-toml"
+   ]
+ }
.vscode/settings.json ADDED
@@ -0,0 +1,17 @@
+ {
+   "editor.formatOnSave": true,
+   "files.insertFinalNewline": false,
+   "[python]": {
+     "editor.defaultFormatter": "charliermarsh.ruff",
+     "editor.formatOnType": true,
+     "editor.codeActionsOnSave": {
+       "source.fixAll.ruff": "explicit",
+       "source.organizeImports": "explicit"
+     }
+   },
+   "[jupyter]": {
+     "files.insertFinalNewline": false
+   },
+   "notebook.output.scrolling": true,
+   "notebook.formatOnSave.enabled": true
+ }
README.md CHANGED
@@ -1,8 +1,8 @@
  ---
- title: NeurIPS2025
+ title: NeurIPS 2025
- emoji: 🐠
+ emoji:
  colorFrom: red
- colorTo: gray
+ colorTo: purple
  sdk: gradio
  sdk_version: 5.49.1
  app_file: app.py
app.py ADDED
@@ -0,0 +1,204 @@
+ #!/usr/bin/env python
+
+ import gradio as gr
+ import polars as pl
+ from gradio_modal import Modal
+
+ from app_mcp import demo as demo_mcp
+ from search import search
+ from table import df_orig
+
+ DESCRIPTION = "# NeurIPS 2025"
+
+ df_main = df_orig.select(
+     "title",
+     "authors_str",
+     "paper_page_md",
+     "upvotes",
+     "num_comments",
+     "project_page_md",
+     "github_md",
+     "Spaces",
+     "Models",
+     "Datasets",
+     "claimed",
+     "abstract",
+     "paper_id",
+ )
+
+ # TODO: remove this once https://github.com/gradio-app/gradio/issues/10916 https://github.com/gradio-app/gradio/issues/11001 https://github.com/gradio-app/gradio/issues/11002 are fixed # noqa: TD002, FIX002
+ df_main = df_main.with_columns(
+     [
+         pl.when(pl.col(col) == "").then(None).otherwise(pl.col(col)).cast(pl.Int64).fill_null(0).alias(col)
+         for col in ["upvotes", "num_comments"]
+     ]
+ )
+
+ df_main = df_main.rename(
+     {
+         "title": "Title",
+         "authors_str": "Authors",
+         "paper_page_md": "Paper page",
+         "upvotes": "👍",
+         "num_comments": "💬",
+         "project_page_md": "Project page",
+         "github_md": "GitHub",
+     }
+ )
+
+ COLUMN_INFO = {
+     "Title": ("str", "40%"),
+     "Authors": ("str", "20%"),
+     "Paper page": ("markdown", "135px"),
+     "👍": ("number", "50px"),
+     "💬": ("number", "50px"),
+     "Project page": ("markdown", None),
+     "GitHub": ("markdown", None),
+     "Spaces": ("markdown", None),
+     "Models": ("markdown", None),
+     "Datasets": ("markdown", None),
+     "claimed": ("markdown", None),
+ }
+
+
+ DEFAULT_COLUMNS = [
+     "Title",
+     "Paper page",
+     "👍",
+     "💬",
+     "Project page",
+     "GitHub",
+     "Spaces",
+     "Models",
+     "Datasets",
+ ]
+
+
+ def update_num_papers(df: pl.DataFrame) -> str:
+     if "claimed" in df.columns:
+         return f"{len(df)} / {len(df_main)} ({df.select(pl.col('claimed').str.contains('✅').sum()).item()} claimed)"
+     return f"{len(df)} / {len(df_main)}"
+
+
+ def update_df(
+     search_query: str,
+     candidate_pool_size: int,
+     num_results: int,
+     column_names: list[str],
+ ) -> gr.Dataframe:
+     if num_results > candidate_pool_size:
+         raise gr.Error("Number of results must be less than or equal to candidate pool size", print_exception=False)
+
+     df = df_main.clone()
+     column_names = ["Title", *column_names]
+
+     if search_query:
+         results = search(search_query, candidate_pool_size, num_results)
+         if not results:
+             df = df.head(0)
+         else:
+             df = pl.DataFrame(results).join(df, on="paper_id", how="inner")
+             df = df.sort("ce_score", descending=True).drop("ce_score")
+
+     sorted_column_names = [col for col in COLUMN_INFO if col in column_names]
+     df = df.select(sorted_column_names)
+     return gr.Dataframe(
+         value=df,
+         datatype=[COLUMN_INFO[col][0] for col in sorted_column_names],
+         column_widths=[COLUMN_INFO[col][1] for col in sorted_column_names],
+     )
+
+
+ def df_row_selected(
+     evt: gr.SelectData,
+ ) -> tuple[
+     Modal,
+     gr.Textbox,  # title
+     gr.Textbox,  # abstract
+ ]:
+     if evt.index[1] != 0:
+         return Modal(), gr.Textbox(), gr.Textbox()
+
+     title = evt.row_value[0]
+     row = df_main.filter(pl.col("Title") == title)
+     return (
+         Modal(visible=True),
+         gr.Textbox(value=row["Title"].item()),  # title
+         gr.Textbox(value=row["abstract"].item()),  # abstract
+     )
+
+
+ with gr.Blocks(css_paths="style.css") as demo:
+     gr.Markdown(DESCRIPTION)
+     search_query = gr.Textbox(label="Search", submit_btn=True, show_label=False, placeholder="Search...")
+     with gr.Accordion(label="Advanced Search Options", open=False) as advanced_search_options:
+         with gr.Row():
+             candidate_pool_size = gr.Slider(label="Candidate Pool Size", minimum=1, maximum=600, step=1, value=200)
+             num_results = gr.Slider(label="Number of Results", minimum=1, maximum=400, step=1, value=100)
+
+     column_names = gr.CheckboxGroup(
+         label="Columns",
+         choices=[col for col in COLUMN_INFO if col != "Title"],
+         value=[col for col in DEFAULT_COLUMNS if col != "Title"],
+     )
+
+     num_papers = gr.Textbox(label="Number of papers", value=update_num_papers(df_orig), interactive=False)
+
+     df = gr.Dataframe(
+         value=df_main,
+         datatype=list(COLUMN_INFO.values()),
+         type="polars",
+         row_count=(0, "dynamic"),
+         show_row_numbers=True,
+         interactive=False,
+         max_height=1000,
+         elem_id="table",
+         column_widths=[COLUMN_INFO[col][1] for col in COLUMN_INFO],
+     )
+     with Modal(visible=False, elem_id="abstract-modal") as abstract_modal:
+         title = gr.Textbox(label="Title")
+         abstract = gr.Textbox(label="Abstract")
+
+     df.select(fn=df_row_selected, outputs=[abstract_modal, title, abstract])
+
+     inputs = [
+         search_query,
+         candidate_pool_size,
+         num_results,
+         column_names,
+     ]
+     gr.on(
+         triggers=[
+             search_query.submit,
+             column_names.input,
+         ],
+         fn=update_df,
+         inputs=inputs,
+         outputs=df,
+         api_name=False,
+     ).then(
+         fn=update_num_papers,
+         inputs=df,
+         outputs=num_papers,
+         queue=False,
+         api_name=False,
+     )
+     demo.load(
+         fn=update_df,
+         inputs=inputs,
+         outputs=df,
+         api_name=False,
+     ).then(
+         fn=update_num_papers,
+         inputs=df,
+         outputs=num_papers,
+         queue=False,
+         api_name=False,
+     )
+
+     with gr.Row(visible=False):
+         demo_mcp.render()
+
+
+ if __name__ == "__main__":
+     demo.launch(mcp_server=True)
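The TODO workaround near the top of app.py (coercing the string-typed upvotes and num_comments columns back to integers) can be sanity-checked on a toy frame. A minimal sketch, separate from the committed code; the column values are illustrative:

import polars as pl

# Empty strings stand in for missing counts, as produced by table.py.
toy = pl.DataFrame({"upvotes": ["3", "", "12"]})
toy = toy.with_columns(
    pl.when(pl.col("upvotes") == "")
    .then(None)
    .otherwise(pl.col("upvotes"))
    .cast(pl.Int64)
    .fill_null(0)
    .alias("upvotes")
)
print(toy["upvotes"].to_list())  # [3, 0, 12]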
app_mcp.py ADDED
@@ -0,0 +1,127 @@
+ import gradio as gr
+ import polars as pl
+
+ from search import search
+ from table import df_orig
+
+ COLUMNS_MCP = [
+     "title",
+     "authors",
+     "abstract",
+     "arxiv_id",
+     "paper_page",
+     "space_ids",
+     "model_ids",
+     "dataset_ids",
+     "upvotes",
+     "num_comments",
+     "project_page",
+     "github",
+     "row_index",
+ ]
+ DEFAULT_COLUMNS_MCP = [
+     "title",
+     "authors",
+     "abstract",
+     "arxiv_id",
+     "project_page",
+     "github",
+     "row_index",
+ ]
+
+ df_mcp = df_orig.rename({"paper_id": "row_index"}).select(COLUMNS_MCP)
+
+
+ def search_papers(
+     search_query: str,
+     candidate_pool_size: int,
+     num_results: int,
+     columns: list[str],
+ ) -> list[dict]:
+     """Searches NeurIPS 2025 papers relevant to a user query in English.
+
+     This function performs a semantic search over NeurIPS 2025 papers.
+     It uses a dual-stage retrieval process:
+     - First, it retrieves `candidate_pool_size` papers using dense vector similarity.
+     - Then, it re-ranks them with a cross-encoder model to select the top `num_results` most relevant papers.
+     - The search results are returned as a list of dictionaries.
+
+     Note:
+         The search query must be written in English. Queries in other languages are not supported.
+
+     Args:
+         search_query (str): The natural language query input by the user. Must be in English.
+         candidate_pool_size (int): Number of candidate papers to retrieve using the dense vector model.
+         num_results (int): Final number of top-ranked papers to return after re-ranking.
+         columns (list[str]): The columns to select from the DataFrame.
+
+     Returns:
+         list[dict]: A list of dictionaries of the top-ranked papers matching the query, sorted by relevance.
+     """
+     if not search_query:
+         raise ValueError("Search query cannot be empty")
+     if num_results > candidate_pool_size:
+         raise ValueError("Number of results must be less than or equal to candidate pool size")
+
+     df = df_mcp.clone()
+     results = search(search_query, candidate_pool_size, num_results)
+     df = pl.DataFrame(results).rename({"paper_id": "row_index"}).join(df, on="row_index", how="inner")
+     df = df.sort("ce_score", descending=True)
+     return df.select(columns).to_dicts()
+
+
+ def get_metadata(row_index: int) -> dict:
+     """Returns a dictionary of metadata for a NeurIPS 2025 paper at the given table row index.
+
+     Args:
+         row_index (int): The index of the paper in the internal paper list table.
+
+     Returns:
+         dict: A dictionary containing metadata for the corresponding paper.
+     """
+     return df_mcp.filter(pl.col("row_index") == row_index).to_dicts()[0]
+
+
+ def get_table(columns: list[str]) -> list[dict]:
+     """Returns a list of dictionaries of all NeurIPS 2025 papers.
+
+     Args:
+         columns (list[str]): The columns to select from the DataFrame.
+
+     Returns:
+         list[dict]: A list of dictionaries of all NeurIPS 2025 papers.
+     """
+     return df_mcp.select(columns).to_dicts()
+
+
+ with gr.Blocks() as demo:
+     search_query = gr.Textbox(label="Search", submit_btn=True)
+     candidate_pool_size = gr.Slider(label="Candidate Pool Size", minimum=1, maximum=500, step=1, value=200)
+     num_results = gr.Slider(label="Number of Results", minimum=1, maximum=400, step=1, value=100)
+     column_names = gr.CheckboxGroup(label="Columns", choices=COLUMNS_MCP, value=DEFAULT_COLUMNS_MCP)
+     row_index = gr.Slider(label="Row Index", minimum=0, maximum=len(df_mcp) - 1, step=1, value=0)
+
+     out = gr.JSON()
+
+     search_papers_btn = gr.Button("Search Papers")
+     get_metadata_btn = gr.Button("Get Metadata")
+     get_table_btn = gr.Button("Get Table")
+
+     search_papers_btn.click(
+         fn=search_papers,
+         inputs=[search_query, candidate_pool_size, num_results, column_names],
+         outputs=out,
+     )
+     get_metadata_btn.click(
+         fn=get_metadata,
+         inputs=row_index,
+         outputs=out,
+     )
+     get_table_btn.click(
+         fn=get_table,
+         inputs=column_names,
+         outputs=out,
+     )
+
+ if __name__ == "__main__":
+     demo.launch(mcp_server=True)
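Because search_papers, get_metadata, and get_table are plain functions, they can also be exercised directly without launching the Gradio UI or an MCP client. A minimal sketch (the query string is illustrative; the call triggers the dataset and model loading done in search.py):

from app_mcp import DEFAULT_COLUMNS_MCP, get_metadata, search_papers

# Dense retrieval over 200 candidates, re-ranked down to the top 5.
papers = search_papers(
    search_query="test-time scaling of large language models",
    candidate_pool_size=200,
    num_results=5,
    columns=DEFAULT_COLUMNS_MCP,
)
print(papers[0]["title"])

# Fetch the full metadata row for the top hit via its row_index.
print(get_metadata(papers[0]["row_index"]))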
pyproject.toml ADDED
@@ -0,0 +1,56 @@
+ [project]
+ name = "neurips2025"
+ version = "0.1.0"
+ description = "Add your description here"
+ readme = "README.md"
+ requires-python = ">=3.10"
+ dependencies = [
+     "datasets>=4.3.0",
+     "gradio[mcp]>=5.49.1",
+     "gradio-modal>=0.0.4",
+     "polars>=1.34.0",
+     "sentence-transformers>=5.1.2",
+     "spaces>=0.42.1",
+     "torch==2.8.0",
+     "faiss-cpu>=1.12.0",
+ ]
+
+ [tool.ruff]
+ line-length = 119
+
+ [tool.ruff.lint]
+ select = ["ALL"]
+ ignore = [
+     "COM812", # missing-trailing-comma
+     "D203", # one-blank-line-before-class
+     "D213", # multi-line-summary-second-line
+     "E501", # line-too-long
+     "SIM117", # multiple-with-statements
+     #
+     "D100", # undocumented-public-module
+     "D101", # undocumented-public-class
+     "D102", # undocumented-public-method
+     "D103", # undocumented-public-function
+     "D104", # undocumented-public-package
+     "D105", # undocumented-magic-method
+     "D107", # undocumented-public-init
+     "EM101", # raw-string-in-exception
+     "FBT001", # boolean-type-hint-positional-argument
+     "FBT002", # boolean-default-value-positional-argument
+     "PGH003", # blanket-type-ignore
+     "PLR0913", # too-many-arguments
+     "PLR0915", # too-many-statements
+     "TRY003", # raise-vanilla-args
+ ]
+ unfixable = [
+     "F401", # unused-import
+ ]
+
+ [tool.ruff.lint.pydocstyle]
+ convention = "google"
+
+ [tool.ruff.lint.per-file-ignores]
+ "*.ipynb" = ["T201", "T203"]
+
+ [tool.ruff.format]
+ docstring-code-format = true
requirements.txt ADDED
@@ -0,0 +1,365 @@
+ # This file was autogenerated by uv via the following command:
+ #    uv pip compile pyproject.toml -o requirements.txt
+ aiofiles==24.1.0
+     # via gradio
+ aiohappyeyeballs==2.6.1
+     # via aiohttp
+ aiohttp==3.13.1
+     # via fsspec
+ aiosignal==1.4.0
+     # via aiohttp
+ annotated-doc==0.0.2
+     # via fastapi
+ annotated-types==0.7.0
+     # via pydantic
+ anyio==4.11.0
+     # via
+     #   gradio
+     #   httpx
+     #   mcp
+     #   sse-starlette
+     #   starlette
+ async-timeout==5.0.1
+     # via aiohttp
+ attrs==25.4.0
+     # via
+     #   aiohttp
+     #   jsonschema
+     #   referencing
+ brotli==1.1.0
+     # via gradio
+ certifi==2025.10.5
+     # via
+     #   httpcore
+     #   httpx
+     #   requests
+ charset-normalizer==3.4.4
+     # via requests
+ click==8.3.0
+     # via
+     #   typer
+     #   uvicorn
+ datasets==4.3.0
+     # via neurips2025 (pyproject.toml)
+ dill==0.4.0
+     # via
+     #   datasets
+     #   multiprocess
+ exceptiongroup==1.3.0
+     # via anyio
+ faiss-cpu==1.12.0
+     # via neurips2025 (pyproject.toml)
+ fastapi==0.120.0
+     # via gradio
+ ffmpy==0.6.4
+     # via gradio
+ filelock==3.20.0
+     # via
+     #   datasets
+     #   huggingface-hub
+     #   torch
+     #   transformers
+ frozenlist==1.8.0
+     # via
+     #   aiohttp
+     #   aiosignal
+ fsspec==2025.9.0
+     # via
+     #   datasets
+     #   gradio-client
+     #   huggingface-hub
+     #   torch
+ gradio==5.49.1
+     # via
+     #   neurips2025 (pyproject.toml)
+     #   gradio-modal
+     #   spaces
+ gradio-client==1.13.3
+     # via gradio
+ gradio-modal==0.0.4
+     # via neurips2025 (pyproject.toml)
+ groovy==0.1.2
+     # via gradio
+ h11==0.16.0
+     # via
+     #   httpcore
+     #   uvicorn
+ hf-xet==1.1.10
+     # via huggingface-hub
+ httpcore==1.0.9
+     # via httpx
+ httpx==0.28.1
+     # via
+     #   datasets
+     #   gradio
+     #   gradio-client
+     #   mcp
+     #   safehttpx
+     #   spaces
+ httpx-sse==0.4.3
+     # via mcp
+ huggingface-hub==0.36.0
+     # via
+     #   datasets
+     #   gradio
+     #   gradio-client
+     #   sentence-transformers
+     #   tokenizers
+     #   transformers
+ idna==3.11
+     # via
+     #   anyio
+     #   httpx
+     #   requests
+     #   yarl
+ jinja2==3.1.6
+     # via
+     #   gradio
+     #   torch
+ joblib==1.5.2
+     # via scikit-learn
+ jsonschema==4.25.1
+     # via mcp
+ jsonschema-specifications==2025.9.1
+     # via jsonschema
+ markdown-it-py==4.0.0
+     # via rich
+ markupsafe==3.0.3
+     # via
+     #   gradio
+     #   jinja2
+ mcp==1.10.1
+     # via gradio
+ mdurl==0.1.2
+     # via markdown-it-py
+ mpmath==1.3.0
+     # via sympy
+ multidict==6.7.0
+     # via
+     #   aiohttp
+     #   yarl
+ multiprocess==0.70.16
+     # via datasets
+ networkx==3.4.2
+     # via torch
+ numpy==2.2.6
+     # via
+     #   datasets
+     #   faiss-cpu
+     #   gradio
+     #   pandas
+     #   scikit-learn
+     #   scipy
+     #   transformers
+ nvidia-cublas-cu12==12.8.4.1
+     # via
+     #   nvidia-cudnn-cu12
+     #   nvidia-cusolver-cu12
+     #   torch
+ nvidia-cuda-cupti-cu12==12.8.90
+     # via torch
+ nvidia-cuda-nvrtc-cu12==12.8.93
+     # via torch
+ nvidia-cuda-runtime-cu12==12.8.90
+     # via torch
+ nvidia-cudnn-cu12==9.10.2.21
+     # via torch
+ nvidia-cufft-cu12==11.3.3.83
+     # via torch
+ nvidia-cufile-cu12==1.13.1.3
+     # via torch
+ nvidia-curand-cu12==10.3.9.90
+     # via torch
+ nvidia-cusolver-cu12==11.7.3.90
+     # via torch
+ nvidia-cusparse-cu12==12.5.8.93
+     # via
+     #   nvidia-cusolver-cu12
+     #   torch
+ nvidia-cusparselt-cu12==0.7.1
+     # via torch
+ nvidia-nccl-cu12==2.27.3
+     # via torch
+ nvidia-nvjitlink-cu12==12.8.93
+     # via
+     #   nvidia-cufft-cu12
+     #   nvidia-cusolver-cu12
+     #   nvidia-cusparse-cu12
+     #   torch
+ nvidia-nvtx-cu12==12.8.90
+     # via torch
+ orjson==3.11.3
+     # via gradio
+ packaging==25.0
+     # via
+     #   datasets
+     #   faiss-cpu
+     #   gradio
+     #   gradio-client
+     #   huggingface-hub
+     #   spaces
+     #   transformers
+ pandas==2.3.3
+     # via
+     #   datasets
+     #   gradio
+ pillow==11.3.0
+     # via
+     #   gradio
+     #   sentence-transformers
+ polars==1.34.0
+     # via neurips2025 (pyproject.toml)
+ polars-runtime-32==1.34.0
+     # via polars
+ propcache==0.4.1
+     # via
+     #   aiohttp
+     #   yarl
+ psutil==5.9.8
+     # via spaces
+ pyarrow==21.0.0
+     # via datasets
+ pydantic==2.11.10
+     # via
+     #   fastapi
+     #   gradio
+     #   mcp
+     #   pydantic-settings
+     #   spaces
+ pydantic-core==2.33.2
+     # via pydantic
+ pydantic-settings==2.11.0
+     # via mcp
+ pydub==0.25.1
+     # via gradio
+ pygments==2.19.2
+     # via rich
+ python-dateutil==2.9.0.post0
+     # via pandas
+ python-dotenv==1.1.1
+     # via pydantic-settings
+ python-multipart==0.0.20
+     # via
+     #   gradio
+     #   mcp
+ pytz==2025.2
+     # via pandas
+ pyyaml==6.0.3
+     # via
+     #   datasets
+     #   gradio
+     #   huggingface-hub
+     #   transformers
+ referencing==0.37.0
+     # via
+     #   jsonschema
+     #   jsonschema-specifications
+ regex==2025.10.23
+     # via transformers
+ requests==2.32.5
+     # via
+     #   datasets
+     #   huggingface-hub
+     #   spaces
+     #   transformers
+ rich==14.2.0
+     # via typer
+ rpds-py==0.28.0
+     # via
+     #   jsonschema
+     #   referencing
+ ruff==0.14.2
+     # via gradio
+ safehttpx==0.1.6
+     # via gradio
+ safetensors==0.6.2
+     # via transformers
+ scikit-learn==1.7.2
+     # via sentence-transformers
+ scipy==1.15.3
+     # via
+     #   scikit-learn
+     #   sentence-transformers
+ semantic-version==2.10.0
+     # via gradio
+ sentence-transformers==5.1.2
+     # via neurips2025 (pyproject.toml)
+ setuptools==80.9.0
+     # via triton
+ shellingham==1.5.4
+     # via typer
+ six==1.17.0
+     # via python-dateutil
+ sniffio==1.3.1
+     # via anyio
+ spaces==0.42.1
+     # via neurips2025 (pyproject.toml)
+ sse-starlette==3.0.2
+     # via mcp
+ starlette==0.48.0
+     # via
+     #   fastapi
+     #   gradio
+     #   mcp
+ sympy==1.14.0
+     # via torch
+ threadpoolctl==3.6.0
+     # via scikit-learn
+ tokenizers==0.22.1
+     # via transformers
+ tomlkit==0.13.3
+     # via gradio
+ torch==2.8.0
+     # via
+     #   neurips2025 (pyproject.toml)
+     #   sentence-transformers
+ tqdm==4.67.1
+     # via
+     #   datasets
+     #   huggingface-hub
+     #   sentence-transformers
+     #   transformers
+ transformers==4.57.1
+     # via sentence-transformers
+ triton==3.4.0
+     # via torch
+ typer==0.20.0
+     # via gradio
+ typing-extensions==4.15.0
+     # via
+     #   aiosignal
+     #   anyio
+     #   exceptiongroup
+     #   fastapi
+     #   gradio
+     #   gradio-client
+     #   huggingface-hub
+     #   multidict
+     #   pydantic
+     #   pydantic-core
+     #   referencing
+     #   sentence-transformers
+     #   spaces
+     #   starlette
+     #   torch
+     #   typer
+     #   typing-inspection
+     #   uvicorn
+ typing-inspection==0.4.2
+     # via
+     #   pydantic
+     #   pydantic-settings
+ tzdata==2025.2
+     # via pandas
+ urllib3==2.5.0
+     # via requests
+ uvicorn==0.38.0
+     # via
+     #   gradio
+     #   mcp
+ websockets==15.0.1
+     # via gradio-client
+ xxhash==3.6.0
+     # via datasets
+ yarl==1.22.0
+     # via aiohttp
search.py ADDED
@@ -0,0 +1,30 @@
+ import datasets
+ import numpy as np
+ import spaces
+ from sentence_transformers import CrossEncoder, SentenceTransformer
+
+ from table import BASE_REPO_ID
+
+ ds = datasets.load_dataset(BASE_REPO_ID, split="train")
+ ds.add_faiss_index(column="embedding")
+
+ bi_model = SentenceTransformer("BAAI/bge-base-en-v1.5")
+ ce_model = CrossEncoder("BAAI/bge-reranker-base")
+
+
+ @spaces.GPU(duration=10)
+ def search(query: str, candidate_pool_size: int = 100, retrieval_k: int = 50) -> list[dict]:
+     prefix = "Represent this sentence for searching relevant passages: "
+     q_vec = bi_model.encode(prefix + query, normalize_embeddings=True)
+
+     _, retrieved_ds = ds.get_nearest_examples("embedding", q_vec, k=candidate_pool_size)
+
+     ce_inputs = [
+         (query, f"{retrieved_ds['title'][i]} {retrieved_ds['abstract'][i]}") for i in range(len(retrieved_ds["title"]))
+     ]
+     ce_scores = ce_model.predict(ce_inputs, batch_size=16)
+
+     sorted_idx = np.argsort(ce_scores)[::-1]
+     return [
+         {"paper_id": retrieved_ds["paper_id"][i], "ce_score": float(ce_scores[i])} for i in sorted_idx[:retrieval_k]
+     ]
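For reference, a rough sketch of how search() is meant to be called, mirroring its use in app.py and app_mcp.py (the query is illustrative; the FAISS index and both models are loaded at import time):

from search import search

# Dense retrieval of 100 candidates, then cross-encoder re-ranking down to the top 10.
results = search("efficient attention for long-context transformers", candidate_pool_size=100, retrieval_k=10)
for hit in results:
    print(hit["paper_id"], round(hit["ce_score"], 3))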
style.css ADDED
@@ -0,0 +1,4 @@
+ h1 {
+   text-align: center;
+   display: block;
+ }
table.py ADDED
@@ -0,0 +1,82 @@
+ import datasets
+ import polars as pl
+
+ BASE_REPO_ID = "ai-conferences/NeurIPS2025"
+ PAPER_PAGE_REPO_ID = "hysts-bot-data/paper-pages-slim"
+
+
+ def format_author_claim_ratio(row: dict) -> str:
+     n_linked_authors = row["n_linked_authors"]
+     n_authors = row["n_authors"]
+
+     if n_linked_authors is None or n_authors is None:
+         return ""
+
+     author_linked = "✅" if n_linked_authors > 0 else ""
+     return f"{n_linked_authors}/{n_authors} {author_linked}".strip()
+
+
+ df_orig = datasets.load_dataset(BASE_REPO_ID, split="train").to_polars()
+ df_paper_page = (
+     datasets.load_dataset(PAPER_PAGE_REPO_ID, split="train")
+     .to_polars()
+     .drop(["summary", "author_names", "ai_keywords"])
+ )
+ df_orig = (
+     df_orig.join(df_paper_page, on="arxiv_id", how="left", suffix="_2")
+     .with_columns(
+         [
+             pl.when(pl.col("github_2").is_not_null())
+             .then(pl.col("github_2"))
+             .otherwise(pl.col("github"))
+             .alias("github")
+         ]
+     )
+     .drop(["github_2"])
+ )
+
+ # format authors
+ df_orig = df_orig.with_columns(pl.col("authors").list.join(", ").alias("authors_str"))
+ # format links
+ df_orig = df_orig.with_columns(
+     [pl.format("[link]({})", pl.col(col)).fill_null("").alias(f"{col}_md") for col in ["project_page", "github"]]
+ )
+ # format paper page link
+ df_orig = df_orig.with_columns(
+     (pl.lit("https://huggingface.co/papers/") + pl.col("arxiv_id")).alias("paper_page")
+ ).with_columns(pl.format("[{}]({})", pl.col("arxiv_id"), pl.col("paper_page")).fill_null("").alias("paper_page_md"))
+
+ # count authors
+ df_orig = df_orig.with_columns(pl.col("authors").list.len().alias("n_authors"))
+ df_orig = df_orig.with_columns(
+     pl.col("author_usernames")
+     .map_elements(lambda lst: sum(x is not None for x in lst) if lst is not None else None, return_dtype=pl.Int64)
+     .alias("n_linked_authors")
+ )
+ df_orig = df_orig.with_columns(
+     pl.struct(["n_linked_authors", "n_authors"])
+     .map_elements(format_author_claim_ratio, return_dtype=pl.Utf8)
+     .alias("claimed")
+ )
+
+ # TODO: Fix this once https://github.com/gradio-app/gradio/issues/10916 is fixed # noqa: FIX002, TD002
+ # format numbers as strings
+ df_orig = df_orig.with_columns(
+     [pl.col(col).cast(pl.Utf8).fill_null("").alias(col) for col in ["upvotes", "num_comments"]]
+ )
+
+ # format spaces, models, datasets
+ for repo_id_col, markdown_col, base_url in [
+     ("space_ids", "Spaces", "https://huggingface.co/spaces/"),
+     ("model_ids", "Models", "https://huggingface.co/"),
+     ("dataset_ids", "Datasets", "https://huggingface.co/datasets/"),
+ ]:
+     df_orig = df_orig.with_columns(
+         pl.col(repo_id_col)
+         .map_elements(
+             lambda lst: "\n".join([f"[{x}]({base_url}{x})" for x in lst]) if lst is not None else None,  # noqa: B023
+             return_dtype=pl.Utf8,
+         )
+         .fill_null("")
+         .alias(markdown_col)
+     )
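The claimed-authors formatting can be checked in isolation. A small sketch of format_author_claim_ratio with made-up counts (note that importing table also triggers the dataset loading at module level):

from table import format_author_claim_ratio

# A paper with 2 of 5 authors linked to HF accounts gets a checkmark; 0 linked authors gets none.
print(format_author_claim_ratio({"n_linked_authors": 2, "n_authors": 5}))  # "2/5 ✅"
print(format_author_claim_ratio({"n_linked_authors": 0, "n_authors": 5}))  # "0/5"
print(format_author_claim_ratio({"n_linked_authors": None, "n_authors": None}))  # ""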
uv.lock ADDED
The diff for this file is too large to render. See raw diff