File size: 2,915 Bytes
fb9f067
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ee0f55d
fb9f067
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ee0f55d
 
 
 
fb9f067
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
aa55708
fb9f067
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
import datasets
import polars as pl

BASE_REPO_ID = "ai-conferences/ICCV2025"
PAPER_PAGE_REPO_ID = "hysts-bot-data/paper-pages-slim"


def format_author_claim_ratio(row: dict) -> str:
    """Render the linked-author ratio of one row as a short display string.

    Args:
        row: Mapping that provides ``n_linked_authors`` and ``n_authors``;
            either value may be ``None``.

    Returns:
        ``"<linked>/<total>"``, with a trailing check mark when at least one
        author is linked, or an empty string when either count is ``None``.
    """
    linked = row["n_linked_authors"]
    total = row["n_authors"]

    if linked is not None and total is not None:
        ratio = f"{linked}/{total}"
        # Only rows with at least one linked author get the check mark.
        return (f"{ratio} ✅" if linked > 0 else ratio).strip()

    return ""


# Base dataset: load the train split and expose "cvf_url" under the name "cvf".
df_orig = datasets.load_dataset(BASE_REPO_ID, split="train").to_polars()
df_orig = df_orig.rename({"cvf_url": "cvf"})

# Paper-page dataset: load the train split without the columns that are not used here.
df_paper_page = datasets.load_dataset(PAPER_PAGE_REPO_ID, split="train").to_polars()
df_paper_page = df_paper_page.drop(["summary", "author_names", "ai_keywords"])

# Left-join on the arXiv id; right-hand columns whose names clash receive the "_2" suffix.
df_orig = df_orig.join(df_paper_page, on="arxiv_id", how="left", suffix="_2")
# Prefer the GitHub link from the paper-page data and fall back to the base dataset's value.
df_orig = df_orig.with_columns(pl.coalesce(pl.col("github_2"), pl.col("github")).alias("github"))
df_orig = df_orig.drop(["github_2"])

# Derived display columns that only depend on the existing columns:
#   - "authors_str": the author list joined with ", "
#   - "<col>_md":    a "[link](url)" markdown cell, empty when the URL is null
df_orig = df_orig.with_columns(
    pl.col("authors").list.join(", ").alias("authors_str"),
    *(
        pl.format("[link]({})", pl.col(link_col)).fill_null("").alias(f"{link_col}_md")
        for link_col in ("cvf", "project_page", "github")
    ),
)

# Paper page URL built from the arXiv id ...
df_orig = df_orig.with_columns(
    paper_page=pl.lit("https://huggingface.co/papers/") + pl.col("arxiv_id"),
)
# ... and its markdown form, labelled with the arXiv id (empty when null).
df_orig = df_orig.with_columns(
    paper_page_md=pl.format("[{}]({})", pl.col("arxiv_id"), pl.col("paper_page")).fill_null(""),
)

# Author counts:
#   - "n_authors":        length of the "authors" list
#   - "n_linked_authors": number of non-None entries in "author_usernames"
#                         (None when the list itself is missing)
df_orig = df_orig.with_columns(
    n_authors=pl.col("authors").list.len(),
    n_linked_authors=pl.col("author_usernames").map_elements(
        lambda usernames: None if usernames is None else sum(1 for name in usernames if name is not None),
        return_dtype=pl.Int64,
    ),
)

# "claimed": human-readable "<linked>/<total>" summary built from the two counts.
df_orig = df_orig.with_columns(
    claimed=pl.struct(["n_linked_authors", "n_authors"]).map_elements(
        format_author_claim_ratio,
        return_dtype=pl.Utf8,
    ),
)

# TODO: Fix this once https://github.com/gradio-app/gradio/issues/10916 is fixed # noqa: FIX002, TD002
# Turn the numeric counters into strings in place; nulls become empty strings.
df_orig = df_orig.with_columns(pl.col(["upvotes", "num_comments"]).cast(pl.Utf8).fill_null(""))

# Build one markdown column per repository kind. Each cell holds one
# "[repo_id](base_url + repo_id)" link per line; a missing id list becomes "".
for repo_id_col, markdown_col, base_url in [
    ("space_ids", "Spaces", "https://huggingface.co/spaces/"),
    ("model_ids", "Models", "https://huggingface.co/"),
    ("dataset_ids", "Datasets", "https://huggingface.co/datasets/"),
]:
    df_orig = df_orig.with_columns(
        pl.col(repo_id_col)
        .map_elements(
            # base_url is bound as a default argument so the callable captures
            # this iteration's value instead of reading the loop variable when
            # it is eventually invoked.
            lambda lst, base_url=base_url: (
                "\n".join([f"[{x}]({base_url}{x})" for x in lst]) if lst is not None else None
            ),
            return_dtype=pl.Utf8,
        )
        .fill_null("")
        .alias(markdown_col)
    )