from __future__ import annotations

import json
import uuid
from datetime import datetime, timezone
from pathlib import Path
from typing import List, Dict, Any

import gradio as gr
from huggingface_hub import CommitScheduler, snapshot_download

# ------------------------------
# Config
# ------------------------------
DATASET_REPO_ID = "hugging-science/dataset-quest-index"  
COMMIT_EVERY_MIN = 2

LOCAL_SUBMISSIONS_DIR = Path("submissions")
LOCAL_SUBMISSIONS_DIR.mkdir(parents=True, exist_ok=True)
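# One JSONL file per app process (UUID suffix), so concurrent writers never share a file.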
LOCAL_FILE = LOCAL_SUBMISSIONS_DIR / f"records_{uuid.uuid4().hex}.jsonl"

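# CommitScheduler periodically pushes the contents of `folder_path` to the dataset repo
# in the background (here every COMMIT_EVERY_MIN minutes).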
scheduler = CommitScheduler(
    repo_id=DATASET_REPO_ID,
    repo_type="dataset",
    folder_path=LOCAL_SUBMISSIONS_DIR,
    path_in_repo="data",
    every=COMMIT_EVERY_MIN,
)

# ------------------------------
# Utilities
# ------------------------------
def _now_iso() -> str:
    # Timezone-aware UTC timestamp; datetime.utcnow() is deprecated as of Python 3.12.
    return datetime.now(timezone.utc).replace(microsecond=0).isoformat().replace("+00:00", "Z")


def read_all_records() -> List[Dict[str, Any]]:
    records: List[Dict[str, Any]] = []

    local_files = sorted(LOCAL_SUBMISSIONS_DIR.glob("*.jsonl"))
    sources = list(local_files)

    if not sources:
        try:
            snap_dir = Path(snapshot_download(
                repo_id=DATASET_REPO_ID,
                repo_type="dataset",
                allow_patterns="data/*.jsonl"
            ))
            hub_data_dir = snap_dir / "data"
            sources = sorted(hub_data_dir.glob("*.jsonl"))
        except Exception:
            # If snapshot fails (e.g., offline), we just return empty
            sources = []
            
    for p in sources:
        try:
            with p.open("r", encoding="utf-8") as f:
                for line in f:
                    line = line.strip()
                    if not line:
                        continue
                    try:
                        records.append(json.loads(line))
                    except Exception:
                        pass
        except FileNotFoundError:
            pass
    return records


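# Illustrative shape of one JSONL record (values are made up; see submit_entry for the schema):
# {"id": "…", "created_at": "2024-01-01T00:00:00Z", "dataset_name": "The Pile",
#  "dataset_url": "https://huggingface.co/datasets/...", "description": "…",
#  "approx_size": 800.0, "size_unit": "GB", "field": "LLM pretraining", "user": "someone"}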
def append_record(record: Dict[str, Any]) -> None:
    LOCAL_FILE.parent.mkdir(parents=True, exist_ok=True)
    # Hold the scheduler lock so a background commit never snapshots a half-written line.
    with scheduler.lock:
        with LOCAL_FILE.open("a", encoding="utf-8") as f:
            f.write(json.dumps(record, ensure_ascii=False) + "\n")


def filter_records(records: List[Dict[str, Any]], field: str | None, search: str | None) -> List[Dict[str, Any]]:
    def match(rec: Dict[str, Any]) -> bool:
        ok = True
        if field and field.strip() and field != "All":
            # The field filter is free text, so match case-insensitively on substring.
            ok = ok and (field.strip().lower() in str(rec.get("field", "")).lower())
        if search:
            s = search.lower()
            hay = " ".join(
                str(rec.get(k, "")) for k in ["dataset_name", "dataset_url", "description", "user", "field"]
            ).lower()
            ok = ok and (s in hay)
        return ok

    return [r for r in records if match(r)]

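
def _format_rows(records: List[Dict[str, Any]]) -> List[List[str]]:
    # Shared row formatting for the Dataframe (used by submit_entry and refresh_table);
    # .get() keeps a malformed record from the Hub from crashing the table. Assumes the
    # record schema written by submit_entry below.
    return [
        [
            r.get("dataset_name", ""),
            f'<a href="{r.get("dataset_url", "")}" target="_blank">{r.get("dataset_url", "")}</a>',
            r.get("description", ""),
            f"{r.get('approx_size', '')} {r.get('size_unit', '')}",
            r.get("field", ""),
            r.get("user", ""),
            r.get("created_at", ""),
        ]
        for r in records
    ]
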

# ------------------------------
# App logic
# ------------------------------
SIZE_UNITS = ["KB", "MB", "GB", "TB"]


def submit_entry(
    dataset_name: str,
    dataset_url: str,
    description: str,
    size_value: float,
    size_unit: str,
    field: str,
    profile: gr.OAuthProfile | None,
):
    errors = []
    if not dataset_name.strip():
        errors.append("Dataset name is required.")
    if not dataset_url.strip() or not dataset_url.strip().startswith(("http://", "https://")):
        errors.append("Dataset URL must be an http(s) link.")
    if size_value is None or size_value < 0:
        errors.append("Approximate size must be a non-negative number.")
    if not field.strip():
        errors.append("Please provide a field.")
    
    # Check for existing dataset URL and name
    existing_records = read_all_records()
    for record in existing_records:
        if record.get("dataset_url", "").strip().lower() == dataset_url.strip().lower():
            errors.append(f"Dataset URL already exists: {record.get('dataset_url')}")
        if record.get("dataset_name", "").strip().lower() == dataset_name.strip().lower():
            errors.append(f"Dataset name already exists: {record.get('dataset_name')}")

    if errors:
        # Show the errors but leave the browse table untouched.
        return gr.update(value="Submission failed:\n- " + "\n- ".join(errors), visible=True), gr.update()

    user_display = profile.name if profile else "anonymous"
    user_handle = (getattr(profile, "username", None) or getattr(profile, "preferred_username", None)) if profile else None

    record = {
        "id": uuid.uuid4().hex,
        "created_at": _now_iso(),
        "dataset_name": dataset_name.strip(),
        "dataset_url": dataset_url.strip(),
        "description": description.strip(),
        "approx_size": float(size_value),
        "size_unit": size_unit,
        "field": field.strip(),
        "user": user_handle or user_display,
    }

    append_record(record)
    ok = f"Thanks, {user_display}. Your entry has been saved locally and will sync to the Hub within ~{COMMIT_EVERY_MIN} minutes."
    updated = read_all_records()
    return gr.update(value=ok, visible=True), _format_rows(updated)


def refresh_table(field: str, search: str):
    data = read_all_records()
    data = filter_records(data, field, search)
    return _format_rows(data)


# ------------------------------
# UI
# ------------------------------
with gr.Blocks(title="Community Dataset Index", css=".wrap {margin: 0 auto}", fill_width=True) as demo:
    gr.Markdown("# Community Dataset Index\nContribute datasets with a short description. Sign in to record your HF username.")
    gr.LoginButton()

    with gr.Row(elem_classes=["wrap"]):
        with gr.Column(scale=1):
            gr.Markdown("### Submit a dataset")
            name = gr.Textbox(label="Dataset name", placeholder="e.g. The Pile")
            url = gr.Textbox(label="Dataset URL (HF, website or paper)", placeholder="https://huggingface.co/datasets/... or https://...")
            desc = gr.Textbox(label="Short description", lines=4)
            with gr.Row():
                size_val = gr.Number(label="Approx. size", minimum=0, value=0)
                size_unit = gr.Dropdown(SIZE_UNITS, value="GB", label="Unit")
            field = gr.Textbox(label="Field (e.g. PDEs, multi-omics, single-cell, catalysts, etc.)")
            submit = gr.Button("Submit", variant="primary")
            notice = gr.Markdown(visible=False)
        with gr.Column(scale=2):
            gr.Markdown("### Browse & filter")
            with gr.Row():
                field_filter = gr.Textbox(label="Field filter (leave blank for all)")
                search = gr.Textbox(label="Search", placeholder="Search name, URL, description, user…")
                refresh = gr.Button("Refresh")
            table = gr.Dataframe(
                headers=["Name", "URL", "Description", "Size", "Field", "User", "Created"],
                datatype=["str", "html", "str", "str", "str", "str", "str"],
                interactive=False,
                wrap=True,
                show_fullscreen_button=True,
            )

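    # Gradio injects gr.OAuthProfile automatically for signed-in users, so `profile`
    # is not listed in `inputs` below.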
    submit.click(
        submit_entry,
        inputs=[name, url, desc, size_val, size_unit, field],
        outputs=[notice, table],
        show_progress="minimal",
    )

    refresh.click(refresh_table, inputs=[field_filter, search], outputs=table)
    field_filter.change(refresh_table, inputs=[field_filter, search], outputs=table)
    search.submit(refresh_table, inputs=[field_filter, search], outputs=table)

    demo.load(lambda: refresh_table("", ""), inputs=None, outputs=table)


if __name__ == "__main__":
    demo.launch(ssr_mode=False)