Spaces:
Running
Running
clean up design
Browse files
app.py
CHANGED
|
@@ -1,16 +1,3 @@
|
|
| 1 |
-
# import os
|
| 2 |
-
# _LOCAL_TMP = "/fsx/guilherme/tmp"
|
| 3 |
-
# try:
|
| 4 |
-
# os.makedirs(_LOCAL_TMP, exist_ok=True)
|
| 5 |
-
# os.environ.setdefault("TMPDIR", _LOCAL_TMP)
|
| 6 |
-
# os.environ.setdefault("TEMP", _LOCAL_TMP)
|
| 7 |
-
# os.environ.setdefault("TMP", _LOCAL_TMP)
|
| 8 |
-
# _GRADIO_TMP = os.path.join(_LOCAL_TMP, "gradio")
|
| 9 |
-
# os.makedirs(_GRADIO_TMP, exist_ok=True)
|
| 10 |
-
# os.environ.setdefault("GRADIO_TEMP_DIR", _GRADIO_TMP)
|
| 11 |
-
# except Exception:
|
| 12 |
-
# pass
|
| 13 |
-
|
| 14 |
import gradio as gr
|
| 15 |
from datatrove.pipeline.readers import ParquetReader
|
| 16 |
from default_wiki_pipeline import _parse_and_clean_wikicode, mwparserfromhell
|
|
@@ -39,7 +26,7 @@ def matches_filters(doc, require_has_math: bool | None, require_has_infobox: boo
|
|
| 39 |
meta = doc.metadata or {}
|
| 40 |
if require_has_math and not bool(meta.get("has_math")):
|
| 41 |
return False
|
| 42 |
-
if require_has_infobox and not meta.get("infoboxes"):
|
| 43 |
return False
|
| 44 |
return True
|
| 45 |
|
|
@@ -243,6 +230,18 @@ def on_next(docs_cache, idx: int, reader_iter, require_has_math: bool, require_h
|
|
| 243 |
return new_idx, docs_cache, reader_iter, left, left_meta, header, md, info, right
|
| 244 |
|
| 245 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 246 |
with gr.Blocks() as demo:
|
| 247 |
idx_state = gr.State(value=-1, time_to_live=900)
|
| 248 |
docs_state = gr.State(value=[], time_to_live=900)
|
|
@@ -256,8 +255,6 @@ with gr.Blocks() as demo:
|
|
| 256 |
with gr.Row():
|
| 257 |
prev_btn = gr.Button("Previous")
|
| 258 |
next_btn = gr.Button("Next")
|
| 259 |
-
left_text = gr.Textbox(label="FineWiki extraction", lines=30)
|
| 260 |
-
left_meta = gr.JSON(label="Metadata")
|
| 261 |
|
| 262 |
with gr.Column():
|
| 263 |
with gr.Row():
|
|
@@ -266,55 +263,40 @@ with gr.Blocks() as demo:
|
|
| 266 |
with gr.Column(scale=1):
|
| 267 |
require_has_math = gr.Checkbox(label="Has math", value=False)
|
| 268 |
require_has_infobox = gr.Checkbox(label="Has infobox", value=False)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 269 |
with gr.Tab("Preview"):
|
| 270 |
right_iframe = gr.HTML(label="Original Page")
|
| 271 |
with gr.Tab("wikimedia/wikipedia"):
|
| 272 |
-
right_markdown = gr.Textbox(label="wikimedia/wikipedia extraction", lines=30)
|
| 273 |
-
|
| 274 |
-
|
| 275 |
-
|
| 276 |
-
|
| 277 |
-
|
| 278 |
-
|
| 279 |
-
# language_select = gr.Dropdown(choices=lang_list, value="en", label="Language")
|
| 280 |
-
# with gr.Column(scale=1):
|
| 281 |
-
# require_has_math = gr.Checkbox(label="Has math", value=False)
|
| 282 |
-
# require_has_infobox = gr.Checkbox(label="Has infobox", value=False)
|
| 283 |
-
# with gr.Row():
|
| 284 |
-
# with gr.Column():
|
| 285 |
-
# with gr.Row():
|
| 286 |
-
# prev_btn = gr.Button("Previous")
|
| 287 |
-
# next_btn = gr.Button("Next")
|
| 288 |
-
# with gr.Column():
|
| 289 |
-
# header_md = gr.Markdown()
|
| 290 |
-
# # with gr.Row():
|
| 291 |
-
# # id_input = gr.Textbox(label="Wikidata ID/URL/Page ID", placeholder="e.g., Q42 or https://... or 12345", lines=1)
|
| 292 |
-
# # find_btn = gr.Button("Find")
|
| 293 |
-
# # with gr.Row():
|
| 294 |
-
# # show_preview = gr.Checkbox(label="Show preview", value=True)
|
| 295 |
-
# # show_wiki = gr.Checkbox(label="Show `wikimedia/wikipedia` extraction", value=False)
|
| 296 |
-
# # show_infoboxes = gr.Checkbox(label="Show infoboxes", value=True)
|
| 297 |
-
# with gr.Row():
|
| 298 |
-
# with gr.Column():
|
| 299 |
-
# left_text = gr.Textbox(label="FineWiki extraction", lines=30)
|
| 300 |
-
# left_meta = gr.JSON(label="Metadata")
|
| 301 |
-
# with gr.Column():
|
| 302 |
-
# with gr.Tab("Preview"):
|
| 303 |
-
# right_iframe = gr.HTML(label="Original Page")
|
| 304 |
-
# with gr.Tab("wikimedia/wikipedia"):
|
| 305 |
-
# right_markdown = gr.Textbox(label="wikimedia/wikipedia extraction", lines=30)
|
| 306 |
-
# right_infoboxes = gr.JSON(label="Infoboxes")
|
| 307 |
-
|
| 308 |
-
language_select.change(on_select_language, inputs=[language_select, require_has_math, require_has_infobox], outputs=[idx_state, docs_state, iter_state, left_text, left_meta, header_md, right_markdown, right_infoboxes, right_iframe], concurrency_limit=1)
|
| 309 |
-
demo.load(on_select_language, inputs=[language_select, require_has_math, require_has_infobox], outputs=[idx_state, docs_state, iter_state, left_text, left_meta, header_md, right_markdown, right_infoboxes, right_iframe], concurrency_limit=1)
|
| 310 |
# find_btn.click(on_find, inputs=[docs_state, idx_state, iter_state, id_input, require_has_math, require_has_infobox], outputs=[idx_state, docs_state, iter_state, left_text, left_meta, header_md, right_markdown, right_infoboxes, right_iframe], concurrency_limit=1)
|
| 311 |
|
| 312 |
# Visibility toggles driven directly by checkbox changes
|
| 313 |
-
|
| 314 |
-
|
| 315 |
-
|
| 316 |
-
|
| 317 |
-
|
|
|
|
|
|
|
|
|
|
| 318 |
|
| 319 |
# Enable global queue to coordinate concurrent requests safely
|
| 320 |
demo.queue(default_concurrency_limit=1, max_size=128)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import gradio as gr
|
| 2 |
from datatrove.pipeline.readers import ParquetReader
|
| 3 |
from default_wiki_pipeline import _parse_and_clean_wikicode, mwparserfromhell
|
|
|
|
| 26 |
meta = doc.metadata or {}
|
| 27 |
if require_has_math and not bool(meta.get("has_math")):
|
| 28 |
return False
|
| 29 |
+
if require_has_infobox and not meta.get("infoboxes") and len(meta.get("infoboxes", [])) == 0:
|
| 30 |
return False
|
| 31 |
return True
|
| 32 |
|
|
|
|
| 230 |
return new_idx, docs_cache, reader_iter, left, left_meta, header, md, info, right
|
| 231 |
|
| 232 |
|
| 233 |
+
SCROLL_TO_TOP_JS = """
|
| 234 |
+
() => {
|
| 235 |
+
const ids = ["left_text_box", "right_markdown_box"];
|
| 236 |
+
for (const id of ids) {
|
| 237 |
+
const root = document.getElementById(id);
|
| 238 |
+
if (!root) continue;
|
| 239 |
+
const ta = root.querySelector('textarea');
|
| 240 |
+
if (ta) ta.scrollTop = 0;
|
| 241 |
+
}
|
| 242 |
+
}
|
| 243 |
+
"""
|
| 244 |
+
|
| 245 |
with gr.Blocks() as demo:
|
| 246 |
idx_state = gr.State(value=-1, time_to_live=900)
|
| 247 |
docs_state = gr.State(value=[], time_to_live=900)
|
|
|
|
| 255 |
with gr.Row():
|
| 256 |
prev_btn = gr.Button("Previous")
|
| 257 |
next_btn = gr.Button("Next")
|
|
|
|
|
|
|
| 258 |
|
| 259 |
with gr.Column():
|
| 260 |
with gr.Row():
|
|
|
|
| 263 |
with gr.Column(scale=1):
|
| 264 |
require_has_math = gr.Checkbox(label="Has math", value=False)
|
| 265 |
require_has_infobox = gr.Checkbox(label="Has infobox", value=False)
|
| 266 |
+
with gr.Row():
|
| 267 |
+
with gr.Column():
|
| 268 |
+
with gr.Tab("FineWiki"):
|
| 269 |
+
left_text = gr.Textbox(label="FineWiki extraction", lines=30, elem_id="left_text_box")
|
| 270 |
+
with gr.Tab("Metadata"):
|
| 271 |
+
left_meta = gr.JSON(label="Metadata")
|
| 272 |
+
with gr.Tab("Infoboxes"):
|
| 273 |
+
right_infoboxes = gr.JSON(label="Infoboxes")
|
| 274 |
+
|
| 275 |
+
with gr.Row():
|
| 276 |
+
prev_btn2 = gr.Button("Previous")
|
| 277 |
+
next_btn2 = gr.Button("Next")
|
| 278 |
+
with gr.Column():
|
| 279 |
with gr.Tab("Preview"):
|
| 280 |
right_iframe = gr.HTML(label="Original Page")
|
| 281 |
with gr.Tab("wikimedia/wikipedia"):
|
| 282 |
+
right_markdown = gr.Textbox(label="wikimedia/wikipedia extraction", lines=30, elem_id="right_markdown_box")
|
| 283 |
+
|
| 284 |
+
|
| 285 |
+
_ev1 = language_select.change(on_select_language, inputs=[language_select, require_has_math, require_has_infobox], outputs=[idx_state, docs_state, iter_state, left_text, left_meta, header_md, right_markdown, right_infoboxes, right_iframe], concurrency_limit=1)
|
| 286 |
+
_ev1.then(js=SCROLL_TO_TOP_JS)
|
| 287 |
+
_ev2 = demo.load(on_select_language, inputs=[language_select, require_has_math, require_has_infobox], outputs=[idx_state, docs_state, iter_state, left_text, left_meta, header_md, right_markdown, right_infoboxes, right_iframe], concurrency_limit=1)
|
| 288 |
+
_ev2.then(js=SCROLL_TO_TOP_JS)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 289 |
# find_btn.click(on_find, inputs=[docs_state, idx_state, iter_state, id_input, require_has_math, require_has_infobox], outputs=[idx_state, docs_state, iter_state, left_text, left_meta, header_md, right_markdown, right_infoboxes, right_iframe], concurrency_limit=1)
|
| 290 |
|
| 291 |
# Visibility toggles driven directly by checkbox changes
|
| 292 |
+
_ev4 = prev_btn.click(on_prev, inputs=[docs_state, idx_state, iter_state, require_has_math, require_has_infobox], outputs=[idx_state, docs_state, iter_state, left_text, left_meta, header_md, right_markdown, right_infoboxes, right_iframe], concurrency_limit=1)
|
| 293 |
+
_ev4.then(js=SCROLL_TO_TOP_JS)
|
| 294 |
+
_ev5 = next_btn.click(on_next, inputs=[docs_state, idx_state, iter_state, require_has_math, require_has_infobox], outputs=[idx_state, docs_state, iter_state, left_text, left_meta, header_md, right_markdown, right_infoboxes, right_iframe], concurrency_limit=1)
|
| 295 |
+
_ev5.then(js=SCROLL_TO_TOP_JS)
|
| 296 |
+
_ev4 = prev_btn2.click(on_prev, inputs=[docs_state, idx_state, iter_state, require_has_math, require_has_infobox], outputs=[idx_state, docs_state, iter_state, left_text, left_meta, header_md, right_markdown, right_infoboxes, right_iframe], concurrency_limit=1)
|
| 297 |
+
_ev4.then(js=SCROLL_TO_TOP_JS)
|
| 298 |
+
_ev5 = next_btn2.click(on_next, inputs=[docs_state, idx_state, iter_state, require_has_math, require_has_infobox], outputs=[idx_state, docs_state, iter_state, left_text, left_meta, header_md, right_markdown, right_infoboxes, right_iframe], concurrency_limit=1)
|
| 299 |
+
_ev5.then(js=SCROLL_TO_TOP_JS)
|
| 300 |
|
| 301 |
# Enable global queue to coordinate concurrent requests safely
|
| 302 |
demo.queue(default_concurrency_limit=1, max_size=128)
|