import contextlib
import html
import io
import re
import subprocess
import threading
import time
from pathlib import Path
from typing import Callable

import gradio as gr
import pandas as pd
import spaces
import torch


# Markdown rendering: prefer markdown-it-py with plugins, and fall back to
# python-markdown + pymdownx when it is unavailable.
def _make_md_markdownit():
    from importlib import import_module

    from markdown_it import MarkdownIt

    md = MarkdownIt("gfm-like")

    # Plugin entry points are named inconsistently across mdit-py-plugins
    # versions (e.g. `footnote` vs `footnote_plugin`), so try both names.
    foot_mod = import_module("mdit_py_plugins.footnote")
    foot = getattr(foot_mod, "footnote", None) or getattr(foot_mod, "footnote_plugin")
    md.use(foot)

    tl_mod = import_module("mdit_py_plugins.tasklists")
    tasklists = getattr(tl_mod, "tasklists", None) or getattr(tl_mod, "tasklists_plugin")
    md.use(tasklists)

    cont_mod = import_module("mdit_py_plugins.container")
    container = getattr(cont_mod, "container", None) or getattr(cont_mod, "container_plugin")
    try:
        md.use(container, "details")
    except TypeError:
        # Some plugin versions take the container name as a keyword only.
        md.use(lambda m: container(m, name="details"))
    return md


def _make_md_pythonmarkdown():
    import markdown as md

    exts = [
        "extra",
        "footnotes",
        "admonition",
        "toc",
        "pymdownx.details",
        "pymdownx.superfences",
        "pymdownx.tasklist",
    ]
    ext_cfg = {"pymdownx.tasklist": {"custom_checkbox": True}, "toc": {"permalink": True}}
    return ("python-markdown", exts, ext_cfg, md)


try:
    _md_engine = ("markdown-it", _make_md_markdownit())
except Exception:
    # Fallback engine is a 4-tuple: (tag, extensions, extension_configs, module).
    _md_engine = _make_md_pythonmarkdown()


def _obsidian_rewrites(text: str) -> str:
    # Drop embeds (![[file]]); they cannot be resolved outside the vault.
    text = re.sub(r'!\[\[([^\]|]+)\]\]', r'', text)
    # [[Target|Alias]] -> [Alias](Target)
    text = re.sub(r'\[\[([^\]|]+)\|([^\]]+)\]\]', r'[\2](\1)', text)
    # [[Target]] -> [Target](Target)
    text = re.sub(r'\[\[([^\]]+)\]\]', r'[\1](\1)', text)
    return text


def md_to_html(text: str) -> str:
    text = _obsidian_rewrites(text)
    if _md_engine[0] == "markdown-it":
        md = _md_engine[1]
        return md.render(text)
    _tag, exts, cfg, md = _md_engine
    return md.markdown(text, extensions=exts, extension_configs=cfg, output_format="html5")
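
# Example: md_to_html("See [[Note|the note]]") emits a paragraph containing
# <a href="Note">the note</a>, whichever engine is active.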


def render_article(md_path: str, inserts: dict[str, Callable]):
    path = Path(md_path)
    if path.exists():
        raw = path.read_text(encoding="utf-8")
    else:
        raw = f"**Missing article**: `{md_path}` not found.\n\nCreate it in your Space repo."

    # Split on {{PLACEHOLDER}} markers: even-indexed parts are Markdown,
    # odd-indexed parts are the insert names captured by the group.
    parts = re.split(r"\{\{([A-Z_]+)\}\}", raw)
    with gr.Column():
        for i, part in enumerate(parts):
            if i % 2 == 0:
                gr.HTML(md_to_html(part))
            else:
                build = inserts.get(part)
                if build is None:
                    gr.HTML(f"<p><em>Unknown insert: {part}</em></p>")
                else:
                    build()


def run_shell(cmd: str) -> str:
    # Crude guard against chaining and redirection: "|" also covers "||" and
    # "&" covers "&&". Semicolons stay allowed so the default
    # `python -c '...; ...'` one-liner works; this is a demo, not a sandbox.
    banned = ["|", ">", "<", "&", "`"]
    if any(b in cmd for b in banned):
        return "$ " + cmd + "\nBlocked characters. Use a single command."
    try:
        p = subprocess.run(cmd, shell=True, check=False, capture_output=True, text=True, timeout=30)
        return f"$ {cmd}\n{p.stdout}{p.stderr}"
    except Exception as e:
        return f"$ {cmd}\n{e!r}"


def build_terminal():
    with gr.Group():
        cmd = gr.Textbox(label="Command", value="python -c 'import torch; print(torch.__version__)'")
        run = gr.Button("Run")
        out = gr.Textbox(label="Output", lines=12, interactive=False)
        run.click(run_shell, inputs=cmd, outputs=out)


def _import_attention_visualizer():
    try:
        from transformers.utils.attention_visualizer import AttentionMaskVisualizer
    except Exception as e:
        raise RuntimeError(
            "AttentionMaskVisualizer is unavailable in this Transformers version."
        ) from e
    return AttentionMaskVisualizer


@spaces.GPU(duration=120)
def render_attention_mask(model_id: str, prompt: str) -> str:
    try:
        AttentionMaskVisualizer = _import_attention_visualizer()
        vis = AttentionMaskVisualizer(model_id)
        # The visualizer prints its diagram to stdout rather than returning
        # it, so capture stdout; fall back to str() of the return value.
        buf = io.StringIO()
        with contextlib.redirect_stdout(buf):
            out = vis(prompt)
        text = buf.getvalue() or str(out)
        text = re.sub(r"\x1b\[[0-9;]*m", "", text)  # strip ANSI color codes
        return f"<pre>{html.escape(text)}</pre>"
    except Exception as e:
        return f"<p>Attention visualizer error: {e}</p>"


def build_attn_vis():
    with gr.Group():
        with gr.Row():
            model = gr.Dropdown(
                label="Model",
                choices=["openai-community/gpt2", "google/gemma-2-2b"],
                value="openai-community/gpt2",
                allow_custom_value=True,
            )
            prompt = gr.Textbox(label="Prompt", value="You are an assistant. Make sure you print me.")
        go = gr.Button("Render")
        html_out = gr.HTML()
        go.click(render_attention_mask, inputs=[model, prompt], outputs=html_out)


# modeling_utils is imported as MU so _measure_load_timeline can monkeypatch
# its caching_allocator_warmup during a load.
from transformers import AutoModelForCausalLM, modeling_utils as MU
def _measure_load_timeline(model_id: str, disable_warmup: bool):
    # Optionally stub out the allocator warmup that from_pretrained() runs
    # before copying weights, so its effect shows up in the timeline.
    orig = getattr(MU, "caching_allocator_warmup", None)
    if disable_warmup and orig is not None:
        MU.caching_allocator_warmup = lambda *a, **k: None
    try:
        device = "cuda" if torch.cuda.is_available() else "cpu"
        tl = []

        # Background sampler: records allocated CUDA memory every 50 ms.
        def sample(start_t, stop_evt):
            while not stop_evt.is_set():
                if device == "cuda":
                    torch.cuda.synchronize()
                    alloc = torch.cuda.memory_allocated()
                else:
                    alloc = 0
                tl.append({"t": time.perf_counter() - start_t, "MiB": alloc / (1024**2)})
                time.sleep(0.05)

        if device == "cuda":
            torch.cuda.empty_cache()
            torch.cuda.reset_peak_memory_stats()

        start = time.perf_counter()
        stop_evt = threading.Event()
        th = threading.Thread(target=sample, args=(start, stop_evt), daemon=True)
        th.start()

        kwargs = {}
        if device == "cuda":
            kwargs.update(dict(torch_dtype=torch.float16, device_map="cuda:0", low_cpu_mem_usage=True))
        model = AutoModelForCausalLM.from_pretrained(model_id, **kwargs)

        stop_evt.set()
        th.join()

        # Take one final sample after the load settles.
        if device == "cuda":
            torch.cuda.synchronize()
            tl.append({"t": time.perf_counter() - start, "MiB": torch.cuda.memory_allocated() / (1024**2)})

        # Free the model so back-to-back runs start from a clean allocator.
        del model
        if device == "cuda":
            torch.cuda.empty_cache()
            torch.cuda.ipc_collect()

        return tl
    finally:
        if orig is not None:
            MU.caching_allocator_warmup = orig


@spaces.GPU(duration=240)
def profile_warmup(model_id: str):
    on = _measure_load_timeline(model_id, disable_warmup=False)
    off = _measure_load_timeline(model_id, disable_warmup=True)
    rows = [{"t": r["t"], "MiB": r["MiB"], "mode": "warmup ON"} for r in on] + \
           [{"t": r["t"], "MiB": r["MiB"], "mode": "warmup OFF"} for r in off]
    return pd.DataFrame(rows)
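
# Note: on a cold HF cache the first load also pays the Hub download cost,
# which skews the ON-vs-OFF comparison; run once and discard to warm the
# cache if you want download time excluded.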


def build_alloc_plot():
    with gr.Group():
        model = gr.Dropdown(
            label="Model",
            choices=["openai-community/gpt2", "google/gemma-2-2b"],
            value="openai-community/gpt2",
            allow_custom_value=True,
        )
        go = gr.Button("Run")
        plot = gr.LinePlot(
            x="t", y="MiB", color="mode", overlay_point=True,
            title="from_pretrained() load: time vs CUDA memory_allocated()",
            tooltip=["t", "MiB", "mode"], width=900, height=420,
        )
        go.click(profile_warmup, inputs=[model], outputs=plot)


try:
    from fastrtc import WebRTC

    def _echo_video(frame):
        # Identity handler: send each incoming frame straight back.
        return frame

    HAS_FASTRTC = True
except Exception:
    HAS_FASTRTC = False


def build_fastrtc():
    if not HAS_FASTRTC:
        gr.Markdown("Install `fastrtc` to enable this section.")
        return
    with gr.Group():
        gr.Markdown("Camera loopback using FastRTC WebRTC. Extend with streaming handlers later.")
        rtc = WebRTC(mode="send-receive", modality="video")
        # Video streams take a plain per-frame callable; FastRTC's
        # ReplyOnPause wrapper is for audio handlers only.
        rtc.stream(_echo_video, inputs=[rtc], outputs=[rtc], time_limit=60)
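
# Sketch of the audio extension mentioned above: wrap a generator handler in
# fastrtc.ReplyOnPause so replies trigger when the speaker pauses, e.g.
# rtc.stream(ReplyOnPause(respond), ...) on a modality="audio" component.
# (`respond` is a hypothetical handler, not defined in this file.)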


INSERTS = {
    "TERMINAL": build_terminal,
    "ATTN_VIS": build_attn_vis,
    "ALLOC_PLOT": build_alloc_plot,
}
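
# Usage in content/article.md: put a marker such as {{TERMINAL}}, {{ATTN_VIS}}
# or {{ALLOC_PLOT}} on its own line; render_article() splits on these markers
# and mounts the matching builder in place.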


CSS = """
:root { --toc-w: 280px; }
#layout { display: grid; grid-template-columns: var(--toc-w) 1fr; gap: 1.25rem; }
#toc { position: sticky; top: 0.75rem; height: calc(100vh - 1.5rem); overflow: auto; padding-right: .5rem; }
#toc a { text-decoration: none; display: block; padding: .25rem 0; }
.section { scroll-margin-top: 72px; }
.gradio-container { max-width: 1200px !important; margin: 0 auto; }
hr { border: none; border-top: 1px solid var(--neutral-300); margin: 1.25rem 0; }
"""


with gr.Blocks(css=CSS, fill_height=True, title="Interactive Blog — Transformers Feature Showcase") as demo:
    gr.HTML("<h1>Transformers Feature Showcase</h1><p>Interactive, scrollable demo.</p>")
    with gr.Row(elem_id="layout"):
        with gr.Column(scale=0):
            gr.HTML(
                """
                <nav id="toc">
                  <h3>Sections</h3>
                  <a href="#article">Article</a>
                  <a href="#rtc">FastRTC (preview)</a>
                </nav>
                """
            )
        with gr.Column():
            gr.HTML('<h2 id="article" class="section">Article</h2>')
            render_article("content/article.md", INSERTS)
            gr.HTML("<hr/>")

            gr.HTML('<h2 id="rtc" class="section">FastRTC (preview)</h2>')
            build_fastrtc()


if __name__ == "__main__":
    demo.launch()