|
|
|
|
|
|
|
|
from __future__ import annotations

import hashlib
import html
import json
import os
import shutil
import subprocess
import tempfile
from datetime import datetime, timedelta, timezone
from functools import lru_cache
from pathlib import Path

import gradio as gr
from huggingface_hub import hf_hub_download

from modular_graph_and_candidates import (
    build_graph_json,
    build_timeline_json,
    generate_html,
    generate_timeline_html,
)
|
|
|
|
|
def _escape_srcdoc(text: str) -> str: |
|
|
"""Escape for inclusion inside an <iframe srcdoc="β¦"> attribute.""" |
|
|
return ( |
|
|
text.replace("&", "&") |
|
|
.replace("\"", """) |
|
|
.replace("'", "'") |
|
|
.replace("<", "<") |
|
|
.replace(">", ">") |
|
|
) |
|
|
|
|
|
def _fetch_from_cache_repo(kind: str, sim_method: str, threshold: float, multimodal: bool): |
|
|
"""Fetch cached data from Molbap/hf_cached_embeds_log repo.""" |
|
|
|
|
|
repo_id = "Molbap/hf_cached_embeds_log" |
|
|
try: |
|
|
latest_fp = hf_hub_download(repo_id=repo_id, filename="latest.json") |
|
|
info = json.loads(Path(latest_fp).read_text(encoding="utf-8")) |
|
|
sha = info.get("sha") |
|
|
key = f"{sha}/{sim_method}-{threshold:.2f}-m{int(multimodal)}" |
|
|
|
|
|
html_fp = hf_hub_download(repo_id=repo_id, filename=f"{kind}/{key}.html") |
|
|
json_fp = hf_hub_download(repo_id=repo_id, filename=f"{kind}/{key}.json") |
|
|
|
|
|
raw_html = Path(html_fp).read_text(encoding="utf-8") |
|
|
json_text = Path(json_fp).read_text(encoding="utf-8") |
|
|
|
|
|
iframe_html = f'<iframe style="width:100%;height:85vh;border:none;" srcdoc="{_escape_srcdoc(raw_html)}"></iframe>' |
|
|
tmp = Path(tempfile.mkstemp(suffix=("_timeline.json" if kind == "timeline" else ".json"))[1]) |
|
|
tmp.write_text(json_text, encoding="utf-8") |
|
|
return iframe_html, str(tmp) |
|
|
except Exception: |
|
|
return None |
|
|
|
|
|
# Default repository URL; used as the initial value of the "Repo / fork URL"
# text boxes in both tabs.
HF_MAIN_REPO = "https://github.com/huggingface/transformers"
|
|
|
|
|
|
|
|
|
|
|
@lru_cache(maxsize=4)
def clone_or_cache(repo_url: str) -> Path:
    """Shallow-clone *repo_url* into the temp directory and reuse it for 24 h.

    The clone lives in ``<tempdir>/repo_<digest>`` where ``<digest>`` is
    derived from the URL, and a ``.cloned_at`` file inside it records the
    UTC time of the clone. A clone whose stamp is younger than one day is
    returned as-is; otherwise the directory is removed and cloned again
    with ``git clone --depth 1``.

    Note:
        ``lru_cache`` memoizes the returned path per URL for the lifetime of
        the process, so the 24 h check is only evaluated on the first call
        for a URL (or after eviction from the 4-entry cache).

    Args:
        repo_url: URL handed to ``git clone``.

    Returns:
        Path to the local clone.

    Raises:
        subprocess.CalledProcessError: If ``git clone`` exits non-zero.
    """
    # The built-in hash() of a str is salted per process, so it produced a
    # different directory on every restart and the on-disk cache was never
    # reused. A content digest is stable across runs.
    url_digest = hashlib.sha256(repo_url.encode("utf-8")).hexdigest()[:16]
    cache_dir = Path(tempfile.gettempdir()) / f"repo_{url_digest}"
    stamp = cache_dir / ".cloned_at"

    if cache_dir.exists() and stamp.exists():
        try:
            cloned_at = datetime.fromisoformat(stamp.read_text(encoding="utf-8").strip())
            if cloned_at.tzinfo is None:
                # Stamps written by earlier versions were naive UTC.
                cloned_at = cloned_at.replace(tzinfo=timezone.utc)
            if datetime.now(timezone.utc) - cloned_at < timedelta(days=1):
                return cache_dir
        except (OSError, ValueError):
            # Unreadable or malformed stamp: treat the clone as stale.
            pass

    # Always clear the target first: git refuses to clone into a non-empty
    # directory, which can be left behind by an interrupted clone that never
    # wrote its stamp.
    shutil.rmtree(cache_dir, ignore_errors=True)

    subprocess.check_call(["git", "clone", "--depth", "1", repo_url, str(cache_dir)])
    stamp.write_text(datetime.now(timezone.utc).isoformat(), encoding="utf-8")
    return cache_dir
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def run_graph(repo_url: str, threshold: float, multimodal: bool, sim_method: str):
    """Generate the dependency graph visualization.

    A precomputed result is looked up first via ``_fetch_from_cache_repo``;
    on a miss the repository is cloned (or reused) and the graph is built
    locally.

    Args:
        repo_url: Repository URL passed to ``clone_or_cache``.
        threshold: Similarity threshold forwarded to ``build_graph_json``.
        multimodal: Flag forwarded to ``build_graph_json``.
        sim_method: Similarity method forwarded to ``build_graph_json``.

    Returns:
        ``(iframe_html, json_path)``: an ``<iframe>`` snippet embedding the
        generated HTML through ``srcdoc``, and the path of a temporary
        ``.json`` file containing the serialized graph.
    """
    cached = _fetch_from_cache_repo("graph", sim_method, threshold, multimodal)
    if cached:
        return cached

    repo_path = clone_or_cache(repo_url)

    graph = build_graph_json(
        transformers_dir=repo_path,
        threshold=threshold,
        multimodal=multimodal,
        sim_method=sim_method,
    )

    raw_html = generate_html(graph)

    iframe_html = (
        f'<iframe style="width:100%;height:85vh;border:none;" '
        f'srcdoc="{_escape_srcdoc(raw_html)}"></iframe>'
    )

    # Serialize before creating the file so a serialization error does not
    # leave an empty temp file behind.
    payload = json.dumps(graph)
    # tempfile.mktemp only returns a name (deprecated, race-prone);
    # NamedTemporaryFile creates the file atomically. delete=False keeps it
    # on disk for the download component.
    with tempfile.NamedTemporaryFile(
        "w", suffix=".json", delete=False, encoding="utf-8"
    ) as fh:
        fh.write(payload)
    return iframe_html, fh.name
|
|
|
|
|
def run_timeline(repo_url: str, threshold: float, multimodal: bool, sim_method: str):
    """Generate the chronological timeline visualization.

    A precomputed result is looked up first via ``_fetch_from_cache_repo``;
    on a miss the repository is cloned (or reused) and the timeline is built
    locally.

    Args:
        repo_url: Repository URL passed to ``clone_or_cache``.
        threshold: Similarity threshold forwarded to ``build_timeline_json``.
        multimodal: Flag forwarded to ``build_timeline_json``.
        sim_method: Similarity method forwarded to ``build_timeline_json``.

    Returns:
        ``(iframe_html, json_path)``: an ``<iframe>`` snippet embedding the
        generated HTML through ``srcdoc``, and the path of a temporary
        ``*_timeline.json`` file containing the serialized timeline.
    """
    cached = _fetch_from_cache_repo("timeline", sim_method, threshold, multimodal)
    if cached:
        return cached

    repo_path = clone_or_cache(repo_url)

    timeline = build_timeline_json(
        transformers_dir=repo_path,
        threshold=threshold,
        multimodal=multimodal,
        sim_method=sim_method,
    )

    raw_html = generate_timeline_html(timeline)

    iframe_html = (
        f'<iframe style="width:100%;height:85vh;border:none;" '
        f'srcdoc="{_escape_srcdoc(raw_html)}"></iframe>'
    )

    # Serialize before creating the file so a serialization error does not
    # leave an empty temp file behind.
    payload = json.dumps(timeline)
    # tempfile.mktemp only returns a name (deprecated, race-prone);
    # NamedTemporaryFile creates the file atomically. delete=False keeps it
    # on disk for the download component.
    with tempfile.NamedTemporaryFile(
        "w", suffix="_timeline.json", delete=False, encoding="utf-8"
    ) as fh:
        fh.write(payload)
    return iframe_html, fh.name
|
|
|
|
|
|
|
|
|
|
|
# Stylesheet passed to gr.Blocks(css=...): sizes the iframes inside the
# elements with ids "graph_html" and "timeline_html".
CUSTOM_CSS = """
#graph_html iframe, #timeline_html iframe {height:85vh !important; width:100% !important; border:none;}
"""
|
|
|
|
|
# UI definition: two tabs with identical controls, one wired to run_graph and
# one to run_timeline. Each click handler returns (html, file path), matching
# the [HTML, File] output components.
with gr.Blocks(css=CUSTOM_CSS) as demo:
    # NOTE(review): the heading's leading emoji was unrecoverable from the
    # mis-decoded source; the magnifying glass is a best guess - confirm.
    gr.Markdown("## 🔍 Modular‑candidate explorer for 🤗 Transformers")

    with gr.Tabs():
        with gr.Tab("Dependency Graph"):
            with gr.Row():
                repo_in = gr.Text(value=HF_MAIN_REPO, label="Repo / fork URL")
                thresh = gr.Slider(0.50, 0.95, value=0.5, step=0.01, label="Similarity ≥")
                multi_cb = gr.Checkbox(label="Only multimodal models")
                sim_radio = gr.Radio(["jaccard", "embedding"], value="jaccard", label="Similarity metric")
                go_btn = gr.Button("Build graph")

            # elem_id must match the "#graph_html" selector in CUSTOM_CSS.
            graph_html_out = gr.HTML(elem_id="graph_html", show_label=False)
            graph_json_out = gr.File(label="Download graph.json")

            go_btn.click(run_graph, [repo_in, thresh, multi_cb, sim_radio], [graph_html_out, graph_json_out])

        with gr.Tab("Chronological Timeline"):
            with gr.Row():
                timeline_repo_in = gr.Text(value=HF_MAIN_REPO, label="Repo / fork URL")
                timeline_thresh = gr.Slider(0.50, 0.95, value=0.5, step=0.01, label="Similarity ≥")
                timeline_multi_cb = gr.Checkbox(label="Only multimodal models")
                timeline_sim_radio = gr.Radio(["jaccard", "embedding"], value="jaccard", label="Similarity metric")
                timeline_btn = gr.Button("Build timeline")

            # elem_id must match the "#timeline_html" selector in CUSTOM_CSS.
            timeline_html_out = gr.HTML(elem_id="timeline_html", show_label=False)
            timeline_json_out = gr.File(label="Download timeline.json")

            timeline_btn.click(run_timeline, [timeline_repo_in, timeline_thresh, timeline_multi_cb, timeline_sim_radio], [timeline_html_out, timeline_json_out])
|
|
|
|
|
# Start the Gradio server only when this file is executed as a script, not
# when it is imported.
if __name__ == "__main__":
    # NOTE(review): allowed_paths=["static"] presumably lets Gradio serve
    # files from ./static - confirm against the Gradio launch() docs.
    demo.launch(allowed_paths=["static"])