# build_cache.py
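"""Build and cache dashboard artifacts for a transformers checkout.

Clones REPO_URL, computes a LOC-growth series over src/transformers/models,
builds the similarity graph and timeline (JSON + HTML), and uploads everything
to the CACHE_REPO dataset on the Hugging Face Hub, keyed by commit SHA and
similarity settings.
"""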
import os
import io
import json
import tarfile
import subprocess
import tempfile
from pathlib import Path
from datetime import datetime, timezone

from huggingface_hub import HfApi

from modular_graph_and_candidates import (
    build_graph_json,
    generate_html,
    build_timeline_json,
    generate_timeline_html,
)
REPO_URL = os.getenv("REPO_URL", "https://github.com/huggingface/transformers")
CACHE_REPO = "Molbap/hf_cached_embeds_log"  # Hub dataset repo that receives the cached artifacts
MIN_THRESH = 0.1  # minimum similarity threshold passed to the graph/timeline builders
MULTIMODAL = os.getenv("MULTIMODAL", "0") in {"1", "true", "True", "YES", "yes"}
SIM_METHOD = os.getenv("SIM_METHOD", "jaccard")
MODULAR_CUTOFF_ISO = "2024-05-31"  # drawn as a vertical marker in the LOC chart

def _run(cwd: Path, *args: str) -> str:
    """Run a git command in `cwd`; return stdout or raise with a trimmed stderr excerpt."""
    p = subprocess.run(["git", *args], cwd=cwd, text=True, capture_output=True, timeout=1200)
    if p.returncode != 0:
        raise RuntimeError(p.stderr.strip()[:400])
    return p.stdout

def _count_lines(text: str) -> int:
    return text.count("\n") + (1 if text and not text.endswith("\n") else 0)
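# Sanity check, for illustration: _count_lines("a\nb") == 2 and
# _count_lines("a\nb\n") == 2, so a missing trailing newline does not skew counts.
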
def _compute_loc_growth(repo: Path) -> dict:
    # The repo is cloned shallow; deepen it so `git log` covers the full history.
    try:
        _run(repo, "fetch", "--unshallow", "--tags", "--prune")
    except Exception:
        _run(repo, "fetch", "--depth=100000", "--tags", "--prune")
    pathspec = "src/transformers/models"
    lines = _run(repo, "log", "--reverse", "--format=%H|%cI", "HEAD", "--", pathspec).splitlines()
    commits = [(ln.split("|", 1)[0], ln.split("|", 1)[1]) for ln in lines if "|" in ln]
    total = len(commits)
    if total > 500:
        # Subsample long histories down to roughly 300 data points.
        step = max(1, total // 300)
        commits = commits[::step]
    out = []
    for sha, date_iso in commits:
        proc = subprocess.run(
            ["git", "archive", sha, "--", pathspec],
            cwd=repo, capture_output=True, timeout=180
        )
        if proc.returncode != 0 or not proc.stdout:
            # Fallback: record a zeroed point and move on.
            out.append({
                "sha": sha, "date": date_iso,
                "loc_modeling_all": 0, "loc_modular": 0,
                "loc_modeling_included": 0, "effective_loc": 0,
                "n_models_with_modular": 0,
            })
            continue
        buf = io.BytesIO(proc.stdout)
        modeling_by_model = {}
        modular_by_model = {}
        # Walk the tar produced by `git archive`, tallying modeling_*/modular_*
        # line counts per model directory.
        with tarfile.open(fileobj=buf, mode="r:*") as tar:
            for m in tar.getmembers():
                if not m.isfile():
                    continue
                name = m.name
                if not name.endswith(".py"):
                    continue
                if "/models/" not in name:
                    continue
                parts = name.split("/")
                try:
                    idx = parts.index("models")
                    model = parts[idx + 1] if idx + 1 < len(parts) else ""
                except ValueError:
                    model = ""
                if not model:
                    continue
                if "/modeling_" in name or "/modular_" in name:
                    f = tar.extractfile(m)
                    if not f:
                        continue
                    try:
                        txt = f.read().decode("utf-8", errors="ignore")
                    finally:
                        f.close()
                    n = _count_lines(txt)
                    if "/modular_" in name:
                        modular_by_model[model] = modular_by_model.get(model, 0) + n
                    elif "/modeling_" in name:
                        modeling_by_model[model] = modeling_by_model.get(model, 0) + n
        modeling_all = sum(modeling_by_model.values())
        modular_loc = sum(modular_by_model.values())
        models_with_modular = set(modular_by_model.keys())
        # Effective LOC: for models that ship a modular file, count only the
        # modular LOC; for the rest, count their modeling LOC as-is.
        modeling_excluded = sum(modeling_by_model.get(m, 0) for m in models_with_modular)
        modeling_included = modeling_all - modeling_excluded
        effective = modeling_included + modular_loc
        out.append({
            "sha": sha,
            "date": date_iso,
            "loc_modeling_all": modeling_all,
            "loc_modular": modular_loc,
            "loc_modeling_included": modeling_included,
            "effective_loc": effective,
            "n_models_with_modular": len(models_with_modular),
        })
    return {"series": out, "cutoff": MODULAR_CUTOFF_ISO}

def _loc_html(loc: dict) -> str:
    data = json.dumps(loc["series"], separators=(",", ":"))
    cutoff = loc["cutoff"]
    return f"""<!doctype html><meta charset=utf-8>
<title>LOC growth</title>
<div id=chart style="height:60vh;width:90vw;margin:2rem auto;"></div>
<script src="https://cdn.jsdelivr.net/npm/apexcharts"></script>
<script>
const raw={data};
const xs=raw.map(d=>new Date(d.date).getTime());
const eff=raw.map(d=>d.effective_loc);
const mod=raw.map(d=>d.loc_modular);
const mdl_all=raw.map(d=>d.loc_modeling_all);
const mdl_inc=raw.map(d=>d.loc_modeling_included);
const cutoffTs=new Date("{cutoff}T00:00:00Z").getTime();
const opts={{
  chart:{{type:"line",height:"100%"}},
  series:[
    {{name:"Effective LOC",data:xs.map((t,i)=>[t,eff[i]])}},
    {{name:"Modular LOC",data:xs.map((t,i)=>[t,mod[i]])}},
    {{name:"Modeling LOC (all)",data:xs.map((t,i)=>[t,mdl_all[i]])}},
    {{name:"Modeling LOC (included)",data:xs.map((t,i)=>[t,mdl_inc[i]])}}
  ],
  xaxis:{{type:"datetime"}},
  yaxis:{{labels:{{formatter:v=>Math.round(v)}}}},
  stroke:{{width:2}},
  tooltip:{{shared:true,x:{{format:"yyyy-MM-dd"}}}},
  annotations:{{xaxis:[{{x:cutoffTs,borderColor:"#e11d48",label:{{text:"2024-05-31 modular",style:{{color:"#fff",background:"#e11d48"}}}}}}]}}
}};
new ApexCharts(document.getElementById("chart"),opts).render();
</script>"""

def main():
    tmp = Path(tempfile.mkdtemp())
    repo_path = tmp / "repo"
    subprocess.check_call(["git", "clone", "--depth", "1", REPO_URL, str(repo_path)])
    sha = subprocess.check_output(["git", "rev-parse", "HEAD"], cwd=repo_path, text=True).strip()
    loc_growth = _compute_loc_growth(repo_path)
    loc_json_str = json.dumps(loc_growth, separators=(",", ":"))
    loc_html_str = _loc_html(loc_growth)
    graph = build_graph_json(repo_path, threshold=MIN_THRESH, multimodal=MULTIMODAL, sim_method=SIM_METHOD)
    timeline = build_timeline_json(repo_path, threshold=MIN_THRESH, multimodal=MULTIMODAL, sim_method=SIM_METHOD)
    graph_html = generate_html(graph)
    timeline_html = generate_timeline_html(timeline)
    api = HfApi()
    api.create_repo(repo_id=CACHE_REPO, repo_type="dataset", exist_ok=True)
    # Artifacts are keyed by source commit plus the settings that produced them.
    key = f"{sha}/{SIM_METHOD}-m{int(MULTIMODAL)}"
    latest = {
        "sha": sha,
        "updated_utc": datetime.now(timezone.utc).isoformat(),
        "defaults": {"sim_method": SIM_METHOD, "min_threshold": MIN_THRESH, "multimodal": MULTIMODAL},
        "paths": {
            "graph_json": f"graph/{key}.json",
            "graph_html": f"graph/{key}.html",
            "timeline_json": f"timeline/{key}.json",
            "timeline_html": f"timeline/{key}.html",
            "loc_json": f"loc/{key}.json",
            "loc_html": f"loc/{key}.html",
        },
    }
    def put(path_in_repo: str, text: str):
        # Each upload_file call lands as its own commit in the dataset repo.
        api.upload_file(
            path_or_fileobj=io.BytesIO(text.encode("utf-8")),
            path_in_repo=path_in_repo,
            repo_id=CACHE_REPO,
            repo_type="dataset",
            commit_message=f"cache {path_in_repo}",
        )
| put(f"graph/{key}.json", json.dumps(graph, separators=(",", ":"))) | |
| put(f"graph/{key}.html", graph_html) | |
| put(f"timeline/{key}.json", json.dumps(timeline, separators=(",", ":"))) | |
| put(f"timeline/{key}.html", timeline_html) | |
| put(f"loc/{key}.json", loc_json_str) | |
| put(f"loc/{key}.html", loc_html_str) | |
| put("latest.json", json.dumps(latest, separators=(",", ":"))) | |
| if __name__ == "__main__": | |
| main() | |
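# Usage sketch (assumes `modular_graph_and_candidates` is importable and that
# huggingface_hub can find write credentials, e.g. via HF_TOKEN or a cached login):
#
#   SIM_METHOD=jaccard MULTIMODAL=1 python build_cache.py
#
# One run clones REPO_URL at depth 1, deepens it for the LOC series, and uploads
# six JSON/HTML artifacts plus latest.json under graph/, timeline/ and loc/.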