# File size: 3,125 Bytes
# Revision: 6d106b8
import os
import json
import subprocess
import tempfile
from pathlib import Path
from datetime import datetime, timezone
from huggingface_hub import HfApi
from modular_graph_and_candidates import (
build_graph_json,
generate_html,
build_timeline_json,
generate_timeline_html
)
# --- Configuration (overridable through environment variables) ---------------

# Git URL that main() shallow-clones.
REPO_URL = os.getenv("REPO_URL", "https://github.com/huggingface/transformers")

# Hub repo id that main() uploads the generated artifacts to.
CACHE_REPO = "Molbap/hf_cached_embeds_log"

# Value forwarded as ``threshold`` to the graph/timeline builders.
# A non-numeric SIM_THRESHOLD raises ValueError at import time.
THRESH = float(os.getenv("SIM_THRESHOLD", "0.50"))

# Boolean flag forwarded as ``multimodal``. Parsed case-insensitively and with
# surrounding whitespace ignored, so "TRUE", "Yes" or " 1 " enable it just like
# "1"/"true"/"yes" do; any other value leaves it disabled.
MULTIMODAL = os.getenv("MULTIMODAL", "0").strip().lower() in {"1", "true", "yes"}

# Value forwarded as ``sim_method`` to the builders; also part of the cache key.
SIM_METHOD = os.getenv("SIM_METHOD", "jaccard")
def main():
    """Build the graph/timeline artifacts for ``REPO_URL`` and upload them.

    Steps:
      1. Shallow-clone ``REPO_URL`` into a temporary directory and read the
         SHA of its ``HEAD`` commit.
      2. Call ``build_graph_json`` / ``build_timeline_json`` on the checkout
         with the module-level ``THRESH``, ``MULTIMODAL`` and ``SIM_METHOD``
         settings, then render both results to HTML.
      3. Upload the four artifacts plus a ``latest.json`` manifest to
         ``CACHE_REPO``, one ``upload_file`` call per file.

    Artifact paths are keyed by ``<sha>/<method>-<threshold>-m<0|1>``.

    Raises:
        subprocess.CalledProcessError: if ``git clone`` or ``git rev-parse``
            exits with a non-zero status.
    """
    print(f"Building cache for {REPO_URL}")
    print(f"Config: threshold={THRESH}, multimodal={MULTIMODAL}, method={SIM_METHOD}")

    # TemporaryDirectory deletes the checkout when the block exits, including
    # on failure, so repeated runs do not pile up clones on disk.
    with tempfile.TemporaryDirectory() as workdir:
        tmp = Path(workdir)
        print(f"Working in {tmp}")
        repo_path = tmp / "repo"

        print("Cloning repository...")
        subprocess.check_call(
            ["git", "clone", "--depth", "1", REPO_URL, str(repo_path)]
        )
        sha = subprocess.check_output(
            ["git", "rev-parse", "HEAD"], cwd=repo_path, text=True
        ).strip()
        print(f"Repository SHA: {sha}")

        # Both builders take exactly the same settings.
        build_kwargs = {
            "transformers_dir": repo_path,
            "threshold": THRESH,
            "multimodal": MULTIMODAL,
            "sim_method": SIM_METHOD,
        }

        print("Building graph...")
        graph = build_graph_json(**build_kwargs)

        print("Building timeline...")
        timeline = build_timeline_json(**build_kwargs)

        print("Generating HTML...")
        graph_html = generate_html(graph)
        timeline_html = generate_timeline_html(timeline)

    # Everything needed for the upload is in memory from here on.
    print(f"Uploading to {CACHE_REPO}...")
    api = HfApi()

    key = f"{sha}/{SIM_METHOD}-{THRESH:.2f}-m{int(MULTIMODAL)}"

    # Single source of truth for the artifact locations: used both in the
    # manifest and as the upload destinations.
    paths = {
        "graph_json": f"graph/{key}.json",
        "graph_html": f"graph/{key}.html",
        "timeline_json": f"timeline/{key}.json",
        "timeline_html": f"timeline/{key}.html",
    }

    latest = {
        "sha": sha,
        "updated_utc": datetime.now(timezone.utc).isoformat(),
        "defaults": {
            "sim_method": SIM_METHOD,
            "threshold": THRESH,
            "multimodal": MULTIMODAL,
        },
        "paths": paths,
    }

    compact = (",", ":")  # no whitespace between JSON tokens
    # latest.json goes last so the manifest is only published after the files
    # it points to have been uploaded.
    files_to_upload = [
        (paths["graph_json"], json.dumps(graph, separators=compact)),
        (paths["graph_html"], graph_html),
        (paths["timeline_json"], json.dumps(timeline, separators=compact)),
        (paths["timeline_html"], timeline_html),
        ("latest.json", json.dumps(latest, separators=compact)),
    ]

    commit_message = (
        f"Cache update {sha[:7]} - {SIM_METHOD} t={THRESH} m={int(MULTIMODAL)}"
    )

    for path_in_repo, content in files_to_upload:
        # upload_file accepts raw bytes, so no scratch file is required.
        api.upload_file(
            path_or_fileobj=content.encode("utf-8"),
            path_in_repo=path_in_repo,
            repo_id=CACHE_REPO,
            commit_message=commit_message,
        )
        print(f"Uploaded {path_in_repo}")

    print(f"Successfully uploaded cache for {key}")
# Run the cache build only when executed as a script, not on import.
if __name__ == "__main__":
    main()