Molbap HF Staff committed on
Commit
0b2b42c
·
1 Parent(s): dd4617e

try integration

Browse files
Files changed (1) hide show
  1. dist/index.html +69 -58
dist/index.html CHANGED
@@ -457,69 +457,80 @@ machinery is the <code>attention mask</code>, cause of confusion. Thankfully, we
457
  <li>having it immediately available to the community</li>
458
  <li>usable in vLLM, SGLang, and so on without additional code.</li>
459
  </ul>
460
- <p>## Inner cooking: Cache allocator</p>
461
- <p>Having a clean <em>external</em> API allows us to work on the true inner workings of transformers. One of the few recent additions was the <em>Cache pre-allocator</em> which improved massively the loading footprint.</p>
462
- <p><div style="border: 1px solid #e2e8f0; border-radius: 8px; background: white; margin: 1.5rem 0;">
463
- <div style="padding: 1rem; border-bottom: 1px solid #e2e8f0; background: #f8f9fa;">
464
- <h4 style="margin: 0 0 0.5rem 0; color: #495057;">🚀 Cache Pre-allocator Performance Demo</h4>
465
- <p style="margin: 0; font-size: 0.9em; color: #6c757d;">
466
- Compare model loading with and without transformers' caching allocator warmup. This demonstrates the memory efficiency improvements.
467
- </p>
468
  </div>
469
-
470
- <div style="padding: 1rem;">
471
- <div style="display: grid; grid-template-columns: 1fr auto; gap: 1rem; align-items: end; margin-bottom: 1.5rem;">
472
- <div>
473
- <label style="display: block; font-weight: 600; margin-bottom: 0.5rem; color: #374151;">Model to Profile:</label>
474
- <select id=memory-model-select style="width: 100%; padding: 0.5rem; border: 1px solid #d1d5db; border-radius: 6px; background: white;">
475
- <option value=openai-community/gpt2>openai-community/gpt2</option>
476
- <option value=google/gemma-2-2b>google/gemma-2-2b</option>
477
- <option value=microsoft/DialoGPT-small>microsoft/DialoGPT-small</option>
478
- <option value=facebook/opt-125m>facebook/opt-125m</option>
479
- </select>
480
- <div style="font-size: 0.8em; color: #6c757d; margin-top: 0.25rem;">
481
- Select a model or enter a custom HuggingFace model ID
482
- </div>
483
- </div>
484
-
485
- <div>
486
- <button id=memory-profile-btn style="padding: 0.75rem 1.5rem; background: #dc2626; color: white; border: none; border-radius: 6px; cursor: pointer; font-weight: 500;">
487
- 🔥 Profile Memory
488
- </button>
489
- </div>
490
- </div>
491
-
492
- <div id=memory-chart-container style="width: 100%; height: 400px; border: 1px solid #e2e8f0; border-radius: 6px; background: #f8f9fa; position: relative;">
493
- <div id=memory-placeholder style="position: absolute; top: 50%; left: 50%; transform: translate(-50%, -50%); text-align: center; color: #6c757d; font-style: italic;">
494
- Click "Profile Memory" to generate memory allocation timeline
495
- </div>
496
- <canvas id=memory-chart width=100% height=400 style="display: none;"></canvas>
497
- </div>
498
-
499
- <div id=memory-stats style="margin-top: 1rem; padding: 1rem; background: #f1f5f9; border-radius: 6px; display: none;">
500
- <h5 style="margin: 0 0 0.5rem 0; color: #374151;">Memory Statistics</h5>
501
- <div id=memory-results></div>
502
- </div>
503
  </div>
504
-
505
- <div style="padding: 1rem; border-top: 1px solid #e2e8f0; background: #f8f9fa; font-size: 0.9em; color: #6c757d;">
506
- <strong>Note:</strong> This demo requires GPU access. The warmup feature reduces peak memory usage during model loading.
507
- In the original app, this uses ZeroGPU to measure actual memory allocation timelines.
508
  </div>
509
  </div>
510
 
511
- <script>document.addEventListener("DOMContentLoaded",function(){let e=document.getElementById("memory-model-select"),t=document.getElementById("memory-profile-btn"),l=document.getElementById("memory-chart-container"),o=document.getElementById("memory-placeholder"),n=document.getElementById("memory-chart"),i=document.getElementById("memory-stats"),m=document.getElementById("memory-results");t.addEventListener("click",function(){let d=e.value;t.disabled=!0,t.textContent="Profiling...",o.innerHTML='<div style="color: #6c757d;"><em>Loading model and measuring memory usage...</em><br><div style="margin-top: 0.5rem;">This may take a few moments</div></div>',i.style.display="none",setTimeout(()=>{let e=[],r=[],a=[];for(let t=0;t<=50;t++){let l=.1*t;e.push(l);let o=Math.max(0,500+15*Math.pow(t,1.5)+50*Math.random());r.push(o);let n=Math.max(0,600+18*Math.pow(t,1.8)+80*Math.random());a.push(n)}o.style.display="none",n.style.display="block";let s=n.getContext("2d"),y=n.width=l.offsetWidth-2,f=n.height=400;s.clearRect(0,0,y,f),s.strokeStyle="#d1d5db",s.beginPath(),s.moveTo(50,20),s.lineTo(50,f-50),s.lineTo(y-20,f-50),s.stroke(),s.strokeStyle="#f3f4f6";for(let e=1;e<10;e++){let t=20+(f-70)*e/10;s.beginPath(),s.moveTo(50,t),s.lineTo(y-20,t),s.stroke()}let g=Math.max(...a),c=(e,t)=>{s.strokeStyle=t,s.lineWidth=3,s.beginPath();for(let t=0;t<e.length;t++){let l=50+(y-70)*t/(e.length-1),o=f-50-(f-70)*e[t]/g;0===t?s.moveTo(l,o):s.lineTo(l,o)}s.stroke()};c(a,"#ef4444"),c(r,"#22c55e"),s.fillStyle="#374151",s.font="14px sans-serif",s.fillText("Memory (MiB)",10,f/2),s.fillText("Time (seconds)",y/2-50,f-10),s.fillStyle="#ef4444",s.fillRect(y-200,30,15,15),s.fillStyle="#374151",s.fillText("\uD83D\uDCC8 Warmup OFF (Standard)",y-180,42),s.fillStyle="#22c55e",s.fillRect(y-200,50,15,15),s.fillStyle="#374151",s.fillText("\uD83D\uDE80 Warmup ON (Optimized)",y-180,62);let h=Math.max(...r),u=Math.max(...a),p=(u-h)/u*100;m.innerHTML=`
512
- <div style="display: grid; grid-template-columns: 1fr 1fr; gap: 1rem;">
513
- <div>
514
- <strong>Peak Memory (Warmup OFF):</strong> ${u.toFixed(0)} MiB<br>
515
- <strong>Peak Memory (Warmup ON):</strong> ${h.toFixed(0)} MiB
516
- </div>
517
- <div>
518
- <strong>Memory Savings:</strong> ${p.toFixed(1)}%<br>
519
- <strong>Model:</strong> ${d}
520
- </div>
521
- </div>
522
- `,i.style.display="block",t.disabled=!1,t.textContent="\uD83D\uDD25 Profile Memory"},3e3)})})</script></p>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
523
  <h3>Linkedin post (to remove)</h3>
524
  <p>Linkedin post for videos:</p>
525
  <p>In transformers, how do we deal with cross-model dependencies, while supporting ~400 models? Maybe you’ve seen the same 200-line functions in too many <em>modeling_file.py</em>? Duplication isn’t inevitable.</p>
 
457
  <li>having it immediately available to the community</li>
458
  <li>usable in vLLM, SGLang, and so on without additional code.</li>
459
  </ul>
460
+ <p>## Inner cooking: CUDA Warmup</p>
461
+ <p>Having a clean <em>external</em> API allows us to work on the true inner workings of transformers. One of the few recent additions was the <em>CUDA warmup</em> via <code>caching_allocator_warmup</code>, which massively improved the loading footprint by pre-allocating GPU memory to avoid malloc bottlenecks during model loading.</p>
462
+ <div class=interactive-demo>
463
+ <div class=demo-header>
464
+ <h3>🚀 CUDA Warmup Efficiency Benchmark</h3>
 
 
 
465
  </div>
466
+ <div class=demo-content>
467
+ <iframe src=https://molbap-cuda-warmup-transformers.hf.space width=100% height=600px frameborder=0 style="border-radius: 8px; background: white;"></iframe>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
468
  </div>
469
+ <div class=demo-footer>
470
+ Real CUDA warmup benchmarking with actual Transformers models. Measure the performance impact of the <code>caching_allocator_warmup</code> function at <code>transformers/src/transformers/modeling_utils.py:6186</code>. This interactive tool loads models twice — once with warmup disabled and once with warmup enabled — to demonstrate the significant loading time improvements.
 
 
471
  </div>
472
  </div>
473
 
474
+
475
+
476
+
477
+
478
+
479
+
480
+
481
+
482
+
483
+
484
+
485
+
486
+
487
+
488
+
489
+
490
+
491
+
492
+
493
+
494
+
495
+
496
+
497
+
498
+
499
+
500
+
501
+
502
+
503
+
504
+
505
+
506
+
507
+
508
+
509
+
510
+
511
+
512
+
513
+
514
+
515
+
516
+
517
+
518
+
519
+
520
+
521
+
522
+
523
+
524
+
525
+
526
+
527
+
528
+
529
+
530
+
531
+
532
+
533
+
534
  <h3>Linkedin post (to remove)</h3>
535
  <p>Linkedin post for videos:</p>
536
  <p>In transformers, how do we deal with cross-model dependencies, while supporting ~400 models? Maybe you’ve seen the same 200-line functions in too many <em>modeling_file.py</em>? Duplication isn’t inevitable.</p>