Molbap HF Staff committed on
Commit
8b42971
·
1 Parent(s): ca3d404
app/dist/index.html CHANGED
@@ -12,8 +12,8 @@
12
  document.documentElement.setAttribute("data-theme", theme);
13
  } catch {}
14
  })();
15
- </script><script type="module" src="/scripts/color-palettes.js"></script><!-- TO MANAGE PROPERLY --><script src="https://cdn.plot.ly/plotly-3.0.0.min.js" charset="utf-8"></script><link rel="stylesheet" href="/_astro/index.C8LanvBP.css"><script type="module" src="/_astro/hoisted.DK-CdsVg.js"></script>
16
- <script type="module" src="/_astro/page.CH0W_C1Z.js"></script></head> <body> <button id="theme-toggle" aria-label="Toggle color theme" data-astro-cid-x3pjskd3> <svg class="icon light" width="20" height="20" viewBox="0 0 24 24" aria-hidden="true" focusable="false" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" data-astro-cid-x3pjskd3> <circle cx="12" cy="12" r="5" data-astro-cid-x3pjskd3></circle> <line x1="12" y1="1" x2="12" y2="4" data-astro-cid-x3pjskd3></line> <line x1="12" y1="20" x2="12" y2="23" data-astro-cid-x3pjskd3></line> <line x1="1" y1="12" x2="4" y2="12" data-astro-cid-x3pjskd3></line> <line x1="20" y1="12" x2="23" y2="12" data-astro-cid-x3pjskd3></line> <line x1="4.22" y1="4.22" x2="6.34" y2="6.34" data-astro-cid-x3pjskd3></line> <line x1="17.66" y1="17.66" x2="19.78" y2="19.78" data-astro-cid-x3pjskd3></line> <line x1="4.22" y1="19.78" x2="6.34" y2="17.66" data-astro-cid-x3pjskd3></line> <line x1="17.66" y1="6.34" x2="19.78" y2="4.22" data-astro-cid-x3pjskd3></line> </svg> <svg class="icon dark" width="20" height="20" viewBox="0 0 24 24" aria-hidden="true" focusable="false" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" data-astro-cid-x3pjskd3> <path d="M21 12.79A9 9 0 1 1 11.21 3 7 7 0 0 0 21 12.79z" data-astro-cid-x3pjskd3></path> </svg> </button> <section class="hero" data-astro-cid-bbe6dxrz> <h1 class="hero-title" data-astro-cid-bbe6dxrz>Maintain the unmaintainable:<br/>1M python loc, 400+ models</h1> <div class="hero-banner" data-astro-cid-bbe6dxrz> <figure class="html-embed"><div class="html-embed__card is-frameless"><div id="frag-o4iyfn0453e"><style>
17
  @import url('https://fonts.googleapis.com/css2?family=Inter:wght@500;600&display=swap');
18
 
19
  .banner-container {
@@ -463,7 +463,7 @@ We continue to support all new models and expect to do so for the foreseeable fu
463
  <p>We formalize and articulate the “tenets” that have been guiding our development, demonstrate how they are implemented in code, and show the measurable impact they have on the library’s sustainability and growth.</p>
464
  <p>For any OSS maintainer, power user, or contributor, this is the map to understanding, using, and building upon <code>transformers</code>, but not only: any project of comparable size will require you to make deep choices, not only on design and choice of abstraction, but on the very mindset of the software you are building. These tenets may or may not be applicable to your project, but they provide a glimpse on how we work that could be helpful or inspirational.</p>
465
  <p>Conventions used throughout this post:</p>
466
- <p><span class="glossary-term " style="" data-tooltip-id="tenet-ym9dzwu0hm" onmouseenter="window.showTenetTooltip(event, 'tenet-ym9dzwu0hm')" onmousemove="window.updateTenetTooltip(event, 'tenet-ym9dzwu0hm')" onmouseleave="window.hideTenetTooltip('tenet-ym9dzwu0hm')">Tenets exemplified</span><span id="tenet-ym9dzwu0hm" class="glossary-tooltip" data-position="top"> <span class="glossary-tooltip__content"> <span class="glossary-tooltip__term">source-of-truth</span> <span class="glossary-tooltip__definition">Model implementations should be reliable, reproducible, and faithful to original performances.</span> </span> <span class="glossary-tooltip__arrow"></span> </span> <script>
467
  if (!window.tenetTooltipInitialized) {
468
  window.tenetTooltipInitialized = true;
469
 
@@ -500,7 +500,7 @@ if (!window.tenetTooltipInitialized) {
500
  <p>These principles were not decided in a vacuum. The library <em>evolved</em> towards them, and once they <em>emerged</em>, they were recognized as critical.</p>
501
- <div class="tenet-list"><ol><li class="tenet"><a id="source-of-truth"></a><strong>Source of Truth</strong><p>We aim to be the <a href="https://huggingface.co/blog/transformers-model-definition">source of truth for all model definitions</a>. This is not a tenet, but something that guides our decisions. Model implementations should be reliable, reproducible, and faithful to the original performances.</p><em>This overarching guideline ensures quality and reproducibility across all models in the library.</em></li><li class="tenet"><a id="one-model-one-file"></a><strong>One Model, One File</strong><p>All inference and training core logic has to be visible, top‑to‑bottom, to maximize each model’s hackability.</p><em>Every model should be understandable and hackable by reading a single file from top to bottom.</em></li><li class="tenet"><a id="code-is-product"></a><strong>Code is the Product</strong><p>Optimize for reading, diff-ing, and tweaking, our users are power users. Variables can be explicit, full words, even several words, readability is primordial.</p><em>Code quality matters as much as functionality - optimize for human readers, not just computers.</em></li><li class="tenet"><a id="standardize-dont-abstract"></a><strong>Standardize, Don’t Abstract</strong><p>If it’s model behavior, keep it in the file; abstractions are only for generic infra.</p><em>Model-specific logic belongs in the model file, not hidden behind abstractions.</em></li><li class="tenet"><a id="do-repeat-yourself"></a><strong>DRY* (DO Repeat Yourself)</strong><p>Copy when it helps users; keep successors in sync without centralizing behavior.</p><p><b>Evolution:</b></p><p> With the introduction and global adoption of <a href="#modular">modular</a> transformers, we do not repeat any logic in the modular files, but end user files remain faithful to the original tenet.</p><em>Strategic duplication can improve readability and maintainability when done thoughtfully.</em></li><li class="tenet"><a id="minimal-user-api"></a><strong>Minimal User API</strong><p>Config, model, pre-processing; from_pretrained, save_pretrained, push_to_hub. We want the least amount of codepaths. Reading should be obvious, configurations should be obvious.</p><em>Keep the public interface simple and predictable, users should know what to expect.</em></li><li class="tenet"><a id="backwards-compatibility"></a><strong>Backwards Compatibility</strong><p>Evolve by additive standardization, never break public APIs.</p><p>Any artifact that was once on the hub and worked with transformers should be usable indefinitely with the same interface. Further, public methods should not change to avoid breaking dependencies.</p><em>Once something is public, it stays public, evolution through addition, not breaking changes.</em></li><li class="tenet"><a id="consistent-public-surface"></a><strong>Consistent Public Surface</strong><p>Same argument names, same outputs, hidden states and attentions exposed, enforced by tests. This is a goal as well as a tenet.</p><em>All models should feel familiar - consistent interfaces reduce cognitive load.</em></li></ol></div>
502
  <p>When a PR is merged, it is because the contribution is worthwhile, and because the <code>transformers</code> team finds the design of the contribution to be aligned with the tenets.</p>
503
- <p>Does all the code in the library strictly follow these tenets? No. The library is a gigantic house with connected nooks, corridors, crannies everywhere, built by thousands of different workers. We <em>try</em> to make it so all the code added is compliant, because if we fail and merge it, we cannot change it lest we break <span class="glossary-term " style="" data-tooltip-id="tenet-fth5973nnjd" onmouseenter="window.showTenetTooltip(event, 'tenet-fth5973nnjd')" onmousemove="window.updateTenetTooltip(event, 'tenet-fth5973nnjd')" onmouseleave="window.hideTenetTooltip('tenet-fth5973nnjd')">backwards compatibility</span><span id="tenet-fth5973nnjd" class="glossary-tooltip" data-position="top"> <span class="glossary-tooltip__content"> <span class="glossary-tooltip__term">backwards-compatibility</span> <span class="glossary-tooltip__definition">Any artifact once on the hub must remain loadable. Breaking changes are unacceptable.</span> </span> <span class="glossary-tooltip__arrow"></span> </span> <script>
504
  if (!window.tenetTooltipInitialized) {
505
  window.tenetTooltipInitialized = true;
506
 
@@ -529,7 +529,7 @@ if (!window.tenetTooltipInitialized) {
529
  }
530
  </script>.</p>
531
  <p>To see what constitutes adherence to the tenets, let’s take the example of code repetition.</p>
532
- <p>The following function, essential to the implementation of <a href="https://huggingface.co/papers/2104.09864">Rotary Positional Embeddings</a> can be found in more than 70 <code>modeling_&lt;file&gt;.py</code> across <code>src/transformers/models/.</code> Why keep it? Because we want all the model logic to be <span class="glossary-term " style="" data-tooltip-id="tenet-tnlbzvjsne" onmouseenter="window.showTenetTooltip(event, 'tenet-tnlbzvjsne')" onmousemove="window.updateTenetTooltip(event, 'tenet-tnlbzvjsne')" onmouseleave="window.hideTenetTooltip('tenet-tnlbzvjsne')">contained in the modeling file</span><span id="tenet-tnlbzvjsne" class="glossary-tooltip" data-position="top"> <span class="glossary-tooltip__content"> <span class="glossary-tooltip__term">one-model-one-file</span> <span class="glossary-tooltip__definition">All inference and training core logic visible, top‑to‑bottom, in a single file.</span> </span> <span class="glossary-tooltip__arrow"></span> </span> <script>
533
  if (!window.tenetTooltipInitialized) {
534
  window.tenetTooltipInitialized = true;
535
 
@@ -556,7 +556,7 @@ if (!window.tenetTooltipInitialized) {
556
  tooltip.style.opacity = '0';
557
  };
558
  }
559
- </script>. In order to do that, we <span class="glossary-term " style="" data-tooltip-id="tenet-409mffns78b" onmouseenter="window.showTenetTooltip(event, 'tenet-409mffns78b')" onmousemove="window.updateTenetTooltip(event, 'tenet-409mffns78b')" onmouseleave="window.hideTenetTooltip('tenet-409mffns78b')">do repeat ourselves</span><span id="tenet-409mffns78b" class="glossary-tooltip" data-position="top"> <span class="glossary-tooltip__content"> <span class="glossary-tooltip__term">do-repeat-yourself</span> <span class="glossary-tooltip__definition">Strategic duplication can improve readability and maintainability when done thoughtfully.</span> </span> <span class="glossary-tooltip__arrow"></span> </span> <script>
560
  if (!window.tenetTooltipInitialized) {
561
  window.tenetTooltipInitialized = true;
562
 
@@ -593,7 +593,7 @@ if (!window.tenetTooltipInitialized) {
593
  <p>We want all models to have self-contained modeling code.</p>
594
  <p>Each core functionality <em>must</em> be in the modeling code, every non-core functionality <em>can</em> be outside of it.</p>
595
  <p>This comes as a great cost. Enter the <code>#Copied from...</code> mechanism: for a long time, these comments were indicating that some code was copied from another model, saving time both for the reviewers and for the CI. But the LOC count kept creeping up. Each new model copied over hundreds of lines that we considered largely boilerplate, yet, we could not remove them.</p>
596
- <p>We need to separate both principles that were so far intertwined, <span class="glossary-term " style="" data-tooltip-id="tenet-pm2uiuidmd" onmouseenter="window.showTenetTooltip(event, 'tenet-pm2uiuidmd')" onmousemove="window.updateTenetTooltip(event, 'tenet-pm2uiuidmd')" onmouseleave="window.hideTenetTooltip('tenet-pm2uiuidmd')">repetition</span><span id="tenet-pm2uiuidmd" class="glossary-tooltip" data-position="top"> <span class="glossary-tooltip__content"> <span class="glossary-tooltip__term">do-repeat-yourself</span> <span class="glossary-tooltip__definition">Strategic duplication can improve readability and maintainability when done thoughtfully.</span> </span> <span class="glossary-tooltip__arrow"></span> </span> <script>
597
  if (!window.tenetTooltipInitialized) {
598
  window.tenetTooltipInitialized = true;
599
 
@@ -620,7 +620,7 @@ if (!window.tenetTooltipInitialized) {
620
  tooltip.style.opacity = '0';
621
  };
622
  }
623
- </script> and <span class="glossary-term " style="" data-tooltip-id="tenet-vimyl99oxt" onmouseenter="window.showTenetTooltip(event, 'tenet-vimyl99oxt')" onmousemove="window.updateTenetTooltip(event, 'tenet-vimyl99oxt')" onmouseleave="window.hideTenetTooltip('tenet-vimyl99oxt')">hackability</span><span id="tenet-vimyl99oxt" class="glossary-tooltip" data-position="top"> <span class="glossary-tooltip__content"> <span class="glossary-tooltip__term">one-model-one-file</span> <span class="glossary-tooltip__definition">All inference and training core logic visible, top‑to‑bottom, in a single file.</span> </span> <span class="glossary-tooltip__arrow"></span> </span> <script>
624
  if (!window.tenetTooltipInitialized) {
625
  window.tenetTooltipInitialized = true;
626
 
@@ -649,7 +649,7 @@ if (!window.tenetTooltipInitialized) {
649
  }
650
  </script>.</p>
651
  <p>What’s the solution to this?</p>
652
- <div class="note note--info" data-astro-cid-qg6lmfty> <div class="note__layout" data-astro-cid-qg6lmfty> <div class="note__body" data-astro-cid-qg6lmfty> <div class="note__content" data-astro-cid-qg6lmfty> <p><strong>TL;DR:</strong> Read the code in one place, <span class="glossary-term " style="" data-tooltip-id="tenet-6er2h0aupnn" onmouseenter="window.showTenetTooltip(event, 'tenet-6er2h0aupnn')" onmousemove="window.updateTenetTooltip(event, 'tenet-6er2h0aupnn')" onmouseleave="window.hideTenetTooltip('tenet-6er2h0aupnn')">one model, one file.</span><span id="tenet-6er2h0aupnn" class="glossary-tooltip" data-position="top"> <span class="glossary-tooltip__content"> <span class="glossary-tooltip__term">one-model-one-file</span> <span class="glossary-tooltip__definition">All inference and training core logic visible, top‑to‑bottom, in a single file.</span> </span> <span class="glossary-tooltip__arrow"></span> </span> <script>
653
  if (!window.tenetTooltipInitialized) {
654
  window.tenetTooltipInitialized = true;
655
 
@@ -678,7 +678,7 @@ if (!window.tenetTooltipInitialized) {
678
  }
679
  </script>. Keep semantics local (<a href="#standardize-dont-abstract">Standardize, Don’t Abstract</a>). Allow strategic duplication for end users (<a href="#do-repeat-yourself">DRY*</a>). Keep the public surface minimal and stable (<a href="#minimal-user-api">Minimal API</a>, <a href="#backwards-compatibility">Backwards Compatibility</a>, <a href="#consistent-public-surface">Consistent Surface</a>).</p><p><strong>Next:</strong> how modular transformers honor these while removing boilerplate.</p> </div> </div> </div> </div>
680
  <h2 id="-modular-transformers"><a href="#-modular-transformers"><a id="modular"></a> Modular transformers</a></h2>
681
- <p>Transformers is an opinionated library. The previous <a href="https://huggingface.co/docs/transformers/en/philosophy">philosophy</a> page, and the <a href="https://huggingface.co/blog/transformers-design-philosophy">blog post</a> were already pointing at the drawbacks mentioned just above, which have been iteratively addressed. <a href="https://huggingface.co/docs/transformers/en/modular_transformers"><code>modular</code> transformers was introduced</a> to allow a form of inheritance without breaking <span class="glossary-term " style="" data-tooltip-id="tenet-9kn7la3fzhq" onmouseenter="window.showTenetTooltip(event, 'tenet-9kn7la3fzhq')" onmousemove="window.updateTenetTooltip(event, 'tenet-9kn7la3fzhq')" onmouseleave="window.hideTenetTooltip('tenet-9kn7la3fzhq')">the one model, one file rule.</span><span id="tenet-9kn7la3fzhq" class="glossary-tooltip" data-position="top"> <span class="glossary-tooltip__content"> <span class="glossary-tooltip__term">one-model-one-file</span> <span class="glossary-tooltip__definition">All inference and training core logic visible, top‑to‑bottom, in a single file.</span> </span> <span class="glossary-tooltip__arrow"></span> </span> <script>
682
  if (!window.tenetTooltipInitialized) {
683
  window.tenetTooltipInitialized = true;
684
 
@@ -706,7 +706,7 @@ if (!window.tenetTooltipInitialized) {
706
  };
707
  }
708
  </script></p>
709
- <p>We amended the principle of <span class="glossary-term " style="" data-tooltip-id="tenet-3geh90wsft" onmouseenter="window.showTenetTooltip(event, 'tenet-3geh90wsft')" onmousemove="window.updateTenetTooltip(event, 'tenet-3geh90wsft')" onmouseleave="window.hideTenetTooltip('tenet-3geh90wsft')">DRY*</span><span id="tenet-3geh90wsft" class="glossary-tooltip" data-position="top"> <span class="glossary-tooltip__content"> <span class="glossary-tooltip__term">do-repeat-yourself</span> <span class="glossary-tooltip__definition">Strategic duplication can improve readability and maintainability when done thoughtfully.</span> </span> <span class="glossary-tooltip__arrow"></span> </span> <script>
710
  if (!window.tenetTooltipInitialized) {
711
  window.tenetTooltipInitialized = true;
712
 
@@ -869,7 +869,7 @@ The modular file can use inheritance across models: and then, it is unravelled i
869
  <span class="line"><span style="--shiki-light:#6A737D;--shiki-dark:#6A737D"># ... (many more classes and functions would follow)</span></span>
870
  <span class="line"></span></code></pre></div></div> </div> </div> </div> <figcaption class="reference__caption" data-astro-cid-e5g6tzce><strong>Left:</strong> Clean modular definition with inheritance. <strong>Right:</strong> Auto-expanded version with all inherited functionality visible.</figcaption> </figure> </div>
871
  <p>As you can see, we can define a new model as a <em>modular</em> combination of fragments taken from others.</p>
872
- <p>You might think “well that’s just how inheritance works”. The crucial difference is that we do <em>visibly</em> what is essentially the <em>compiler</em>’s job: by unrolling the inheritances, we make visible all of the modeling code, keeping it <span class="glossary-term " style="" data-tooltip-id="tenet-5p5jmuox22h" onmouseenter="window.showTenetTooltip(event, 'tenet-5p5jmuox22h')" onmousemove="window.updateTenetTooltip(event, 'tenet-5p5jmuox22h')" onmouseleave="window.hideTenetTooltip('tenet-5p5jmuox22h')">all in one piece.</span><span id="tenet-5p5jmuox22h" class="glossary-tooltip" data-position="top"> <span class="glossary-tooltip__content"> <span class="glossary-tooltip__term">one-model-one-file</span> <span class="glossary-tooltip__definition">All inference and training core logic visible, top‑to‑bottom, in a single file.</span> </span> <span class="glossary-tooltip__arrow"></span> </span> <script>
873
  if (!window.tenetTooltipInitialized) {
874
  window.tenetTooltipInitialized = true;
875
 
@@ -898,7 +898,7 @@ if (!window.tenetTooltipInitialized) {
898
  }
899
  </script></p>
900
  <p>You can see below the difference between <code>GlmAttention</code> and <code>LlamaAttention</code>, with the latter having been copied with minimal changes.</p>
901
- <div class="wide"> <div class="ri-root" data-ri-root="ri_r0egysgalc7" data-has-caption data-astro-cid-6kov3kig> <figure class="" data-astro-cid-6kov3kig> <img src="/_astro/llama_glm_attn.D7pkKjAT_1axKuC.webp" alt="Llama vs GLM" data-zoomable="1" data-astro-cid-6kov3kig width="2169" height="482" loading="lazy" decoding="async"> <figcaption data-astro-cid-6kov3kig> <span data-astro-cid-6kov3kig><strong>Figure 1:</strong> Comparison of attention implementations between Llama and GLM, showing code reuse with minimal modifications.</span> </figcaption> </figure> </div> <script>
902
  (() => {
903
  const scriptEl = document.currentScript;
904
  const root = scriptEl ? scriptEl.previousElementSibling : null;
@@ -1069,7 +1069,7 @@ if (!window.tenetTooltipInitialized) {
1069
  <p>When <code>AutoModel.from_pretrained(...)</code> is called, it is indeed the modeling (right side) that is run, and all the tests run on the modeling code.</p>
1070
  <p>More importantly, the auto-generated modeling file is what users <em>read</em> to understand the code, what they step through in their debuggers and what they hack for their needs.</p>
1071
  <p>What does that give us?</p>
1072
- <div class="note note--info" data-astro-cid-qg6lmfty> <div class="note__layout" data-astro-cid-qg6lmfty> <div class="note__body" data-astro-cid-qg6lmfty> <div class="note__content" data-astro-cid-qg6lmfty> <p><strong>TL;DR:</strong> A small <code>modular_*.py</code> declares reuse; the expanded modeling file stays visible and <span class="glossary-term " style="" data-tooltip-id="tenet-unvgenigv1f" onmouseenter="window.showTenetTooltip(event, 'tenet-unvgenigv1f')" onmousemove="window.updateTenetTooltip(event, 'tenet-unvgenigv1f')" onmouseleave="window.hideTenetTooltip('tenet-unvgenigv1f')">unique</span><span id="tenet-unvgenigv1f" class="glossary-tooltip" data-position="top"> <span class="glossary-tooltip__content"> <span class="glossary-tooltip__term">one-model-one-file</span> <span class="glossary-tooltip__definition">All inference and training core logic visible, top‑to‑bottom, in a single file.</span> </span> <span class="glossary-tooltip__arrow"></span> </span> <script>
1073
  if (!window.tenetTooltipInitialized) {
1074
  window.tenetTooltipInitialized = true;
1075
 
@@ -1103,7 +1103,7 @@ However, if a model has a modular_<em>.py and a corresponding automatically gene
1103
  <p>That gives an “effective LOC” curve: the 𝗺𝗮𝗶𝗻𝘁𝗲𝗻𝗮𝗻𝗰𝗲 𝘀𝘂𝗿𝗳𝗮𝗰𝗲.</p>
1104
  <p>Measured on git history, raw <code>modeling_*.py</code> grew at ~362 LOC/day before modular; counting only modular shards yields ~25 LOC/day after — about <strong>15× lower</strong>. The effective curve (blue line below) represents the <strong>maintenance surface</strong> today: what maintainers actually read and review.</p>
1105
  <p>Less code to hand-maintain means fewer places to break. Naturally LOC is not a direct measure of complexity, but they correlate in review effort and change risk.</p>
1106
- <figure class="html-embed"><div class="html-embed__card"><div id="frag-ucyel2wxiw"><div class="d3-loc-growth"></div>
1107
  <style>
1108
  .d3-loc-growth { position: relative; }
1109
 
@@ -1524,7 +1524,7 @@ If you zoom in, you’ll notice there’s a sharp drop near the end, it’s esse
1524
  <p>But this was not the only effort that allowed us to reduce maintenance load.</p>
1525
  <p>We recently underwent a deep refactor of the attention implementation. You’ve likely heard about <a href="https://huggingface.co/docs/text-generation-inference/en/conceptual/flash_attention">flash attention</a> and its several variants.</p>
1526
  <p>The <em>attention computation</em> itself happens at a <em>lower</em> level of abstraction than the model itself.</p>
1527
- <p>However, we were adding specific torch operations for each backend (sdpa, the several flash-attention iterations, flex attention) but it isn’t a <span class="glossary-term " style="" data-tooltip-id="tenet-ahxekio8evg" onmouseenter="window.showTenetTooltip(event, 'tenet-ahxekio8evg')" onmousemove="window.updateTenetTooltip(event, 'tenet-ahxekio8evg')" onmouseleave="window.hideTenetTooltip('tenet-ahxekio8evg')">minimal user api</span><span id="tenet-ahxekio8evg" class="glossary-tooltip" data-position="top"> <span class="glossary-tooltip__content"> <span class="glossary-tooltip__term">minimal-user-api</span> <span class="glossary-tooltip__definition">Config, model, preprocessing; from_pretrained, save_pretrained, push_to_hub. Least amount of codepaths.</span> </span> <span class="glossary-tooltip__arrow"></span> </span> <script>
1528
  if (!window.tenetTooltipInitialized) {
1529
  window.tenetTooltipInitialized = true;
1530
 
@@ -1557,7 +1557,7 @@ if (!window.tenetTooltipInitialized) {
1557
  <p>The solution for the “attention abstraction problem” was to move to a standard <a href="https://huggingface.co/docs/transformers/en/attention_interface">attention interface</a> that allows the following:</p>
1558
  <p>The naive implementation of attention, called “eager”, is available by default. We use a <code>Callable</code> called <code>eager_attention_forward</code>, which can run as long as the user has PyTorch installed – which is a requirement any way.</p>
1559
  <p>Instead of using a class interface and a class hierarchy, we just moved to a function interface. When a more complex attention implementation is needed, we use other Callables, including much faster kernel bindings when available. The decision to use a different attention implementation is based on the model configuration file we download from the Hub, and it can also be overridden by the user.</p>
1560
- <p>This is a clear example that we prefer an interface that is <span class="glossary-term " style="" data-tooltip-id="tenet-c5a6chz6c6d" onmouseenter="window.showTenetTooltip(event, 'tenet-c5a6chz6c6d')" onmousemove="window.updateTenetTooltip(event, 'tenet-c5a6chz6c6d')" onmouseleave="window.hideTenetTooltip('tenet-c5a6chz6c6d')">standard, but not abstract</span><span id="tenet-c5a6chz6c6d" class="glossary-tooltip" data-position="top"> <span class="glossary-tooltip__content"> <span class="glossary-tooltip__term">standardize-dont-abstract</span> <span class="glossary-tooltip__definition">Model-specific logic belongs in the model file, not hidden behind abstractions.</span> </span> <span class="glossary-tooltip__arrow"></span> </span> <script>
1561
  if (!window.tenetTooltipInitialized) {
1562
  window.tenetTooltipInitialized = true;
1563
 
@@ -1589,7 +1589,7 @@ if (!window.tenetTooltipInitialized) {
1589
  <span class="line"><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">if</span><span style="--shiki-light:#005CC5;--shiki-dark:#79B8FF"> self</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">.config._attn_implementation </span><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">!=</span><span style="--shiki-light:#032F62;--shiki-dark:#9ECBFF"> &quot;eager&quot;</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">:</span></span>
1590
  <span class="line"><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> attention_interface </span><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">=</span><span style="--shiki-light:#005CC5;--shiki-dark:#79B8FF"> ALL_ATTENTION_FUNCTIONS</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">[</span><span style="--shiki-light:#005CC5;--shiki-dark:#79B8FF">self</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">.config._attn_implementation]</span></span>
1591
  <span class="line"></span></code></pre></div>
1592
- <p>Having the attention interfaces functionalized allows us to do dynamic switching of attentions as well, increasing their <span class="glossary-term " style="" data-tooltip-id="tenet-sjpqlwx647l" onmouseenter="window.showTenetTooltip(event, 'tenet-sjpqlwx647l')" onmousemove="window.updateTenetTooltip(event, 'tenet-sjpqlwx647l')" onmouseleave="window.hideTenetTooltip('tenet-sjpqlwx647l')">hackability</span><span id="tenet-sjpqlwx647l" class="glossary-tooltip" data-position="top"> <span class="glossary-tooltip__content"> <span class="glossary-tooltip__term">code-is-product</span> <span class="glossary-tooltip__definition">Optimize for reading, diffing, and tweaking. Code quality matters as much as functionality.</span> </span> <span class="glossary-tooltip__arrow"></span> </span> <script>
1593
  if (!window.tenetTooltipInitialized) {
1594
  window.tenetTooltipInitialized = true;
1595
 
@@ -1619,7 +1619,7 @@ if (!window.tenetTooltipInitialized) {
1619
  </script>.
1620
  Another strength of the new attention interface is the possibility to enforce specific kwargs, which are needed by kernel providers and other dependencies.</p>
1621
  <p>Backend integrations sometimes require specific kwargs.</p>
1622
- <p>We know that kwargs are often a necessary evil that plagues tools with widespread compatibility; and it is something we have aimed to reduce, and continue to reduce in order to improve readability - with them, the current system is a <span class="glossary-term " style="" data-tooltip-id="tenet-uxs2p0cua4j" onmouseenter="window.showTenetTooltip(event, 'tenet-uxs2p0cua4j')" onmousemove="window.updateTenetTooltip(event, 'tenet-uxs2p0cua4j')" onmouseleave="window.hideTenetTooltip('tenet-uxs2p0cua4j')">minimal user api</span><span id="tenet-uxs2p0cua4j" class="glossary-tooltip" data-position="top"> <span class="glossary-tooltip__content"> <span class="glossary-tooltip__term">minimal-user-api</span> <span class="glossary-tooltip__definition">Config, model, preprocessing; from_pretrained, save_pretrained, push_to_hub. Least amount of codepaths.</span> </span> <span class="glossary-tooltip__arrow"></span> </span> <script>
1623
  if (!window.tenetTooltipInitialized) {
1624
  window.tenetTooltipInitialized = true;
1625
 
@@ -1659,7 +1659,7 @@ if (!window.tenetTooltipInitialized) {
1659
  <p>Why does it matter?</p>
1660
  <p>Because we want to avoid code modifications that are unrelated to the model.</p>
1661
  <p>We choose to place the level of abstraction higher than the device placement: a matrix multiplication - a <code>nn.Linear</code> layer - should be always expressed in the same way, regardless of how it is placed.</p>
1662
- <p>Hence, we want to touch the modeling code <span class="glossary-term " style="" data-tooltip-id="tenet-qdy0wa2dou9" onmouseenter="window.showTenetTooltip(event, 'tenet-qdy0wa2dou9')" onmousemove="window.updateTenetTooltip(event, 'tenet-qdy0wa2dou9')" onmouseleave="window.hideTenetTooltip('tenet-qdy0wa2dou9')">as little as possible</span><span id="tenet-qdy0wa2dou9" class="glossary-tooltip" data-position="top"> <span class="glossary-tooltip__content"> <span class="glossary-tooltip__term">minimal-user-api</span> <span class="glossary-tooltip__definition">Config, model, preprocessing; from_pretrained, save_pretrained, push_to_hub. Least amount of codepaths.</span> </span> <span class="glossary-tooltip__arrow"></span> </span> <script>
1663
  if (!window.tenetTooltipInitialized) {
1664
  window.tenetTooltipInitialized = true;
1665
 
@@ -1736,7 +1736,7 @@ if (!window.tenetTooltipInitialized) {
1736
  <span class="line"><span style="--shiki-light:#032F62;--shiki-dark:#9ECBFF"> &quot;full_attention&quot;</span></span>
1737
  <span class="line"><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> ],</span></span>
1738
  <span class="line"></span></code></pre></div>
1739
- <p>This is <span class="glossary-term " style="" data-tooltip-id="tenet-r267pg2wbvr" onmouseenter="window.showTenetTooltip(event, 'tenet-r267pg2wbvr')" onmousemove="window.updateTenetTooltip(event, 'tenet-r267pg2wbvr')" onmouseleave="window.hideTenetTooltip('tenet-r267pg2wbvr')">minimal</span><span id="tenet-r267pg2wbvr" class="glossary-tooltip" data-position="top"> <span class="glossary-tooltip__content"> <span class="glossary-tooltip__term">minimal-user-api</span> <span class="glossary-tooltip__definition">Config, model, preprocessing; from_pretrained, save_pretrained, push_to_hub. Least amount of codepaths.</span> </span> <span class="glossary-tooltip__arrow"></span> </span> <script>
1740
  if (!window.tenetTooltipInitialized) {
1741
  window.tenetTooltipInitialized = true;
1742
 
@@ -1766,7 +1766,7 @@ if (!window.tenetTooltipInitialized) {
1766
  </script> to implement on the user side, and allows to keep the modeling code untouched. It is also easy to tweak.</p>
1767
  <div class="note note--info" data-astro-cid-qg6lmfty> <div class="note__layout" data-astro-cid-qg6lmfty> <div class="note__body" data-astro-cid-qg6lmfty> <div class="note__content" data-astro-cid-qg6lmfty> <p>Allowed layer types are explicit; schedules (e.g., sliding/full alternation) live in config. This keeps the file readable and easy to tweak.</p><p><strong>Next:</strong> speedups come from kernels that don’t change semantics.</p> </div> </div> </div> </div>
1768
  <h3 id="community-kernels"><a href="#community-kernels"><a id="community-kernels"></a>Community Kernels</a></h3>
1769
- <p>The same principle extends to normalization, activation, and other code paths. The model defines <strong>semantics</strong>; a kernel defines <strong>how</strong> to execute them faster. We annotate the module to borrow a community‑provided forward, keeping a <span class="glossary-term " style="" data-tooltip-id="tenet-c5px9hp60fn" onmouseenter="window.showTenetTooltip(event, 'tenet-c5px9hp60fn')" onmousemove="window.updateTenetTooltip(event, 'tenet-c5px9hp60fn')" onmouseleave="window.hideTenetTooltip('tenet-c5px9hp60fn')">consistent public surface</span><span id="tenet-c5px9hp60fn" class="glossary-tooltip" data-position="top"> <span class="glossary-tooltip__content"> <span class="glossary-tooltip__term">consistent-public-surface</span> <span class="glossary-tooltip__definition">Uniform naming, signatures, and conventions across all models for predictability.</span> </span> <span class="glossary-tooltip__arrow"></span> </span> <script>
1770
  if (!window.tenetTooltipInitialized) {
1771
  window.tenetTooltipInitialized = true;
1772
 
@@ -1814,7 +1814,7 @@ So I want to take a look at the current <strong>state of modularity</strong> acr
1814
  <p>So what do we see?</p>
1815
  <p>(Graph reading guide: nodes are models; edges are modular imports).</p>
1816
  <p>Check out the <a href="https://huggingface.co/spaces/Molbap/transformers-modular-refactor">full viewer here</a> (tab “dependency graph”, hit “build graph”) for better manipulation and exploration.</p>
1817
- <div class="wide"> <figure class="html-embed"><div class="html-embed__card is-frameless"><div id="frag-9iivci8cass"><iframe
1818
  src="https://molbap-dependencies-1.hf.space"
1819
  style="width:100%; height:680px; border:0"
1820
  allow="clipboard-read; clipboard-write; fullscreen"
@@ -1822,7 +1822,7 @@ referrerpolicy="no-referrer-when-downgrade"
1822
  ></iframe></div></div></figure> </div>
1823
  <p>Let’s walk through some sections of this graph together.
1824
  First, Llama is a basis and an influence for many models, and it is very visible.</p>
1825
- <div class="ri-root" data-ri-root="ri_2om34k6gbvj" data-has-caption data-astro-cid-6kov3kig> <figure class="" data-astro-cid-6kov3kig> <img src="/_astro/llama_center.CbQ5MyAc_ZraNCd.webp" alt="Llama in the center" data-zoomable="1" data-astro-cid-6kov3kig width="1030" height="1015" loading="lazy" decoding="async"> <figcaption data-astro-cid-6kov3kig> <span data-astro-cid-6kov3kig><strong>Figure 2:</strong> Llama as a central model influencing many other models in the dependency graph.</span> </figcaption> </figure> </div> <script>
1826
  (() => {
1827
  const scriptEl = document.currentScript;
1828
  const root = scriptEl ? scriptEl.previousElementSibling : null;
@@ -1991,7 +1991,7 @@ First, Llama is a basis and an influence for many models, and it is very visible
1991
  </script>
1992
  <p>The models linked sometimes pull components from other models than <code>llama</code> of course. Radically different architectures such as mamba have spawned their own dependency subgraph.</p>
1993
  <p>Audio models form sparser archipelagos, see for instance wav2vec2 which is a significant basis for a dozen of them.</p>
1994
- <div class="ri-root" data-ri-root="ri_t4m3ao383y" data-has-caption data-astro-cid-6kov3kig> <figure class="" data-astro-cid-6kov3kig> <img src="/_astro/cluster_wave2vec2.BvHBUP61_Z2vdkmW.webp" alt="Wav2vec2 influence" data-zoomable="1" data-astro-cid-6kov3kig width="608" height="563" loading="lazy" decoding="async"> <figcaption data-astro-cid-6kov3kig> <span data-astro-cid-6kov3kig><strong>Figure 3:</strong> Cluster of audio architectures based on wav2vec2, forming a specialized archipelago.</span> </figcaption> </figure> </div> <script>
1995
  (() => {
1996
  const scriptEl = document.currentScript;
1997
  const root = scriptEl ? scriptEl.previousElementSibling : null;
@@ -2160,7 +2160,7 @@ First, Llama is a basis and an influence for many models, and it is very visible
2160
  </script>
2161
  <p>In the case of VLMs which have massively grown in popularity since 2024, there are far too many vision-based architectures that are not yet defined as modulars of other existing archs. In other words, there is no strong reference point in terms of software for vision models.</p>
2162
  <p>As you can see, there is a small <code>DETR</code> island:</p>
2163
- <div class="ri-root" data-ri-root="ri_z9a3knabcy" data-has-caption data-astro-cid-6kov3kig> <figure class="" data-astro-cid-6kov3kig> <img src="/_astro/detr_island.CSrqELWy_1IAFDR.webp" alt="DETR archipelago" data-zoomable="1" data-astro-cid-6kov3kig width="591" height="606" loading="lazy" decoding="async"> <figcaption data-astro-cid-6kov3kig> <span data-astro-cid-6kov3kig><strong>Figure 4:</strong> Small DETR archipelago for vision models, less centralized than Llama for text.</span> </figcaption> </figure> </div> <script>
2164
  (() => {
2165
  const scriptEl = document.currentScript;
2166
  const root = scriptEl ? scriptEl.previousElementSibling : null;
@@ -2329,7 +2329,7 @@ First, Llama is a basis and an influence for many models, and it is very visible
2329
  </script>
2330
  <p>There is also a little llava pocket, and so on, but it’s not comparable to the centrality observed for llama.</p>
2331
  <p>Another problem is, this visualization only shows <code>modular</code> models. Several models still do NOT have a modular file. If we zoom out significantly, we can see them: the red nodes are models that do not have a modular file yet.</p>
2332
- <div class="ri-root" data-ri-root="ri_a89st80qoma" data-has-caption data-astro-cid-6kov3kig> <figure class="" data-astro-cid-6kov3kig> <img src="/_astro/big_picture_zoomout.BKwXtSkj_1bNS6U.webp" alt="Red nodes" data-zoomable="1" data-astro-cid-6kov3kig width="1043" height="972" loading="lazy" decoding="async"> <figcaption data-astro-cid-6kov3kig> <span data-astro-cid-6kov3kig><strong>Figure 5:</strong> Overview showing red nodes (models without modular files) to be modularized.</span> </figcaption> </figure> </div> <script>
2333
  (() => {
2334
  const scriptEl = document.currentScript;
2335
  const root = scriptEl ? scriptEl.previousElementSibling : null;
@@ -2503,14 +2503,14 @@ First, Llama is a basis and an influence for many models, and it is very visible
2503
  <p>Next, I looked into Jaccard similarity, which we use to measure set differences. I know that code is more than a set of characters strung together. I also used code embedding models to check out code similarities, and it yielded better results; for the needs of this blog post I will stick to the Jaccard index.</p>
2504
  <p>It is interesting, for that, to look at <em>when</em> we deployed this modular logic and what was its ripple effect on the library. You can check the <a href="https://huggingface.co/spaces/Molbap/transformers-modular-refactor">larger space</a> to play around, but the gist is: adding modular allowed us to connect more and more models to solid reference points. We have a lot of gaps to fill in still.</p>
2505
  <p>Zoom out below - it’s full of models. You can click on a node to see its connections better, or use the text box to search for a model. You can use the <a href="https://huggingface.co/spaces/Molbap/transformers-modular-refactor">full viewer</a> (tab “timeline”, hit “build timeline”) for better exploration.</p>
2506
- <div class="wide"> <figure class="html-embed"><div class="html-embed__card is-frameless"><div id="frag-pcs7vltpqzd"> <iframe
2507
  src="https://molbap-timeline-1.hf.space"
2508
  style="width:100%; height:680px; border:0"
2509
  allow="clipboard-read; clipboard-write; fullscreen"
2510
  referrerpolicy="no-referrer-when-downgrade"
2511
  ></iframe></div></div></figure> </div>
2512
  <p>Let’s look at a few highly connected models. Let’s start with the foundational work of <a href="https://arxiv.org/abs/2304.08485">Llava</a>.</p>
2513
- <div class="ri-root" data-ri-root="ri_6sus0h798um" data-has-caption data-astro-cid-6kov3kig> <figure class="" data-astro-cid-6kov3kig> <img src="/_astro/timeline_llava.Bne5RSo9_Z26WEKX.webp" alt="Llava in its timeline" data-zoomable="1" data-astro-cid-6kov3kig width="1250" height="770" loading="lazy" decoding="async"> <figcaption data-astro-cid-6kov3kig> <span data-astro-cid-6kov3kig><strong>Figure 6:</strong> LLaVA and its variants in the timeline, with llava_video as a candidate for modularization.</span> </figcaption> </figure> </div> <script>
2514
  (() => {
2515
  const scriptEl = document.currentScript;
2516
  const root = scriptEl ? scriptEl.previousElementSibling : null;
@@ -2677,7 +2677,7 @@ First, Llama is a basis and an influence for many models, and it is very visible
2677
  else window.addEventListener("load", initZoomIfNeeded, { once: true });
2678
  })();
2679
  </script>
2680
- <p>You see that <code>llava_video</code> is a red node, connected by a red edge to <code>llava</code>: it’s a candidate, something that we can <em>likely</em> remodularize, <span class="glossary-term " style="" data-tooltip-id="tenet-zse5l3kirdh" onmouseenter="window.showTenetTooltip(event, 'tenet-zse5l3kirdh')" onmousemove="window.updateTenetTooltip(event, 'tenet-zse5l3kirdh')" onmouseleave="window.hideTenetTooltip('tenet-zse5l3kirdh')">not touching the actual model</span><span id="tenet-zse5l3kirdh" class="glossary-tooltip" data-position="top"> <span class="glossary-tooltip__content"> <span class="glossary-tooltip__term">backwards-compatibility</span> <span class="glossary-tooltip__definition">Any artifact once on the hub must remain loadable. Breaking changes are unacceptable.</span> </span> <span class="glossary-tooltip__arrow"></span> </span> <script>
2681
  if (!window.tenetTooltipInitialized) {
2682
  window.tenetTooltipInitialized = true;
2683
 
@@ -2704,7 +2704,7 @@ if (!window.tenetTooltipInitialized) {
2704
  tooltip.style.opacity = '0';
2705
  };
2706
  }
2707
- </script> but being much more readable with <span class="glossary-term " style="" data-tooltip-id="tenet-u7bhzgqp8l" onmouseenter="window.showTenetTooltip(event, 'tenet-u7bhzgqp8l')" onmousemove="window.updateTenetTooltip(event, 'tenet-u7bhzgqp8l')" onmouseleave="window.hideTenetTooltip('tenet-u7bhzgqp8l')">DRY*</span><span id="tenet-u7bhzgqp8l" class="glossary-tooltip" data-position="top"> <span class="glossary-tooltip__content"> <span class="glossary-tooltip__term">do-repeat-yourself</span> <span class="glossary-tooltip__definition">Strategic duplication can improve readability and maintainability when done thoughtfully.</span> </span> <span class="glossary-tooltip__arrow"></span> </span> <script>
2708
  if (!window.tenetTooltipInitialized) {
2709
  window.tenetTooltipInitialized = true;
2710
 
@@ -2734,7 +2734,7 @@ if (!window.tenetTooltipInitialized) {
2734
  </script>.</p>
2735
  <p>The same can be identified with the classical encoders family, centered on <code>BERT</code>:</p>
2736
  <p>Here <code>roberta</code>, <code>xlm_roberta</code>, <code>ernie</code> are <code>modular</code>s of BERT, while models like <code>mobilebert</code> are likely candidates.</p>
2737
- <div class="ri-root" data-ri-root="ri_iwi4nbt0ifj" data-has-caption data-astro-cid-6kov3kig> <figure class="" data-astro-cid-6kov3kig> <img src="/_astro/classic_encoders.BSgQl9lp_3OtlT.webp" alt="Classical encoders" data-zoomable="1" data-astro-cid-6kov3kig width="1274" height="749" loading="lazy" decoding="async"> <figcaption data-astro-cid-6kov3kig> <span data-astro-cid-6kov3kig><strong>Figure 7:</strong> Family of classical encoders centered on BERT, with several models already modularized.</span> </figcaption> </figure> </div> <script>
2738
  (() => {
2739
  const scriptEl = document.currentScript;
2740
  const root = scriptEl ? scriptEl.previousElementSibling : null;
@@ -2908,7 +2908,7 @@ if (!window.tenetTooltipInitialized) {
2908
  <div class="code-card"><button class="code-copy button--ghost" type="button" aria-label="Copy code"><svg viewBox="0 0 24 24" aria-hidden="true" focusable="false"><path d="M16 1H4c-1.1 0-2 .9-2 2v12h2V3h12V1zm3 4H8c-1.1 0-2 .9-2 2v14c0 1.1.9 2 2 2h11c1.1 0 2-.9 2-2V7c0-1.1-.9-2-2-2zm0 16H8V7h11v14z"></path></svg></button><pre class="astro-code astro-code-themes github-light github-dark" style="--shiki-light:#24292e;--shiki-dark:#e1e4e8;--shiki-light-bg:#fff;--shiki-dark-bg:#24292e;overflow-x:auto" tabindex="0" data-language="python"><code><span class="line"><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">class</span><span style="--shiki-light:#6F42C1;--shiki-dark:#B392F0"> InputsEmbeddingMixerMixin</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">(</span><span style="--shiki-light:#6F42C1;--shiki-dark:#B392F0">nn</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">.</span><span style="--shiki-light:#6F42C1;--shiki-dark:#B392F0">Module</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">):</span></span>
2909
  <span class="line"><span style="--shiki-light:#6A737D;--shiki-dark:#6A737D"> #</span></span>
2910
  <span class="line"></span></code></pre></div>
2911
- <p>But this is <span class="glossary-term " style="" data-tooltip-id="tenet-36b3cbmnfad" onmouseenter="window.showTenetTooltip(event, 'tenet-36b3cbmnfad')" onmousemove="window.updateTenetTooltip(event, 'tenet-36b3cbmnfad')" onmouseleave="window.hideTenetTooltip('tenet-36b3cbmnfad')">not an abstraction</span><span id="tenet-36b3cbmnfad" class="glossary-tooltip" data-position="top"> <span class="glossary-tooltip__content"> <span class="glossary-tooltip__term">standardize-dont-abstract</span> <span class="glossary-tooltip__definition">Model-specific logic belongs in the model file, not hidden behind abstractions.</span> </span> <span class="glossary-tooltip__arrow"></span> </span> <script>
2912
  if (!window.tenetTooltipInitialized) {
2913
  window.tenetTooltipInitialized = true;
2914
 
@@ -2938,7 +2938,7 @@ if (!window.tenetTooltipInitialized) {
2938
  </script>. Embedding mixin is part of the model, removing it would break it. A user opening <a href="https://github.com/huggingface/transformers/blob/b3bd815786c36f4e6c3791fae0a96cac86658b32/src/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py#L1358"><code>modeling_qwen2.5_vl</code></a> (check out the <a href="https://huggingface.co/collections/Qwen/qwen25-vl-6795ffac22b334a837c0f9a5">Qwen2.5VL collection</a>) should not have to go to another file to understand how it works.</p>
2939
  <p>What is the current state of these “abstractions” across the codebase?
2940
  You will see all the imports around a modeling file, here <a href="https://huggingface.co/google/gemma-3n-E4B-it">Gemma3n</a>.</p>
2941
- <div class="ri-root" data-ri-root="ri_ubhninn8exh" data-has-caption data-astro-cid-6kov3kig> <figure class="" data-astro-cid-6kov3kig> <img src="/_astro/still_graph_bloat.BII6Am4a_Z2d5KVT.webp" alt="Gemma3n graph" data-zoomable="1" data-astro-cid-6kov3kig width="2580" height="1207" loading="lazy" decoding="async"> <figcaption data-astro-cid-6kov3kig> <span data-astro-cid-6kov3kig><strong>Figure 8:</strong> Gemma3n import graph showing dependency complexity, with GenerationMixin very central.</span> </figcaption> </figure> </div> <script>
2942
  (() => {
2943
  const scriptEl = document.currentScript;
2944
  const root = scriptEl ? scriptEl.previousElementSibling : null;
@@ -3148,7 +3148,7 @@ That means every decision we make to abstract something else has to be extremely
3148
  <span class="line"></span>
3149
  <span class="line"><span style="--shiki-light:#D73A49;--shiki-dark:#F97583"> return</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> special_image_mask, special_video_mask</span></span>
3150
  <span class="line"></span></code></pre></div>
3151
- <p>But this is <em>within</em> the modeling file, not in the <code>PreTrainedModel</code> base class. It does not move away from it, because it’d break the <span class="glossary-term " style="" data-tooltip-id="tenet-rvdzvy9xioc" onmouseenter="window.showTenetTooltip(event, 'tenet-rvdzvy9xioc')" onmousemove="window.updateTenetTooltip(event, 'tenet-rvdzvy9xioc')" onmouseleave="window.hideTenetTooltip('tenet-rvdzvy9xioc')">One model, one file tenet.</span><span id="tenet-rvdzvy9xioc" class="glossary-tooltip" data-position="top"> <span class="glossary-tooltip__content"> <span class="glossary-tooltip__term">one-model-one-file</span> <span class="glossary-tooltip__definition">All inference and training core logic visible, top‑to‑bottom, in a single file.</span> </span> <span class="glossary-tooltip__arrow"></span> </span> <script>
3152
  if (!window.tenetTooltipInitialized) {
3153
  window.tenetTooltipInitialized = true;
3154
 
@@ -3182,7 +3182,7 @@ if (!window.tenetTooltipInitialized) {
3182
  <h3 id="on-image-processing-and-processors"><a href="#on-image-processing-and-processors">On image processing and processors</a></h3>
3183
  <p>Deciding to become a <code>torch</code>-first library meant shedding a tremendous amount of support for <code>jax </code> and <code>TensorFlow</code>, and it also meant that we could be more lenient about the amount of torch-dependent utilities that we were able to accept. One of these is the <em>fast processing</em> of images. Where inputs were once minimally assumed to be ndarrays, enforcing native <code>torch</code> and <code>torchvision</code> inputs allowed us to massively improve processing speed for each model.</p>
3184
  <p>The gains in performance are immense, up to 20x speedup for most models when using compiled torchvision ops. Furthermore, it lets us run the whole pipeline solely on the GPU.</p>
3185
- <div class="ri-root" data-ri-root="ri_3tz2otjw5ur" data-has-caption data-astro-cid-6kov3kig> <figure class="" data-astro-cid-6kov3kig> <img src="/_astro/fast_image_processors.D3x5vY3o_2cacGa.webp" alt="Fast Image Processors Performance" data-zoomable="1" data-astro-cid-6kov3kig width="2251" height="2409" loading="lazy" decoding="async"> <figcaption data-astro-cid-6kov3kig> <span data-astro-cid-6kov3kig><strong>Figure 9:</strong> Performance gains of fast image processors, up to 20x acceleration with compiled torchvision.</span> </figcaption> </figure> </div> <script>
3186
  (() => {
3187
  const scriptEl = document.currentScript;
3188
  const root = scriptEl ? scriptEl.previousElementSibling : null;
@@ -3358,7 +3358,7 @@ if (!window.tenetTooltipInitialized) {
3358
  <div class="note note--info" data-astro-cid-qg6lmfty> <div class="note__layout" data-astro-cid-qg6lmfty> <div class="note__body" data-astro-cid-qg6lmfty> <div class="note__content" data-astro-cid-qg6lmfty> <p>The shape of a contribution: add a model (or variant) with a small modular shard; the community and serving stacks pick it up immediately. Popularity trends (encoders/embeddings) guide where we invest.</p><p><strong>Next:</strong> power tools enabled by a consistent API.</p> </div> </div> </div> </div>
3359
  <h3 id="-models-popularity"><a href="#-models-popularity"><a id="encoders-ftw"></a> Models popularity</a></h3>
3360
  <p>Talking about dependencies, we can take a look at the number of downloads as a measure of popularity. One thing we see is the prominence of encoders, despite the apparent prevalence of decoder LLMs. The reason is that encoders are used to generate embeddings, which have multiple downstream uses. Just check out <a href="https://huggingface.co/blog/embeddinggemma">EmbeddingGemma</a> for a modern recap. Hence, it is vital to keep the encoders portion of the library viable, usable, fine-tunable.</p>
3361
- <div><figure class="html-embed"><div class="html-embed__card"><div id="frag-9u1amg9s1vl"><div class="d3-model-popularity"></div>
3362
  <style>
3363
  .d3-model-popularity .controls {
3364
  margin-top: 0;
@@ -3934,7 +3934,7 @@ if (!window.tenetTooltipInitialized) {
3934
  </script>
3935
  </div></div></figure></div>
3936
  <p>As the codebase grows, we need to maintain it in coordination with our friend <a href="https://huggingface.co/sentence-transformers">Sentence Transformers codebase</a>. Retrieval use-cases, smart databases, FAISS-based indexing rely on it, and thus indirectly on transformers.</p>
3937
- <p>In that regard, we DO want to be a modular toolbox, being <span class="glossary-term " style="" data-tooltip-id="tenet-dt98i1rub1" onmouseenter="window.showTenetTooltip(event, 'tenet-dt98i1rub1')" onmousemove="window.updateTenetTooltip(event, 'tenet-dt98i1rub1')" onmouseleave="window.hideTenetTooltip('tenet-dt98i1rub1')">minimal</span><span id="tenet-dt98i1rub1" class="glossary-tooltip" data-position="top"> <span class="glossary-tooltip__content"> <span class="glossary-tooltip__term">minimal-user-api</span> <span class="glossary-tooltip__definition">Config, model, preprocessing; from_pretrained, save_pretrained, push_to_hub. Least amount of codepaths.</span> </span> <span class="glossary-tooltip__arrow"></span> </span> <script>
3938
  if (!window.tenetTooltipInitialized) {
3939
  window.tenetTooltipInitialized = true;
3940
 
@@ -3970,7 +3970,7 @@ if (!window.tenetTooltipInitialized) {
3970
  <p>All models have the same API for attention computation, thanks to <a href="#external-attention-classes">the externalisation of attention classes</a>.</p>
3971
  <p>This uniformity allows us to build cool tools to visualize the inner workings of the attention mechanism.</p>
3972
  <p>One particular piece of machinery is the <code>attention mask</code>. Here you see the famous bidirectional attention pattern for the whole prefix (text + image) in PaliGemma and all Gemma2+ models, contrasting with the usual “causal-only” models.</p>
3973
- <figure class="html-embed"><div class="html-embed__card is-frameless"><div id="frag-c6v5zoguff"><!-- Minimal HTML fragment: terminal-style ASCII attention masks -->
3974
  <div style="max-width: 940px; margin: 16px 0; border:1px solid #2a2f3a; border-radius:8px; background:#0b0f19; font-family: ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas, 'Liberation Mono', 'Courier New', monospace; color:#e5e7eb;">
3975
  <div style="display:flex; align-items:center; gap:8px; padding:8px 10px; border-bottom:1px solid #1f2430; background:#111827; border-top-left-radius:8px; border-top-right-radius:8px;">
3976
  <span style="width:10px; height:10px; background:#ef4444; border-radius:50%; display:inline-block;"></span>
@@ -4018,7 +4018,7 @@ if (!window.tenetTooltipInitialized) {
4018
  <div class="note note--info" data-astro-cid-qg6lmfty> <div class="note__layout" data-astro-cid-qg6lmfty> <div class="note__body" data-astro-cid-qg6lmfty> <div class="note__content" data-astro-cid-qg6lmfty> <p>Uniform attention APIs enable cross-model diagnostics (e.g., PaliGemma prefix bidirectionality vs causal).</p><p><strong>Next:</strong> whole-model tracing for ports and regressions.</p> </div> </div> </div> </div>
4019
  <h3 id="logging-entire-model-activations"><a href="#logging-entire-model-activations">Logging entire model activations</a></h3>
4020
  <p>Because everything is PyTorch, we can easily <a href="https://huggingface.co/docs/transformers/internal/model_debugging_utils">debug any model</a> when we want to add it to transformers. We now have a power-user tool for porting or adding models, that wraps a forward pass, intercepts every submodule call, and logs shapes, dtypes, and sample statistics of inputs/outputs to nested JSON.</p>
4021
- <p>It just works with PyTorch models and is especially useful when aligning outputs with a reference implementation, to match our <span class="glossary-term " style="" data-tooltip-id="tenet-fcbcszg3k9h" onmouseenter="window.showTenetTooltip(event, 'tenet-fcbcszg3k9h')" onmousemove="window.updateTenetTooltip(event, 'tenet-fcbcszg3k9h')" onmouseleave="window.hideTenetTooltip('tenet-fcbcszg3k9h')">Source of Truth guideline</span><span id="tenet-fcbcszg3k9h" class="glossary-tooltip" data-position="top"> <span class="glossary-tooltip__content"> <span class="glossary-tooltip__term">source-of-truth</span> <span class="glossary-tooltip__definition">Model implementations should be reliable, reproducible, and faithful to original performances.</span> </span> <span class="glossary-tooltip__arrow"></span> </span> <script>
4022
  if (!window.tenetTooltipInitialized) {
4023
  window.tenetTooltipInitialized = true;
4024
 
@@ -4046,7 +4046,7 @@ if (!window.tenetTooltipInitialized) {
4046
  };
4047
  }
4048
  </script>.</p>
4049
- <div class="wide"> <div class="ri-root" data-ri-root="ri_qp16ts48ezb" data-has-caption data-astro-cid-6kov3kig> <figure class="" data-astro-cid-6kov3kig> <img src="/_astro/model_debugger.DouWEpKv_Z2338YE.webp" alt="Model debugger interface" data-zoomable="1" data-astro-cid-6kov3kig width="2053" height="1016" loading="lazy" decoding="async"> <figcaption data-astro-cid-6kov3kig> <span data-astro-cid-6kov3kig><strong>Figure 10:</strong> Model debugger interface intercepting calls and logging statistics in nested JSON.</span> </figcaption> </figure> </div> <script>
4050
  (() => {
4051
  const scriptEl = document.currentScript;
4052
  const root = scriptEl ? scriptEl.previousElementSibling : null;
@@ -4215,7 +4215,7 @@ if (!window.tenetTooltipInitialized) {
4215
  </script> </div>
4216
  <div class="note note--info" data-astro-cid-qg6lmfty> <div class="note__layout" data-astro-cid-qg6lmfty> <div class="note__body" data-astro-cid-qg6lmfty> <div class="note__content" data-astro-cid-qg6lmfty> <p>Forward interception and nested JSON logging align ports to reference implementations, reinforcing “Source of Truth.”</p><p><strong>Next:</strong> CUDA warmup reduces load-time without touching modeling semantics.</p> </div> </div> </div> </div>
4217
  <h3 id="cooking-faster-cuda-warmups"><a href="#cooking-faster-cuda-warmups">Cooking faster CUDA warmups</a></h3>
4218
- <p>Having a clean <em>external</em> API allows us to work on the <span class="glossary-term " style="" data-tooltip-id="tenet-rtw2agqta2i" onmouseenter="window.showTenetTooltip(event, 'tenet-rtw2agqta2i')" onmousemove="window.updateTenetTooltip(event, 'tenet-rtw2agqta2i')" onmouseleave="window.hideTenetTooltip('tenet-rtw2agqta2i')">true inner workings of transformers</span><span id="tenet-rtw2agqta2i" class="glossary-tooltip" data-position="top"> <span class="glossary-tooltip__content"> <span class="glossary-tooltip__term">code-is-product</span> <span class="glossary-tooltip__definition">Optimize for reading, diffing, and tweaking. Code quality matters as much as functionality.</span> </span> <span class="glossary-tooltip__arrow"></span> </span> <script>
4219
  if (!window.tenetTooltipInitialized) {
4220
  window.tenetTooltipInitialized = true;
4221
 
@@ -4243,7 +4243,7 @@ if (!window.tenetTooltipInitialized) {
4243
  };
4244
  }
4245
  </script>. One of a few recent additions is the <em>CUDA warmup</em> via <code>caching_allocator_warmup</code>, which dramatically improves loading times by pre-allocating GPU memory to avoid malloc bottlenecks during model loading. It can achieve a 7x speedup factor for an 8B model, or 6x for a 32B one, as you can check in <a href="https://github.com/huggingface/transformers/pull/36380">the PR</a>!</p>
4246
- <div class="wide"> <figure class="html-embed"><figcaption class="html-embed__title" style="text-align:center">Mem allocation patterns during model loading</figcaption><div class="html-embed__card"><div id="frag-p5cjipn59zo"><div class="d3-warmup-demo"></div>
4247
  <style>
4248
  .d3-warmup-demo {
4249
  font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
@@ -4896,7 +4896,7 @@ if (!window.tenetTooltipInitialized) {
4896
  <li>having it immediately usable in vLLM, <a href="https://huggingface.co/blog/transformers-backend-sglang">SGLang</a>, and so on without additional code. In the case of vLLM, transformers was added as a backend to run models on vLLM, which optimizes throughput/latency on top of <em>existing</em> transformers architectures <a href="https://blog.vllm.ai/2025/04/11/transformers-backend.html">as seen in this great vLLM x HF blog post.</a></li>
4897
  <li>being the reference code for implementations in MLX, llama.cpp and other libraries.</li>
4898
  </ul>
4899
- <p>This further cements the need for a <span class="glossary-term " style="" data-tooltip-id="tenet-1bht5f89nnv" onmouseenter="window.showTenetTooltip(event, 'tenet-1bht5f89nnv')" onmousemove="window.updateTenetTooltip(event, 'tenet-1bht5f89nnv')" onmouseleave="window.hideTenetTooltip('tenet-1bht5f89nnv')">consistent public surface</span><span id="tenet-1bht5f89nnv" class="glossary-tooltip" data-position="top"> <span class="glossary-tooltip__content"> <span class="glossary-tooltip__term">consistent-public-surface</span> <span class="glossary-tooltip__definition">Uniform naming, signatures, and conventions across all models for predictability.</span> </span> <span class="glossary-tooltip__arrow"></span> </span> <script>
4900
  if (!window.tenetTooltipInitialized) {
4901
  window.tenetTooltipInitialized = true;
4902
 
 
12
  document.documentElement.setAttribute("data-theme", theme);
13
  } catch {}
14
  })();
15
+ </script><script type="module" src="/scripts/color-palettes.js"></script><!-- TO MANAGE PROPERLY --><script src="https://cdn.plot.ly/plotly-3.0.0.min.js" charset="utf-8"></script><link rel="stylesheet" href="/_astro/index.BxhJlHED.css"><script type="module" src="/_astro/hoisted.DK-CdsVg.js"></script>
16
+ <script type="module" src="/_astro/page.CH0W_C1Z.js"></script></head> <body> <button id="theme-toggle" aria-label="Toggle color theme" data-astro-cid-x3pjskd3> <svg class="icon light" width="20" height="20" viewBox="0 0 24 24" aria-hidden="true" focusable="false" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" data-astro-cid-x3pjskd3> <circle cx="12" cy="12" r="5" data-astro-cid-x3pjskd3></circle> <line x1="12" y1="1" x2="12" y2="4" data-astro-cid-x3pjskd3></line> <line x1="12" y1="20" x2="12" y2="23" data-astro-cid-x3pjskd3></line> <line x1="1" y1="12" x2="4" y2="12" data-astro-cid-x3pjskd3></line> <line x1="20" y1="12" x2="23" y2="12" data-astro-cid-x3pjskd3></line> <line x1="4.22" y1="4.22" x2="6.34" y2="6.34" data-astro-cid-x3pjskd3></line> <line x1="17.66" y1="17.66" x2="19.78" y2="19.78" data-astro-cid-x3pjskd3></line> <line x1="4.22" y1="19.78" x2="6.34" y2="17.66" data-astro-cid-x3pjskd3></line> <line x1="17.66" y1="6.34" x2="19.78" y2="4.22" data-astro-cid-x3pjskd3></line> </svg> <svg class="icon dark" width="20" height="20" viewBox="0 0 24 24" aria-hidden="true" focusable="false" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" data-astro-cid-x3pjskd3> <path d="M21 12.79A9 9 0 1 1 11.21 3 7 7 0 0 0 21 12.79z" data-astro-cid-x3pjskd3></path> </svg> </button> <section class="hero" data-astro-cid-bbe6dxrz> <h1 class="hero-title" data-astro-cid-bbe6dxrz>Maintain the unmaintainable:<br/>1M python loc, 400+ models</h1> <div class="hero-banner" data-astro-cid-bbe6dxrz> <figure class="html-embed"><div class="html-embed__card is-frameless"><div id="frag-r882c37wmp"><style>
17
  @import url('https://fonts.googleapis.com/css2?family=Inter:wght@500;600&display=swap');
18
 
19
  .banner-container {
 
463
  <p>We formalize and articulate the “tenets” that have been guiding our development, demonstrate how they are implemented in code, and show the measurable impact they have on the library’s sustainability and growth.</p>
464
  <p>For any OSS maintainer, power user, or contributor, this is the map to understanding, using, and building upon <code>transformers</code>, but not only: any project of comparable size will require you to make deep choices, not only on design and choice of abstraction, but on the very mindset of the software you are building. These tenets may or may not be applicable to your project, but they provide a glimpse on how we work that could be helpful or inspirational.</p>
465
  <p>Conventions used throughout this post:</p>
466
+ <p><span class="glossary-term " style="" data-tooltip-id="tenet-h0i1lsq19gq" onmouseenter="window.showTenetTooltip(event, 'tenet-h0i1lsq19gq')" onmousemove="window.updateTenetTooltip(event, 'tenet-h0i1lsq19gq')" onmouseleave="window.hideTenetTooltip('tenet-h0i1lsq19gq')">Tenets exemplified</span><span id="tenet-h0i1lsq19gq" class="glossary-tooltip" data-position="top"> <span class="glossary-tooltip__content"> <span class="glossary-tooltip__term">source-of-truth</span> <span class="glossary-tooltip__definition">Model implementations should be reliable, reproducible, and faithful to original performances.</span> </span> <span class="glossary-tooltip__arrow"></span> </span> <script>
467
  if (!window.tenetTooltipInitialized) {
468
  window.tenetTooltipInitialized = true;
469
 
 
500
  <p>These principles were not decided in a vacuum. The library <em>evolved</em> towards them, and once they <em>emerged</em>, they were recognized as critical.</p>
501
  <div class="tenet-list"><ol><li class="tenet"><a id="source-of-truth"></a><strong>Source of Truth</strong><p>We aim to be the <a href="https://huggingface.co/blog/transformers-model-definition">source of truth for all model definitions</a>. This is not a tenet, but something that guides our decisions. Model implementations should be reliable, reproducible, and faithful to the original performances.</p><em>This overarching guideline ensures quality and reproducibility across all models in the library.</em></li><li class="tenet"><a id="one-model-one-file"></a><strong>One Model, One File</strong><p>All inference and training core logic has to be visible, top‑to‑bottom, to maximize each model’s hackability.</p><em>Every model should be understandable and hackable by reading a single file from top to bottom.</em></li><li class="tenet"><a id="code-is-product"></a><strong>Code is the Product</strong><p>Optimize for reading, diff-ing, and tweaking, our users are power users. Variables can be explicit, full words, even several words, readability is primordial.</p><em>Code quality matters as much as functionality - optimize for human readers, not just computers.</em></li><li class="tenet"><a id="standardize-dont-abstract"></a><strong>Standardize, Don’t Abstract</strong><p>If it’s model behavior, keep it in the file; abstractions are only for generic infra.</p><em>Model-specific logic belongs in the model file, not hidden behind abstractions.</em></li><li class="tenet"><a id="do-repeat-yourself"></a><strong>DRY* (DO Repeat Yourself)</strong><p>Copy when it helps users; keep successors in sync without centralizing behavior.</p><p><b>Evolution:</b></p><p> With the introduction and global adoption of <a href="#modular">modular</a> transformers, we do not repeat any logic in the modular files, but end user files remain faithful to the original tenet.</p><em>Strategic duplication can improve readability and maintainability when done thoughtfully.</em></li><li class="tenet"><a 
id="minimal-user-api"></a><strong>Minimal User API</strong><p>Config, model, pre-processing; from_pretrained, save_pretrained, push_to_hub. We want the least amount of codepaths. Reading should be obvious, configurations should be obvious.</p><em>Keep the public interface simple and predictable, users should know what to expect.</em></li><li class="tenet"><a id="backwards-compatibility"></a><strong>Backwards Compatibility</strong><p>Evolve by additive standardization, never break public APIs.</p><p>Any artifact that was once on the hub and worked with transformers should be usable indefinitely with the same interface. Further, public methods should not change to avoid breaking dependencies.</p><em>Once something is public, it stays public, evolution through addition, not breaking changes.</em></li><li class="tenet"><a id="consistent-public-surface"></a><strong>Consistent Public Surface</strong><p>Same argument names, same outputs, hidden states and attentions exposed, enforced by tests. This is a goal as well as a tenet.</p><em>All models should feel familiar - consistent interfaces reduce cognitive load.</em></li></ol></div>
502
  <p>When a PR is merged, it is because the contribution is worthwhile, and because the <code>transformers</code> team finds the design of the contribution to be aligned with the tenets.</p>
503
+ <p>Does all the code in the library strictly follow these tenets? No. The library is a gigantic house with connected nooks, corridors, crannies everywhere, built by thousands of different workers. We <em>try</em> to make it so all the code added is compliant, because if we fail and merge it, we cannot change it lest we break <span class="glossary-term " style="" data-tooltip-id="tenet-8lnt56up8pj" onmouseenter="window.showTenetTooltip(event, 'tenet-8lnt56up8pj')" onmousemove="window.updateTenetTooltip(event, 'tenet-8lnt56up8pj')" onmouseleave="window.hideTenetTooltip('tenet-8lnt56up8pj')">backwards compatibility</span><span id="tenet-8lnt56up8pj" class="glossary-tooltip" data-position="top"> <span class="glossary-tooltip__content"> <span class="glossary-tooltip__term">backwards-compatibility</span> <span class="glossary-tooltip__definition">Any artifact once on the hub must remain loadable. Breaking changes are unacceptable.</span> </span> <span class="glossary-tooltip__arrow"></span> </span> <script>
504
  if (!window.tenetTooltipInitialized) {
505
  window.tenetTooltipInitialized = true;
506
 
 
529
  }
530
  </script>.</p>
531
  <p>To see what constitutes adherence to the tenets, let’s take the example of code repetition.</p>
532
+ <p>The following function, essential to the implementation of <a href="https://huggingface.co/papers/2104.09864">Rotary Positional Embeddings</a> can be found in more than 70 <code>modeling_&lt;file&gt;.py</code> across <code>src/transformers/models/.</code> Why keep it? Because we want all the model logic to be <span class="glossary-term " style="" data-tooltip-id="tenet-tcoiwbg2pmd" onmouseenter="window.showTenetTooltip(event, 'tenet-tcoiwbg2pmd')" onmousemove="window.updateTenetTooltip(event, 'tenet-tcoiwbg2pmd')" onmouseleave="window.hideTenetTooltip('tenet-tcoiwbg2pmd')">contained in the modeling file</span><span id="tenet-tcoiwbg2pmd" class="glossary-tooltip" data-position="top"> <span class="glossary-tooltip__content"> <span class="glossary-tooltip__term">one-model-one-file</span> <span class="glossary-tooltip__definition">All inference and training core logic visible, top‑to‑bottom, in a single file.</span> </span> <span class="glossary-tooltip__arrow"></span> </span> <script>
533
  if (!window.tenetTooltipInitialized) {
534
  window.tenetTooltipInitialized = true;
535
 
 
556
  tooltip.style.opacity = '0';
557
  };
558
  }
559
+ </script>. In order to do that, we <span class="glossary-term " style="" data-tooltip-id="tenet-2zku8u6i30g" onmouseenter="window.showTenetTooltip(event, 'tenet-2zku8u6i30g')" onmousemove="window.updateTenetTooltip(event, 'tenet-2zku8u6i30g')" onmouseleave="window.hideTenetTooltip('tenet-2zku8u6i30g')">do repeat ourselves</span><span id="tenet-2zku8u6i30g" class="glossary-tooltip" data-position="top"> <span class="glossary-tooltip__content"> <span class="glossary-tooltip__term">do-repeat-yourself</span> <span class="glossary-tooltip__definition">Strategic duplication can improve readability and maintainability when done thoughtfully.</span> </span> <span class="glossary-tooltip__arrow"></span> </span> <script>
560
  if (!window.tenetTooltipInitialized) {
561
  window.tenetTooltipInitialized = true;
562
 
 
593
  <p>We want all models to have self-contained modeling code.</p>
594
  <p>Each core functionality <em>must</em> be in the modeling code, every non-core functionality <em>can</em> be outside of it.</p>
595
  <p>This comes as a great cost. Enter the <code>#Copied from...</code> mechanism: for a long time, these comments were indicating that some code was copied from another model, saving time both for the reviewers and for the CI. But the LOC count kept creeping up. Each new model copied over hundreds of lines that we considered largely boilerplate, yet, we could not remove them.</p>
596
+ <p>We need to separate both principles that were so far intertwined, <span class="glossary-term " style="" data-tooltip-id="tenet-nm3ehf4q1wb" onmouseenter="window.showTenetTooltip(event, 'tenet-nm3ehf4q1wb')" onmousemove="window.updateTenetTooltip(event, 'tenet-nm3ehf4q1wb')" onmouseleave="window.hideTenetTooltip('tenet-nm3ehf4q1wb')">repetition</span><span id="tenet-nm3ehf4q1wb" class="glossary-tooltip" data-position="top"> <span class="glossary-tooltip__content"> <span class="glossary-tooltip__term">do-repeat-yourself</span> <span class="glossary-tooltip__definition">Strategic duplication can improve readability and maintainability when done thoughtfully.</span> </span> <span class="glossary-tooltip__arrow"></span> </span> <script>
597
  if (!window.tenetTooltipInitialized) {
598
  window.tenetTooltipInitialized = true;
599
 
 
620
  tooltip.style.opacity = '0';
621
  };
622
  }
623
+ </script> and <span class="glossary-term " style="" data-tooltip-id="tenet-n095hv1c5hp" onmouseenter="window.showTenetTooltip(event, 'tenet-n095hv1c5hp')" onmousemove="window.updateTenetTooltip(event, 'tenet-n095hv1c5hp')" onmouseleave="window.hideTenetTooltip('tenet-n095hv1c5hp')">hackability</span><span id="tenet-n095hv1c5hp" class="glossary-tooltip" data-position="top"> <span class="glossary-tooltip__content"> <span class="glossary-tooltip__term">one-model-one-file</span> <span class="glossary-tooltip__definition">All inference and training core logic visible, top‑to‑bottom, in a single file.</span> </span> <span class="glossary-tooltip__arrow"></span> </span> <script>
624
  if (!window.tenetTooltipInitialized) {
625
  window.tenetTooltipInitialized = true;
626
 
 
649
  }
650
  </script>.</p>
651
  <p>What’s the solution to this?</p>
652
+ <div class="note note--info" data-astro-cid-qg6lmfty> <div class="note__layout" data-astro-cid-qg6lmfty> <div class="note__body" data-astro-cid-qg6lmfty> <div class="note__content" data-astro-cid-qg6lmfty> <p><strong>TL;DR:</strong> Read the code in one place, <span class="glossary-term " style="" data-tooltip-id="tenet-ycjaxpey2f" onmouseenter="window.showTenetTooltip(event, 'tenet-ycjaxpey2f')" onmousemove="window.updateTenetTooltip(event, 'tenet-ycjaxpey2f')" onmouseleave="window.hideTenetTooltip('tenet-ycjaxpey2f')">one model, one file.</span><span id="tenet-ycjaxpey2f" class="glossary-tooltip" data-position="top"> <span class="glossary-tooltip__content"> <span class="glossary-tooltip__term">one-model-one-file</span> <span class="glossary-tooltip__definition">All inference and training core logic visible, top‑to‑bottom, in a single file.</span> </span> <span class="glossary-tooltip__arrow"></span> </span> <script>
653
  if (!window.tenetTooltipInitialized) {
654
  window.tenetTooltipInitialized = true;
655
 
 
678
  }
679
  </script>. Keep semantics local (<a href="#standardize-dont-abstract">Standardize, Don’t Abstract</a>). Allow strategic duplication for end users (<a href="#do-repeat-yourself">DRY*</a>). Keep the public surface minimal and stable (<a href="#minimal-user-api">Minimal API</a>, <a href="#backwards-compatibility">Backwards Compatibility</a>, <a href="#consistent-public-surface">Consistent Surface</a>).</p><p><strong>Next:</strong> how modular transformers honor these while removing boilerplate.</p> </div> </div> </div> </div>
680
  <h2 id="-modular-transformers"><a href="#-modular-transformers"><a id="modular"></a> Modular transformers</a></h2>
681
+ <p>Transformers is an opinionated library. The previous <a href="https://huggingface.co/docs/transformers/en/philosophy">philosophy</a> page, and the <a href="https://huggingface.co/blog/transformers-design-philosophy">blog post</a> were already pointing at the drawbacks mentioned just above, which have been iteratively addressed. <a href="https://huggingface.co/docs/transformers/en/modular_transformers"><code>modular</code> transformers was introduced</a> to allow a form of inheritance without breaking <span class="glossary-term " style="" data-tooltip-id="tenet-ntchxzfbyd" onmouseenter="window.showTenetTooltip(event, 'tenet-ntchxzfbyd')" onmousemove="window.updateTenetTooltip(event, 'tenet-ntchxzfbyd')" onmouseleave="window.hideTenetTooltip('tenet-ntchxzfbyd')">the one model, one file rule.</span><span id="tenet-ntchxzfbyd" class="glossary-tooltip" data-position="top"> <span class="glossary-tooltip__content"> <span class="glossary-tooltip__term">one-model-one-file</span> <span class="glossary-tooltip__definition">All inference and training core logic visible, top‑to‑bottom, in a single file.</span> </span> <span class="glossary-tooltip__arrow"></span> </span> <script>
682
  if (!window.tenetTooltipInitialized) {
683
  window.tenetTooltipInitialized = true;
684
 
 
706
  };
707
  }
708
  </script></p>
709
+ <p>We amended the principle of <span class="glossary-term " style="" data-tooltip-id="tenet-xgizzl76tio" onmouseenter="window.showTenetTooltip(event, 'tenet-xgizzl76tio')" onmousemove="window.updateTenetTooltip(event, 'tenet-xgizzl76tio')" onmouseleave="window.hideTenetTooltip('tenet-xgizzl76tio')">DRY*</span><span id="tenet-xgizzl76tio" class="glossary-tooltip" data-position="top"> <span class="glossary-tooltip__content"> <span class="glossary-tooltip__term">do-repeat-yourself</span> <span class="glossary-tooltip__definition">Strategic duplication can improve readability and maintainability when done thoughtfully.</span> </span> <span class="glossary-tooltip__arrow"></span> </span> <script>
710
  if (!window.tenetTooltipInitialized) {
711
  window.tenetTooltipInitialized = true;
712
 
 
869
  <span class="line"><span style="--shiki-light:#6A737D;--shiki-dark:#6A737D"># ... (many more classes and functions would follow)</span></span>
870
  <span class="line"></span></code></pre></div></div> </div> </div> </div> <figcaption class="reference__caption" data-astro-cid-e5g6tzce><strong>Left:</strong> Clean modular definition with inheritance. <strong>Right:</strong> Auto-expanded version with all inherited functionality visible.</figcaption> </figure> </div>
871
  <p>As you can see, we can define a new model as a <em>modular</em> combination of fragments taken from others.</p>
872
+ <p>You might think “well that’s just how inheritance works”. The crucial difference is that we do <em>visibly</em> what is essentially the <em>compiler</em>’s job: by unrolling the inheritances, we make visible all of the modeling code, keeping it <span class="glossary-term " style="" data-tooltip-id="tenet-8mshtouwywo" onmouseenter="window.showTenetTooltip(event, 'tenet-8mshtouwywo')" onmousemove="window.updateTenetTooltip(event, 'tenet-8mshtouwywo')" onmouseleave="window.hideTenetTooltip('tenet-8mshtouwywo')">all in one piece.</span><span id="tenet-8mshtouwywo" class="glossary-tooltip" data-position="top"> <span class="glossary-tooltip__content"> <span class="glossary-tooltip__term">one-model-one-file</span> <span class="glossary-tooltip__definition">All inference and training core logic visible, top‑to‑bottom, in a single file.</span> </span> <span class="glossary-tooltip__arrow"></span> </span> <script>
873
  if (!window.tenetTooltipInitialized) {
874
  window.tenetTooltipInitialized = true;
875
 
 
898
  }
899
  </script></p>
900
  <p>You can see below the difference between <code>GlmAttention</code> and <code>LlamaAttention</code>, with the latter having been copied with minimal changes.</p>
901
+ <div class="wide"> <div class="ri-root" data-ri-root="ri_o33bnl5zurf" data-has-caption data-astro-cid-6kov3kig> <figure class="" data-astro-cid-6kov3kig> <img src="/_astro/llama_glm_attn.D7pkKjAT_1axKuC.webp" alt="Llama vs GLM" data-zoomable="1" data-astro-cid-6kov3kig width="2169" height="482" loading="lazy" decoding="async"> <figcaption data-astro-cid-6kov3kig> <span data-astro-cid-6kov3kig><strong>Figure 1:</strong> Comparison of attention implementations between Llama and GLM, showing code reuse with minimal modifications.</span> </figcaption> </figure> </div> <script>
902
  (() => {
903
  const scriptEl = document.currentScript;
904
  const root = scriptEl ? scriptEl.previousElementSibling : null;
 
1069
  <p>When <code>AutoModel.from_pretrained(...)</code> is called, it is indeed the modeling (right side) that is run, and all the tests run on the modeling code.</p>
1070
  <p>More importantly, the auto-generated modeling file is what users <em>read</em> to understand the code, what they step through in their debuggers and what they hack for their needs.</p>
1071
  <p>What does that give us?</p>
1072
+ <div class="note note--info" data-astro-cid-qg6lmfty> <div class="note__layout" data-astro-cid-qg6lmfty> <div class="note__body" data-astro-cid-qg6lmfty> <div class="note__content" data-astro-cid-qg6lmfty> <p><strong>TL;DR:</strong> A small <code>modular_*.py</code> declares reuse; the expanded modeling file stays visible and <span class="glossary-term " style="" data-tooltip-id="tenet-qaah77d85on" onmouseenter="window.showTenetTooltip(event, 'tenet-qaah77d85on')" onmousemove="window.updateTenetTooltip(event, 'tenet-qaah77d85on')" onmouseleave="window.hideTenetTooltip('tenet-qaah77d85on')">unique</span><span id="tenet-qaah77d85on" class="glossary-tooltip" data-position="top"> <span class="glossary-tooltip__content"> <span class="glossary-tooltip__term">one-model-one-file</span> <span class="glossary-tooltip__definition">All inference and training core logic visible, top‑to‑bottom, in a single file.</span> </span> <span class="glossary-tooltip__arrow"></span> </span> <script>
1073
  if (!window.tenetTooltipInitialized) {
1074
  window.tenetTooltipInitialized = true;
1075
 
 
1103
  <p>That gives an “effective LOC” curve: the 𝗺𝗮𝗶𝗻𝘁𝗲𝗻𝗮𝗻𝗰𝗲 𝘀𝘂𝗿𝗳𝗮𝗰𝗲.</p>
1104
  <p>Measured on git history, raw <code>modeling_*.py</code> grew at ~362 LOC/day before modular; counting only modular shards yields ~25 LOC/day after — about <strong>15× lower</strong>. The effective curve (blue line below) represents the <strong>maintenance surface</strong> today: what maintainers actually read and review.</p>
1105
  <p>Less code to hand-maintain means fewer places to break. Naturally LOC is not a direct measure of complexity, but they correlate in review effort and change risk.</p>
1106
+ <figure class="html-embed"><div class="html-embed__card"><div id="frag-kxf2qw5e6y9"><div class="d3-loc-growth"></div>
1107
  <style>
1108
  .d3-loc-growth { position: relative; }
1109
 
 
1524
  <p>But this was not the only effort that allowed us to reduce maintenance load.</p>
1525
  <p>We recently underwent a deep refactor of the attention implementation. You’ve likely heard about <a href="https://huggingface.co/docs/text-generation-inference/en/conceptual/flash_attention">flash attention</a> and its several variants.</p>
1526
  <p>The <em>attention computation</em> itself happens at a <em>lower</em> level of abstraction than the model itself.</p>
1527
+ <p>However, we were adding specific torch operations for each backend (sdpa, the several flash-attention iterations, flex attention) but it isn’t a <span class="glossary-term " style="" data-tooltip-id="tenet-xu3x0i0cbv" onmouseenter="window.showTenetTooltip(event, 'tenet-xu3x0i0cbv')" onmousemove="window.updateTenetTooltip(event, 'tenet-xu3x0i0cbv')" onmouseleave="window.hideTenetTooltip('tenet-xu3x0i0cbv')">minimal user api</span><span id="tenet-xu3x0i0cbv" class="glossary-tooltip" data-position="top"> <span class="glossary-tooltip__content"> <span class="glossary-tooltip__term">minimal-user-api</span> <span class="glossary-tooltip__definition">Config, model, preprocessing; from_pretrained, save_pretrained, push_to_hub. Least amount of codepaths.</span> </span> <span class="glossary-tooltip__arrow"></span> </span> <script>
1528
  if (!window.tenetTooltipInitialized) {
1529
  window.tenetTooltipInitialized = true;
1530
 
 
1557
  <p>The solution for the “attention abstraction problem” was to move to a standard <a href="https://huggingface.co/docs/transformers/en/attention_interface">attention interface</a> that allows the following:</p>
1558
  <p>The naive implementation of attention, called “eager”, is available by default. We use a <code>Callable</code> called <code>eager_attention_forward</code>, which can run as long as the user has PyTorch installed – which is a requirement any way.</p>
1559
  <p>Instead of using a class interface and a class hierarchy, we just moved to a function interface. When a more complex attention implementation is needed, we use other Callables, including much faster kernel bindings when available. The decision to use a different attention implementation is based on the model configuration file we download from the Hub, and it can also be overridden by the user.</p>
1560
+ <p>This is a clear example that we prefer an interface that is <span class="glossary-term " style="" data-tooltip-id="tenet-dzopnpbm3gg" onmouseenter="window.showTenetTooltip(event, 'tenet-dzopnpbm3gg')" onmousemove="window.updateTenetTooltip(event, 'tenet-dzopnpbm3gg')" onmouseleave="window.hideTenetTooltip('tenet-dzopnpbm3gg')">standard, but not abstract</span><span id="tenet-dzopnpbm3gg" class="glossary-tooltip" data-position="top"> <span class="glossary-tooltip__content"> <span class="glossary-tooltip__term">standardize-dont-abstract</span> <span class="glossary-tooltip__definition">Model-specific logic belongs in the model file, not hidden behind abstractions.</span> </span> <span class="glossary-tooltip__arrow"></span> </span> <script>
1561
  if (!window.tenetTooltipInitialized) {
1562
  window.tenetTooltipInitialized = true;
1563
 
 
1589
  <span class="line"><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">if</span><span style="--shiki-light:#005CC5;--shiki-dark:#79B8FF"> self</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">.config._attn_implementation </span><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">!=</span><span style="--shiki-light:#032F62;--shiki-dark:#9ECBFF"> &quot;eager&quot;</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">:</span></span>
1590
  <span class="line"><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> attention_interface </span><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">=</span><span style="--shiki-light:#005CC5;--shiki-dark:#79B8FF"> ALL_ATTENTION_FUNCTIONS</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">[</span><span style="--shiki-light:#005CC5;--shiki-dark:#79B8FF">self</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">.config._attn_implementation]</span></span>
1591
  <span class="line"></span></code></pre></div>
1592
+ <p>Having the attention interfaces functionalized allows to do dynamic switching of attentions as well, increasing their <span class="glossary-term " style="" data-tooltip-id="tenet-o5jzp4153mi" onmouseenter="window.showTenetTooltip(event, 'tenet-o5jzp4153mi')" onmousemove="window.updateTenetTooltip(event, 'tenet-o5jzp4153mi')" onmouseleave="window.hideTenetTooltip('tenet-o5jzp4153mi')">hackability</span><span id="tenet-o5jzp4153mi" class="glossary-tooltip" data-position="top"> <span class="glossary-tooltip__content"> <span class="glossary-tooltip__term">code-is-product</span> <span class="glossary-tooltip__definition">Optimize for reading, diffing, and tweaking. Code quality matters as much as functionality.</span> </span> <span class="glossary-tooltip__arrow"></span> </span> <script>
1593
  if (!window.tenetTooltipInitialized) {
1594
  window.tenetTooltipInitialized = true;
1595
 
 
1619
  </script>.
1620
  Another strength of the new attention interface is the possibility to enforce specific kwargs, which are needed by kernel providers and other dependencies.</p>
1621
  <p>Backend integrations sometimes require specific kwargs.</p>
1622
+ <p>We know that kwargs are often a necessary evil that plagues tools with widespread compatibility; and it is something we have aimed to reduce, and continue to reduce in order to improve readability - with them, the current system is a <span class="glossary-term " style="" data-tooltip-id="tenet-fmak91lcmoq" onmouseenter="window.showTenetTooltip(event, 'tenet-fmak91lcmoq')" onmousemove="window.updateTenetTooltip(event, 'tenet-fmak91lcmoq')" onmouseleave="window.hideTenetTooltip('tenet-fmak91lcmoq')">minimal user api</span><span id="tenet-fmak91lcmoq" class="glossary-tooltip" data-position="top"> <span class="glossary-tooltip__content"> <span class="glossary-tooltip__term">minimal-user-api</span> <span class="glossary-tooltip__definition">Config, model, preprocessing; from_pretrained, save_pretrained, push_to_hub. Least amount of codepaths.</span> </span> <span class="glossary-tooltip__arrow"></span> </span> <script>
1623
  if (!window.tenetTooltipInitialized) {
1624
  window.tenetTooltipInitialized = true;
1625
 
 
1659
  <p>Why does it matter?</p>
1660
  <p>Because we want to avoid code modifications that are unrelated to the model.</p>
1661
  <p>We choose to place the level of abstraction higher than the device placement: a matrix multiplication - a <code>nn.Linear</code> layer - should be always expressed in the same way, regardless of how it is placed.</p>
1662
+ <p>Hence, we want to touch the modeling code <span class="glossary-term " style="" data-tooltip-id="tenet-47rl7l9zgyy" onmouseenter="window.showTenetTooltip(event, 'tenet-47rl7l9zgyy')" onmousemove="window.updateTenetTooltip(event, 'tenet-47rl7l9zgyy')" onmouseleave="window.hideTenetTooltip('tenet-47rl7l9zgyy')">as little as possible</span><span id="tenet-47rl7l9zgyy" class="glossary-tooltip" data-position="top"> <span class="glossary-tooltip__content"> <span class="glossary-tooltip__term">minimal-user-api</span> <span class="glossary-tooltip__definition">Config, model, preprocessing; from_pretrained, save_pretrained, push_to_hub. Least amount of codepaths.</span> </span> <span class="glossary-tooltip__arrow"></span> </span> <script>
1663
  if (!window.tenetTooltipInitialized) {
1664
  window.tenetTooltipInitialized = true;
1665
 
 
1736
  <span class="line"><span style="--shiki-light:#032F62;--shiki-dark:#9ECBFF"> &quot;full_attention&quot;</span></span>
1737
  <span class="line"><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> ],</span></span>
1738
  <span class="line"></span></code></pre></div>
1739
+ <p>This is <span class="glossary-term " style="" data-tooltip-id="tenet-jdraxgrip" onmouseenter="window.showTenetTooltip(event, 'tenet-jdraxgrip')" onmousemove="window.updateTenetTooltip(event, 'tenet-jdraxgrip')" onmouseleave="window.hideTenetTooltip('tenet-jdraxgrip')">minimal</span><span id="tenet-jdraxgrip" class="glossary-tooltip" data-position="top"> <span class="glossary-tooltip__content"> <span class="glossary-tooltip__term">minimal-user-api</span> <span class="glossary-tooltip__definition">Config, model, preprocessing; from_pretrained, save_pretrained, push_to_hub. Least amount of codepaths.</span> </span> <span class="glossary-tooltip__arrow"></span> </span> <script>
1740
  if (!window.tenetTooltipInitialized) {
1741
  window.tenetTooltipInitialized = true;
1742
 
 
1766
  </script> to implement on the user side, and allows to keep the modeling code untouched. It is also easy to tweak.</p>
1767
  <div class="note note--info" data-astro-cid-qg6lmfty> <div class="note__layout" data-astro-cid-qg6lmfty> <div class="note__body" data-astro-cid-qg6lmfty> <div class="note__content" data-astro-cid-qg6lmfty> <p>Allowed layer types are explicit; schedules (e.g., sliding/full alternation) live in config. This keeps the file readable and easy to tweak.</p><p><strong>Next:</strong> speedups come from kernels that don’t change semantics.</p> </div> </div> </div> </div>
1768
  <h3 id="community-kernels"><a href="#community-kernels"><a id="community-kernels"></a>Community Kernels</a></h3>
1769
+ <p>The same principle extends to normalization, activation, and other code paths. The model defines <strong>semantics</strong>; a kernel defines <strong>how</strong> to execute them faster. We annotate the module to borrow a community‑provided forward, keeping a <span class="glossary-term " style="" data-tooltip-id="tenet-333mat5ehxd" onmouseenter="window.showTenetTooltip(event, 'tenet-333mat5ehxd')" onmousemove="window.updateTenetTooltip(event, 'tenet-333mat5ehxd')" onmouseleave="window.hideTenetTooltip('tenet-333mat5ehxd')">consistent public surface</span><span id="tenet-333mat5ehxd" class="glossary-tooltip" data-position="top"> <span class="glossary-tooltip__content"> <span class="glossary-tooltip__term">consistent-public-surface</span> <span class="glossary-tooltip__definition">Uniform naming, signatures, and conventions across all models for predictability.</span> </span> <span class="glossary-tooltip__arrow"></span> </span> <script>
1770
  if (!window.tenetTooltipInitialized) {
1771
  window.tenetTooltipInitialized = true;
1772
 
 
1814
  <p>So what do we see?</p>
1815
  <p>(Graph reading guide: nodes are models; edges are modular imports).</p>
1816
  <p>Check out the <a href="https://huggingface.co/spaces/Molbap/transformers-modular-refactor">full viewer here</a> (tab “dependency graph”, hit “build graph”) for better manipulation and exploration.</p>
1817
+ <div class="wide"> <figure class="html-embed"><div class="html-embed__card is-frameless"><div id="frag-aj72jxbpyp"><iframe
1818
  src="https://molbap-dependencies-1.hf.space"
1819
  style="width:100%; height:680px; border:0"
1820
  allow="clipboard-read; clipboard-write; fullscreen"
 
1822
  ></iframe></div></div></figure> </div>
1823
  <p>Let’s walk through some sections of this graph together.
1824
  First, Llama is a basis and an influence for many models, and it is very visible.</p>
1825
+ <div class="ri-root" data-ri-root="ri_wrem28w1g1g" data-has-caption data-astro-cid-6kov3kig> <figure class="" data-astro-cid-6kov3kig> <img src="/_astro/llama_center.CbQ5MyAc_ZraNCd.webp" alt="Llama in the center" data-zoomable="1" data-astro-cid-6kov3kig width="1030" height="1015" loading="lazy" decoding="async"> <figcaption data-astro-cid-6kov3kig> <span data-astro-cid-6kov3kig><strong>Figure 2:</strong> Llama as a central model influencing many other models in the dependency graph.</span> </figcaption> </figure> </div> <script>
1826
  (() => {
1827
  const scriptEl = document.currentScript;
1828
  const root = scriptEl ? scriptEl.previousElementSibling : null;
 
1991
  </script>
1992
  <p>The models linked sometimes pull components from other models than <code>llama</code> of course. Radically different architectures such as mamba have spawned their own dependency subgraph.</p>
1993
  <p>Audio models form sparser archipelagos, see for instance wav2vec2 which is a significant basis for a dozen of them.</p>
1994
+ <div class="ri-root" data-ri-root="ri_0rm5k67qpkw" data-has-caption data-astro-cid-6kov3kig> <figure class="" data-astro-cid-6kov3kig> <img src="/_astro/cluster_wave2vec2.BvHBUP61_Z2vdkmW.webp" alt="Wav2vec2 influence" data-zoomable="1" data-astro-cid-6kov3kig width="608" height="563" loading="lazy" decoding="async"> <figcaption data-astro-cid-6kov3kig> <span data-astro-cid-6kov3kig><strong>Figure 3:</strong> Cluster of audio architectures based on wav2vec2, forming a specialized archipelago.</span> </figcaption> </figure> </div> <script>
1995
  (() => {
1996
  const scriptEl = document.currentScript;
1997
  const root = scriptEl ? scriptEl.previousElementSibling : null;
 
2160
  </script>
2161
  <p>In the case of VLMs which have massively grown in popularity since 2024, there’s far too many vision-based architectures that are not yet defined as modulars of other existing archs. In other words, there is no strong reference point in terms of software for vision models.</p>
2162
  <p>As you can see, there is a small <code>DETR</code> island:</p>
2163
+ <div class="ri-root" data-ri-root="ri_nql7fiutb4" data-has-caption data-astro-cid-6kov3kig> <figure class="" data-astro-cid-6kov3kig> <img src="/_astro/detr_island.CSrqELWy_1IAFDR.webp" alt="DETR archipelago" data-zoomable="1" data-astro-cid-6kov3kig width="591" height="606" loading="lazy" decoding="async"> <figcaption data-astro-cid-6kov3kig> <span data-astro-cid-6kov3kig><strong>Figure 4:</strong> Small DETR archipelago for vision models, less centralized than Llama for text.</span> </figcaption> </figure> </div> <script>
2164
  (() => {
2165
  const scriptEl = document.currentScript;
2166
  const root = scriptEl ? scriptEl.previousElementSibling : null;
 
2329
  </script>
2330
  <p>There is also a little llava pocket, and so on, but it’s not comparable to the centrality observed for llama.</p>
2331
  <p>Another problem is, this visualization only shows <code>modular</code> models. Several models still do NOT have a modular file. If we zoom out significantly, we can see them, the red nodes are models that do not have a modular file yet.</p>
2332
+ <div class="ri-root" data-ri-root="ri_cz2wj877qo5" data-has-caption data-astro-cid-6kov3kig> <figure class="" data-astro-cid-6kov3kig> <img src="/_astro/big_picture_zoomout.BKwXtSkj_1bNS6U.webp" alt="Red nodes" data-zoomable="1" data-astro-cid-6kov3kig width="1043" height="972" loading="lazy" decoding="async"> <figcaption data-astro-cid-6kov3kig> <span data-astro-cid-6kov3kig><strong>Figure 5:</strong> Overview showing red nodes (models without modular files) to be modularized.</span> </figcaption> </figure> </div> <script>
2333
  (() => {
2334
  const scriptEl = document.currentScript;
2335
  const root = scriptEl ? scriptEl.previousElementSibling : null;
 
2503
  <p>Next, I looked into Jaccard similarity, which we use to measure set differences. I know that code is more than a set of characters strung together. I also used code embedding models to check out code similarities, and it yielded better results, for the needs of this blog post I will stick to Jaccard index.</p>
2504
  <p>It is interesting, for that, to look at <em>when</em> we deployed this modular logic and what was its rippling effect on the library. You can check the <a href="https://huggingface.co/spaces/Molbap/transformers-modular-refactor">larger space</a> to play around, but the gist is: adding modular allowed to connect more and more models to solid reference points. We have a lot of gaps to fill in still.</p>
2505
  <p>Zoom out below - it’s full of models. You can click on a node to see its connections better, or use the text box to search for a model. You can use the <a href="https://huggingface.co/spaces/Molbap/transformers-modular-refactor">full viewer</a> (tab “timeline”, hit “build timeline”) for better exploration.</p>
2506
+ <div class="wide"> <figure class="html-embed"><div class="html-embed__card is-frameless"><div id="frag-fn1oqusg0km"> <iframe
2507
  src="https://molbap-timeline-1.hf.space"
2508
  style="width:100%; height:680px; border:0"
2509
  allow="clipboard-read; clipboard-write; fullscreen"
2510
  referrerpolicy="no-referrer-when-downgrade"
2511
  ></iframe></div></div></figure> </div>
2512
  <p>Let’s look at a few highly connected models. Let’s start by the foundational work of <a href="https://arxiv.org/abs/2304.08485">Llava</a>.</p>
2513
+ <div class="ri-root" data-ri-root="ri_gwvyqlf97" data-has-caption data-astro-cid-6kov3kig> <figure class="" data-astro-cid-6kov3kig> <img src="/_astro/timeline_llava.Bne5RSo9_Z26WEKX.webp" alt="Llava in its timeline" data-zoomable="1" data-astro-cid-6kov3kig width="1250" height="770" loading="lazy" decoding="async"> <figcaption data-astro-cid-6kov3kig> <span data-astro-cid-6kov3kig><strong>Figure 6:</strong> LLaVA and its variants in the timeline, with llava_video as a candidate for modularization.</span> </figcaption> </figure> </div> <script>
2514
  (() => {
2515
  const scriptEl = document.currentScript;
2516
  const root = scriptEl ? scriptEl.previousElementSibling : null;
 
2677
  else window.addEventListener("load", initZoomIfNeeded, { once: true });
2678
  })();
2679
  </script>
2680
+ <p>You see that <code>llava_video</code> is a red node, connected by a red edge to <code>llava</code>: it’s a candidate, something that we can <em>likely</em> remodularize, <span class="glossary-term " style="" data-tooltip-id="tenet-ciflpit9e6t" onmouseenter="window.showTenetTooltip(event, 'tenet-ciflpit9e6t')" onmousemove="window.updateTenetTooltip(event, 'tenet-ciflpit9e6t')" onmouseleave="window.hideTenetTooltip('tenet-ciflpit9e6t')">not touching the actual model</span><span id="tenet-ciflpit9e6t" class="glossary-tooltip" data-position="top"> <span class="glossary-tooltip__content"> <span class="glossary-tooltip__term">backwards-compatibility</span> <span class="glossary-tooltip__definition">Any artifact once on the hub must remain loadable. Breaking changes are unacceptable.</span> </span> <span class="glossary-tooltip__arrow"></span> </span> <script>
2681
  if (!window.tenetTooltipInitialized) {
2682
  window.tenetTooltipInitialized = true;
2683
 
 
2704
  tooltip.style.opacity = '0';
2705
  };
2706
  }
2707
+ </script> but being much more readable with <span class="glossary-term " style="" data-tooltip-id="tenet-cq7thg0mt2" onmouseenter="window.showTenetTooltip(event, 'tenet-cq7thg0mt2')" onmousemove="window.updateTenetTooltip(event, 'tenet-cq7thg0mt2')" onmouseleave="window.hideTenetTooltip('tenet-cq7thg0mt2')">DRY*</span><span id="tenet-cq7thg0mt2" class="glossary-tooltip" data-position="top"> <span class="glossary-tooltip__content"> <span class="glossary-tooltip__term">do-repeat-yourself</span> <span class="glossary-tooltip__definition">Strategic duplication can improve readability and maintainability when done thoughtfully.</span> </span> <span class="glossary-tooltip__arrow"></span> </span> <script>
2708
  if (!window.tenetTooltipInitialized) {
2709
  window.tenetTooltipInitialized = true;
2710
 
 
2734
  </script>.</p>
2735
  <p>The same can be identified with the classical encoders family, centered on <code>BERT</code>:</p>
2736
  <p>Here <code>roberta</code>, <code>xlm_roberta</code>, <code>ernie</code> are <code>modular</code>s of BERT, while models like <code>mobilebert</code> are likely candidates.</p>
2737
+ <div class="ri-root" data-ri-root="ri_qoecsiezts" data-has-caption data-astro-cid-6kov3kig> <figure class="" data-astro-cid-6kov3kig> <img src="/_astro/classic_encoders.BSgQl9lp_3OtlT.webp" alt="Classical encoders" data-zoomable="1" data-astro-cid-6kov3kig width="1274" height="749" loading="lazy" decoding="async"> <figcaption data-astro-cid-6kov3kig> <span data-astro-cid-6kov3kig><strong>Figure 7:</strong> Family of classical encoders centered on BERT, with several models already modularized.</span> </figcaption> </figure> </div> <script>
2738
  (() => {
2739
  const scriptEl = document.currentScript;
2740
  const root = scriptEl ? scriptEl.previousElementSibling : null;
 
2908
  <div class="code-card"><button class="code-copy button--ghost" type="button" aria-label="Copy code"><svg viewBox="0 0 24 24" aria-hidden="true" focusable="false"><path d="M16 1H4c-1.1 0-2 .9-2 2v12h2V3h12V1zm3 4H8c-1.1 0-2 .9-2 2v14c0 1.1.9 2 2 2h11c1.1 0 2-.9 2-2V7c0-1.1-.9-2-2-2zm0 16H8V7h11v14z"></path></svg></button><pre class="astro-code astro-code-themes github-light github-dark" style="--shiki-light:#24292e;--shiki-dark:#e1e4e8;--shiki-light-bg:#fff;--shiki-dark-bg:#24292e;overflow-x:auto" tabindex="0" data-language="python"><code><span class="line"><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">class</span><span style="--shiki-light:#6F42C1;--shiki-dark:#B392F0"> InputsEmbeddingMixerMixin</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">(</span><span style="--shiki-light:#6F42C1;--shiki-dark:#B392F0">nn</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">.</span><span style="--shiki-light:#6F42C1;--shiki-dark:#B392F0">Module</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">):</span></span>
2909
  <span class="line"><span style="--shiki-light:#6A737D;--shiki-dark:#6A737D"> #</span></span>
2910
  <span class="line"></span></code></pre></div>
2911
+ <p>But this is <span class="glossary-term " style="" data-tooltip-id="tenet-4fvgoptl7mt" onmouseenter="window.showTenetTooltip(event, 'tenet-4fvgoptl7mt')" onmousemove="window.updateTenetTooltip(event, 'tenet-4fvgoptl7mt')" onmouseleave="window.hideTenetTooltip('tenet-4fvgoptl7mt')">not an abstraction</span><span id="tenet-4fvgoptl7mt" class="glossary-tooltip" data-position="top"> <span class="glossary-tooltip__content"> <span class="glossary-tooltip__term">standardize-dont-abstract</span> <span class="glossary-tooltip__definition">Model-specific logic belongs in the model file, not hidden behind abstractions.</span> </span> <span class="glossary-tooltip__arrow"></span> </span> <script>
2912
  if (!window.tenetTooltipInitialized) {
2913
  window.tenetTooltipInitialized = true;
2914
 
 
2938
  </script>. Embedding mixin is part of the model, removing it would break it. A user opening <a href="https://github.com/huggingface/transformers/blob/b3bd815786c36f4e6c3791fae0a96cac86658b32/src/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py#L1358"><code>modeling_qwen2.5_vl</code></a> (check out the <a href="https://huggingface.co/collections/Qwen/qwen25-vl-6795ffac22b334a837c0f9a5">Qwen2.5VL collection</a>) should not have to go to another file to understand how it works.</p>
2939
  <p>What is the current state of these “abstractions” across the codebase?
2940
  You will see all the imports around a modeling file, here <a href="https://huggingface.co/google/gemma-3n-E4B-it">Gemma3n</a>.</p>
2941
+ <div class="ri-root" data-ri-root="ri_ajm7a0whqu" data-has-caption data-astro-cid-6kov3kig> <figure class="" data-astro-cid-6kov3kig> <img src="/_astro/still_graph_bloat.BII6Am4a_Z2d5KVT.webp" alt="Gemma3n graph" data-zoomable="1" data-astro-cid-6kov3kig width="2580" height="1207" loading="lazy" decoding="async"> <figcaption data-astro-cid-6kov3kig> <span data-astro-cid-6kov3kig><strong>Figure 8:</strong> Gemma3n import graph showing dependency complexity, with GenerationMixin very central.</span> </figcaption> </figure> </div> <script>
2942
  (() => {
2943
  const scriptEl = document.currentScript;
2944
  const root = scriptEl ? scriptEl.previousElementSibling : null;
 
3148
  <span class="line"></span>
3149
  <span class="line"><span style="--shiki-light:#D73A49;--shiki-dark:#F97583"> return</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> special_image_mask, special_video_mask</span></span>
3150
  <span class="line"></span></code></pre></div>
3151
+ <p>But this is <em>within</em> the modeling file, not in the <code>PreTrainedModel</code> base class. It does not move away from it, because it’d break the <span class="glossary-term " style="" data-tooltip-id="tenet-n1g7f91fx9" onmouseenter="window.showTenetTooltip(event, 'tenet-n1g7f91fx9')" onmousemove="window.updateTenetTooltip(event, 'tenet-n1g7f91fx9')" onmouseleave="window.hideTenetTooltip('tenet-n1g7f91fx9')">One model, one file tenet.</span><span id="tenet-n1g7f91fx9" class="glossary-tooltip" data-position="top"> <span class="glossary-tooltip__content"> <span class="glossary-tooltip__term">one-model-one-file</span> <span class="glossary-tooltip__definition">All inference and training core logic visible, top‑to‑bottom, in a single file.</span> </span> <span class="glossary-tooltip__arrow"></span> </span> <script>
3152
  if (!window.tenetTooltipInitialized) {
3153
  window.tenetTooltipInitialized = true;
3154
 
 
3182
  <h3 id="on-image-processing-and-processors"><a href="#on-image-processing-and-processors">On image processing and processors</a></h3>
3183
  <p>Deciding to become a <code>torch</code>-first library meant relieving a tremendous amount of support for <code>jax </code> and <code>TensorFlow</code>, and it also meant that we could be more lenient about the amount of torch-dependent utilities that we were able to accept. One of these is the <em>fast processing</em> of images. Where inputs were once minimally assumed to be ndarrays, enforcing native <code>torch</code> and <code>torchvision</code> inputs allowed us to massively improve processing speed for each model.</p>
3184
  <p>The gains in performance are immense, up to 20x speedup for most models when using compiled torchvision ops. Furthermore, it lets us run the whole pipeline solely on GPU.</p>
3185
+ <div class="ri-root" data-ri-root="ri_um27coemkts" data-has-caption data-astro-cid-6kov3kig> <figure class="" data-astro-cid-6kov3kig> <img src="/_astro/fast_image_processors.D3x5vY3o_2cacGa.webp" alt="Fast Image Processors Performance" data-zoomable="1" data-astro-cid-6kov3kig width="2251" height="2409" loading="lazy" decoding="async"> <figcaption data-astro-cid-6kov3kig> <span data-astro-cid-6kov3kig><strong>Figure 9:</strong> Performance gains of fast image processors, up to 20x acceleration with compiled torchvision.</span> </figcaption> </figure> </div> <script>
3186
  (() => {
3187
  const scriptEl = document.currentScript;
3188
  const root = scriptEl ? scriptEl.previousElementSibling : null;
 
3358
  <div class="note note--info" data-astro-cid-qg6lmfty> <div class="note__layout" data-astro-cid-qg6lmfty> <div class="note__body" data-astro-cid-qg6lmfty> <div class="note__content" data-astro-cid-qg6lmfty> <p>The shape of a contribution: add a model (or variant) with a small modular shard; the community and serving stacks pick it up immediately. Popularity trends (encoders/embeddings) guide where we invest.</p><p><strong>Next:</strong> power tools enabled by a consistent API.</p> </div> </div> </div> </div>
3359
  <h3 id="-models-popularity"><a href="#-models-popularity"><a id="encoders-ftw"></a> Models popularity</a></h3>
3360
  <p>Talking about dependencies, we can take a look at the number of downloads as a measure of popularity. One thing we see is the prominence of encoders, despite the apparent prevalence of decoder LLMs. The reason is that encoders are used to generate embeddings, which have multiple downstream uses. Just check out <a href="https://huggingface.co/blog/embeddinggemma">EmbeddingGemma</a> for a modern recap. Hence, it is vital to keep the encoders portion of the library viable, usable, fine-tunable.</p>
3361
+ <div><figure class="html-embed"><div class="html-embed__card"><div id="frag-1pwez2fz3iv"><div class="d3-model-popularity"></div>
3362
  <style>
3363
  .d3-model-popularity .controls {
3364
  margin-top: 0;
 
3934
  </script>
3935
  </div></div></figure></div>
3936
  <p>As the codebase grows, we need to maintain it in coordination with our friend <a href="https://huggingface.co/sentence-transformers">Sentence Transformers codebase</a>. Retrieval use-cases, smart databases, FAISS-based indexing rely on it, and thus indirectly on transformers.</p>
3937
+ <p>In that regard, we DO want to be a modular toolbox, being <span class="glossary-term " style="" data-tooltip-id="tenet-0xm1hw2rxwnq" onmouseenter="window.showTenetTooltip(event, 'tenet-0xm1hw2rxwnq')" onmousemove="window.updateTenetTooltip(event, 'tenet-0xm1hw2rxwnq')" onmouseleave="window.hideTenetTooltip('tenet-0xm1hw2rxwnq')">minimal</span><span id="tenet-0xm1hw2rxwnq" class="glossary-tooltip" data-position="top"> <span class="glossary-tooltip__content"> <span class="glossary-tooltip__term">minimal-user-api</span> <span class="glossary-tooltip__definition">Config, model, preprocessing; from_pretrained, save_pretrained, push_to_hub. Least amount of codepaths.</span> </span> <span class="glossary-tooltip__arrow"></span> </span> <script>
3938
  if (!window.tenetTooltipInitialized) {
3939
  window.tenetTooltipInitialized = true;
3940
 
 
3970
  <p>All models have the same API for attention computation, thanks to <a href="#external-attention-classes">the externalisation of attention classes</a>.</p>
3971
  <p>This uniformity allows us to build cool tools to visualize the inner workings of the attention mechanism.</p>
3972
  <p>One particular piece of machinery is the <code>attention mask</code>. Here you see the famous bidirectional attention pattern for the whole prefix (text + image) in PaliGemma and all Gemma2+ models, contrasting with the usual “causal-only” models.</p>
3973
+ <figure class="html-embed"><div class="html-embed__card is-frameless"><div id="frag-0ybps89976p"><!-- Minimal HTML fragment: terminal-style ASCII attention masks -->
3974
  <div style="max-width: 940px; margin: 16px 0; border:1px solid #2a2f3a; border-radius:8px; background:#0b0f19; font-family: ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas, 'Liberation Mono', 'Courier New', monospace; color:#e5e7eb;">
3975
  <div style="display:flex; align-items:center; gap:8px; padding:8px 10px; border-bottom:1px solid #1f2430; background:#111827; border-top-left-radius:8px; border-top-right-radius:8px;">
3976
  <span style="width:10px; height:10px; background:#ef4444; border-radius:50%; display:inline-block;"></span>
 
4018
  <div class="note note--info" data-astro-cid-qg6lmfty> <div class="note__layout" data-astro-cid-qg6lmfty> <div class="note__body" data-astro-cid-qg6lmfty> <div class="note__content" data-astro-cid-qg6lmfty> <p>Uniform attention APIs enable cross-model diagnostics (e.g., PaliGemma prefix bidirectionality vs causal).</p><p><strong>Next:</strong> whole-model tracing for ports and regressions.</p> </div> </div> </div> </div>
4019
  <h3 id="logging-entire-model-activations"><a href="#logging-entire-model-activations">Logging entire model activations</a></h3>
4020
  <p>Because everything is PyTorch, we can easily <a href="https://huggingface.co/docs/transformers/internal/model_debugging_utils">debug any model</a> when we want to add it to transformers. We now have a power-user tool for porting or adding models, that wraps a forward pass, intercepts every submodule call, and logs shapes, dtypes, and sample statistics of inputs/outputs to nested JSON.</p>
4021
+ <p>It just works with PyTorch models and is especially useful when aligning outputs with a reference implementation, to match our <span class="glossary-term " style="" data-tooltip-id="tenet-tlzo5jh5sq" onmouseenter="window.showTenetTooltip(event, 'tenet-tlzo5jh5sq')" onmousemove="window.updateTenetTooltip(event, 'tenet-tlzo5jh5sq')" onmouseleave="window.hideTenetTooltip('tenet-tlzo5jh5sq')">Source of Truth guideline</span><span id="tenet-tlzo5jh5sq" class="glossary-tooltip" data-position="top"> <span class="glossary-tooltip__content"> <span class="glossary-tooltip__term">source-of-truth</span> <span class="glossary-tooltip__definition">Model implementations should be reliable, reproducible, and faithful to original performances.</span> </span> <span class="glossary-tooltip__arrow"></span> </span> <script>
4022
  if (!window.tenetTooltipInitialized) {
4023
  window.tenetTooltipInitialized = true;
4024
 
 
4046
  };
4047
  }
4048
  </script>.</p>
4049
+ <div class="wide"> <div class="ri-root" data-ri-root="ri_gsvktwm2wk" data-has-caption data-astro-cid-6kov3kig> <figure class="" data-astro-cid-6kov3kig> <img src="/_astro/model_debugger.DouWEpKv_Z2338YE.webp" alt="Model debugger interface" data-zoomable="1" data-astro-cid-6kov3kig width="2053" height="1016" loading="lazy" decoding="async"> <figcaption data-astro-cid-6kov3kig> <span data-astro-cid-6kov3kig><strong>Figure 10:</strong> Model debugger interface intercepting calls and logging statistics in nested JSON.</span> </figcaption> </figure> </div> <script>
4050
  (() => {
4051
  const scriptEl = document.currentScript;
4052
  const root = scriptEl ? scriptEl.previousElementSibling : null;
 
4215
  </script> </div>
4216
  <div class="note note--info" data-astro-cid-qg6lmfty> <div class="note__layout" data-astro-cid-qg6lmfty> <div class="note__body" data-astro-cid-qg6lmfty> <div class="note__content" data-astro-cid-qg6lmfty> <p>Forward interception and nested JSON logging align ports to reference implementations, reinforcing “Source of Truth.”</p><p><strong>Next:</strong> CUDA warmup reduces load-time without touching modeling semantics.</p> </div> </div> </div> </div>
4217
  <h3 id="cooking-faster-cuda-warmups"><a href="#cooking-faster-cuda-warmups">Cooking faster CUDA warmups</a></h3>
4218
+ <p>Having a clean <em>external</em> API allows us to work on the <span class="glossary-term " style="" data-tooltip-id="tenet-xjsnbcluhk" onmouseenter="window.showTenetTooltip(event, 'tenet-xjsnbcluhk')" onmousemove="window.updateTenetTooltip(event, 'tenet-xjsnbcluhk')" onmouseleave="window.hideTenetTooltip('tenet-xjsnbcluhk')">true inner workings of transformers</span><span id="tenet-xjsnbcluhk" class="glossary-tooltip" data-position="top"> <span class="glossary-tooltip__content"> <span class="glossary-tooltip__term">code-is-product</span> <span class="glossary-tooltip__definition">Optimize for reading, diffing, and tweaking. Code quality matters as much as functionality.</span> </span> <span class="glossary-tooltip__arrow"></span> </span> <script>
4219
  if (!window.tenetTooltipInitialized) {
4220
  window.tenetTooltipInitialized = true;
4221
 
 
4243
  };
4244
  }
4245
  </script>. One of a few recent additions is the <em>CUDA warmup</em> via <code>caching_allocator_warmup</code>, which dramatically improves loading times by pre-allocating GPU memory to avoid malloc bottlenecks during model loading. It can achieve a 7x speedup factor for an 8B model, or 6x for a 32B one, as you can check in <a href="https://github.com/huggingface/transformers/pull/36380">the PR</a>!</p>
4246
+ <div class="wide"> <figure class="html-embed"><figcaption class="html-embed__title" style="text-align:center">Mem allocation patterns during model loading</figcaption><div class="html-embed__card"><div id="frag-1yqwtwkgsuq"><div class="d3-warmup-demo"></div>
4247
  <style>
4248
  .d3-warmup-demo {
4249
  font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
 
4896
  <li>having it immediately usable in vLLM, <a href="https://huggingface.co/blog/transformers-backend-sglang">SGLang</a>, and so on without additional code. In the case of vLLM, transformers was added as a backend to run models on vLLM, which optimizes throughput/latency on top of <em>existing</em> transformers architectures <a href="https://blog.vllm.ai/2025/04/11/transformers-backend.html">as seen in this great vLLM x HF blog post.</a></li>
4897
  <li>being the reference code for implementations in MLX, llama.cpp and other libraries.</li>
4898
  </ul>
4899
+ <p>This further cements the need for a <span class="glossary-term " style="" data-tooltip-id="tenet-d3yt5vtms4n" onmouseenter="window.showTenetTooltip(event, 'tenet-d3yt5vtms4n')" onmousemove="window.updateTenetTooltip(event, 'tenet-d3yt5vtms4n')" onmouseleave="window.hideTenetTooltip('tenet-d3yt5vtms4n')">consistent public surface</span><span id="tenet-d3yt5vtms4n" class="glossary-tooltip" data-position="top"> <span class="glossary-tooltip__content"> <span class="glossary-tooltip__term">consistent-public-surface</span> <span class="glossary-tooltip__definition">Uniform naming, signatures, and conventions across all models for predictability.</span> </span> <span class="glossary-tooltip__arrow"></span> </span> <script>
4900
  if (!window.tenetTooltipInitialized) {
4901
  window.tenetTooltipInitialized = true;
4902
 
app/dist/index.html.gz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9f9ffb5f68f565966a29b6e288c1275e5369a196c68375d616de1b916457561d
3
- size 64022
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9420f11a9bf421d29bc1a3da6e38aaad5a7daf6c3b263847ef4b9e4db66fb8e7
3
+ size 64025
app/scripts/export-pdf.mjs CHANGED
@@ -154,6 +154,49 @@ async function waitForStableLayout(page, timeoutMs = 5000) {
154
  }
155
  }
156
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
157
  async function main() {
158
  const cwd = process.cwd();
159
  const port = Number(process.env.PREVIEW_PORT || 8080);
@@ -246,6 +289,7 @@ async function main() {
246
  await waitForPlotly(page);
247
  }
248
  if (wait === 'full') {
 
249
  await waitForStableLayout(page);
250
  }
251
  await page.emulateMedia({ media: 'print' });
@@ -404,6 +448,19 @@ async function main() {
404
  });
405
  } catch {}
406
  } catch {}
 
 
 
 
 
 
 
 
 
 
 
 
 
407
  // Temporarily enforce print-safe responsive sizing (SVG/iframes) and improve banner visibility
408
  let pdfCssHandle = null;
409
  try {
@@ -413,6 +470,12 @@ async function main() {
413
 
414
  /* Make all vector/bitmap media responsive for print */
415
  svg, canvas, img, video { max-width: 100% !important; height: auto !important; }
 
 
 
 
 
 
416
  /* Mermaid diagrams */
417
  .mermaid, .mermaid svg { display: block; width: 100% !important; max-width: 100% !important; height: auto !important; }
418
  /* Any explicit width attributes */
 
154
  }
155
  }
156
 
157
/**
 * Wait (inside the page context) for the table-of-contents placeholder to be
 * populated with at least one list item, polling every 100ms.
 *
 * Best effort: resolves normally whether the ToC appeared or the timeout
 * elapsed, so a missing ToC never aborts the PDF export.
 *
 * @param {object} page - Browser page exposing `evaluate(fn, arg)`.
 * @param {number} [timeoutMs=10000] - Maximum time to poll, in milliseconds.
 * @returns {Promise<void>}
 */
async function waitForTOC(page, timeoutMs = 10000) {
  await page.evaluate(async (timeout) => {
    const deadline = Date.now() + timeout;
    const pollMs = 100;
    while (Date.now() < deadline) {
      const host = document.getElementById('article-toc-placeholder');
      if (host && host.querySelector('nav ul li')) {
        return;
      }
      await new Promise((resolve) => setTimeout(resolve, pollMs));
    }
  }, timeoutMs);
}
169
+
170
/**
 * Replace every visible <iframe> in the page with a same-sized static
 * placeholder so the PDF renderer does not capture half-loaded embeds.
 *
 * Zero-sized iframes are left untouched. Failures on one iframe are logged
 * and skipped so a single bad frame cannot abort the whole pass.
 *
 * @param {object} page - Browser page exposing `evaluate(fn)`.
 * @returns {Promise<void>}
 */
async function convertIframesToImages(page) {
  await page.evaluate(async () => {
    for (const frame of Array.from(document.querySelectorAll('iframe'))) {
      try {
        const rect = frame.getBoundingClientRect();
        if (rect.width > 0 && rect.height > 0) {
          const box = document.createElement('div');
          box.style.cssText = `
            width: ${rect.width}px;
            height: ${rect.height}px;
            background: #f5f5f5;
            border: 1px solid #ddd;
            display: flex;
            align-items: center;
            justify-content: center;
            font-family: system-ui, sans-serif;
            color: #666;
            font-size: 14px;
          `;
          box.textContent = '[Interactive content - view online]';
          frame.replaceWith(box);
        }
      } catch (e) {
        console.warn('Could not process iframe:', e);
      }
    }
  });
}
199
+
200
  async function main() {
201
  const cwd = process.cwd();
202
  const port = Number(process.env.PREVIEW_PORT || 8080);
 
289
  await waitForPlotly(page);
290
  }
291
  if (wait === 'full') {
292
+ await waitForTOC(page);
293
  await waitForStableLayout(page);
294
  }
295
  await page.emulateMedia({ media: 'print' });
 
448
  });
449
  } catch {}
450
  } catch {}
451
+
452
+ // Convert iframes to placeholders for PDF
453
+ await convertIframesToImages(page);
454
+ await page.waitForTimeout(500);
455
+
456
+ // Ensure all images are loaded with background/content
457
+ await page.evaluate(() => {
458
+ document.querySelectorAll('img').forEach(img => {
459
+ if (img.loading === 'lazy') img.loading = 'eager';
460
+ });
461
+ });
462
+ await waitForImages(page, 10000);
463
+
464
  // Temporarily enforce print-safe responsive sizing (SVG/iframes) and improve banner visibility
465
  let pdfCssHandle = null;
466
  try {
 
470
 
471
  /* Make all vector/bitmap media responsive for print */
472
  svg, canvas, img, video { max-width: 100% !important; height: auto !important; }
473
+ /* Ensure images are visible in PDF */
474
+ img {
475
+ -webkit-print-color-adjust: exact !important;
476
+ print-color-adjust: exact !important;
477
+ display: block !important;
478
+ }
479
  /* Mermaid diagrams */
480
  .mermaid, .mermaid svg { display: block; width: 100% !important; max-width: 100% !important; height: auto !important; }
481
  /* Any explicit width attributes */
app/src/styles/_print.css CHANGED
@@ -34,8 +34,16 @@
34
 
35
  /* Force single column to reduce widows/orphans and awkward breaks */
36
  .content-grid { grid-template-columns: 1fr !important; }
37
- .table-of-contents, .right-aside, .table-of-contents-mobile { display: none !important; }
38
  main > nav:first-of-type { display: none !important; }
 
 
 
 
 
 
 
 
39
 
40
  /* Avoid page breaks inside complex visual blocks */
41
  .hero,
 
34
 
35
  /* Force single column to reduce widows/orphans and awkward breaks */
36
  .content-grid { grid-template-columns: 1fr !important; }
37
+ .right-aside { display: none !important; }
38
  main > nav:first-of-type { display: none !important; }
39
+
40
+ /* Show ToC in PDF */
41
+ .table-of-contents, .table-of-contents-mobile {
42
+ display: block !important;
43
+ break-inside: avoid;
44
+ page-break-inside: avoid;
45
+ margin-bottom: 2rem;
46
+ }
47
 
48
  /* Avoid page breaks inside complex visual blocks */
49
  .hero,