Spaces:

Fraser
/

piclets

Running

App Files Files Community

Fraser committed on Aug 15

Commit

f5cddb9

1 Parent(s): c7f2c59

more LLM

Browse files

Files changed (1) hide show

prototype_web_chat.html +93 -31

prototype_web_chat.html CHANGED Viewed

@@ -13,7 +13,7 @@
     header .pill { font-size:12px; color:var(--bg); background:var(--accent); padding:.2rem .55rem; border-radius:999px; font-weight:700; letter-spacing:.02em; }
     main { display:grid; grid-template-rows:auto 1fr auto; height:calc(100dvh - 58px); }
     .bar { display:flex; flex-wrap:wrap; gap:8px; padding:10px 12px; background:#0f1216; border-bottom:1px solid #21262c; align-items:center; }
-    select, input[type="number"] { background:var(--card); color:var(--text); border:1px solid #29313a; border-radius:10px; padding:8px 10px; }
     button { background:#1c2128; color:var(--text); border:1px solid #2a323c; border-radius:12px; padding:10px 12px; font-weight:600; cursor:pointer; }
     button.primary { background:var(--accent); color:#08261b; border:none; }
     button.ghost { background:transparent; border-color:#2a323c; }
@@ -35,39 +35,65 @@
     .row { display:flex; gap:8px; align-items:center; flex-wrap:wrap; }
     .spacer { flex:1; }
     a { color:#93c5fd; }
   </style>
 </head>
 <body>
   <header>
     <h1>Browser LLM</h1>
-    <span class="pill">WASM • CPU-only</span>
     <span id="isoNote" class="tiny"></span>
   </header>
   <main>
     <div class="bar">
       <label>Model:</label>
-      <select id="model">
-        <!-- Public Hugging Face GGUFs (no hosting needed) -->
-        <option value='{"id":"QuantFactory/SmolLM2-360M-GGUF","file":"SmolLM2-360M.Q4_0.gguf","label":"SmolLM2-360M Q4_0 (≈229 MB)"}'>
-          SmolLM2-360M Q4_0 (≈229 MB)
-        </option>
-        <option value='{"id":"QuantFactory/SmolLM2-360M-GGUF","file":"SmolLM2-360M.Q3_K_S.gguf","label":"SmolLM2-360M Q3_K_S (≈219 MB, faster)"}'>
-          SmolLM2-360M Q3_K_S (≈219 MB, faster)
-        </option>
-        <option value='{"id":"QuantFactory/SmolLM2-360M-GGUF","file":"SmolLM2-360M.Q2_K.gguf","label":"SmolLM2-360M Q2_K (≈~200 MB, min RAM / quality drop)"}'>
-          SmolLM2-360M Q2_K (≈~200 MB, min RAM / quality drop)
-        </option>
       </select>
       <div class="row">
         <label>Max new tokens</label>
         <input id="nPredict" type="number" min="1" max="512" step="1" value="128" />
       </div>
       <div class="row">
         <label>Temp</label><input id="temp" type="number" min="0" max="2" step="0.1" value="0.7" style="width:80px" />
-        <label>Top-p</label><input id="topp" type="number" min="0" max="1" step="0.05" value="0.9" style="width:80px" />
-        <label>Top-k</label><input id="topk" type="number" min="1" max="100" step="1" value="40" style="width:80px" />
       </div>
       <div class="spacer"></div>
@@ -113,6 +139,9 @@
     const $send  = document.getElementById('sendBtn');
     const $stop  = document.getElementById('stopBtn');
     const $iso   = document.getElementById('isoNote');
     // ——— State ———
     const decoder = new TextDecoder();
@@ -122,13 +151,13 @@
     let eotToken = -1;
     let sysPrompt = "You are a helpful, concise assistant. Keep answers short and clear.";
-    // Keep RAM low for mobile: small context + int4 KV cache
     const LOAD_CONFIG = {
       n_ctx: 768,
       n_batch: 48,
-      cache_type_k: "q4_0",
-      cache_type_v: "f16",    // <- WASM-safe (do NOT set q4_0 here)
-      flash_attn: false,      // WASM: flash attention unavailable
       progressCallback: ({ loaded, total }) => {
         const pct = (total && total > 0) ? Math.round(loaded / total * 100) : 0;
         $prog.style.width = pct + '%';
@@ -156,9 +185,9 @@
     function noteIsolation() {
       if (!crossOriginIsolated) {
-        $iso.innerHTML = 'Single-thread mode (serve with COOP/COEP for multithread)';
       } else {
-        $iso.textContent = 'Cross-origin isolated: multithread on';
       }
     }
     noteIsolation();
@@ -172,18 +201,34 @@
       messages.length = 0; messages.push(...kept);
     }
     async function ensureLoaded() {
       if (loaded) return;
       $prog.style.width = '0%';
-      const choice = JSON.parse($model.value);
-      ui.setStats('Fetching model…');
-      await wllama.loadModelFromHF(choice.id, choice.file, LOAD_CONFIG);
       loaded = true;
       eotToken = wllama.getEOT();
       const meta = await wllama.getModelMetadata();
       const ctx = wllama.getLoadedContextInfo();
       const thr = wllama.getNumThreads?.() ?? 1;
-      ui.setStats(`Loaded ${choice.file} • ${meta.n_params?.toLocaleString?.() || '≈360M'} params • ctx ${ctx.n_ctx} • threads ${thr}`);
       $load.disabled = true; $unload.disabled = false;
     }
@@ -200,6 +245,11 @@
     document.getElementById('unloadBtn').addEventListener('click', unloadModel);
     document.getElementById('stopBtn').addEventListener('click', () => aborter?.abort());
     $form.addEventListener('submit', async (ev) => {
       ev.preventDefault();
       const text = ($input.value || '').trim();
@@ -258,7 +308,7 @@
       }
     });
-    // Enter-to-send on mobile; Shift+Enter for newline
     $input.addEventListener('keydown', (e) => {
       if (e.key === 'Enter' && !e.shiftKey) {
         e.preventDefault();
@@ -268,11 +318,23 @@
   </script>
   <!--
-    Notes:
-    • Fixed the 404 by importing explicit ESM entry and pointing to the exact wasm files on jsDelivr.
-    • Runs entirely on CPU via WebAssembly (no WebGPU). Works in single-thread everywhere; for multithread,
-      serve with COOP/COEP headers so SharedArrayBuffer is available.
-    • For even lower RAM or faster sampling, pick Q3_K_S or Q2_K in the dropdown, and keep n_ctx modest.
   -->
 </body>
 </html>

     header .pill { font-size:12px; color:var(--bg); background:var(--accent); padding:.2rem .55rem; border-radius:999px; font-weight:700; letter-spacing:.02em; }
     main { display:grid; grid-template-rows:auto 1fr auto; height:calc(100dvh - 58px); }
     .bar { display:flex; flex-wrap:wrap; gap:8px; padding:10px 12px; background:#0f1216; border-bottom:1px solid #21262c; align-items:center; }
+    select, input[type="number"], input[type="text"] { background:var(--card); color:var(--text); border:1px solid #29313a; border-radius:10px; padding:8px 10px; }
     button { background:#1c2128; color:var(--text); border:1px solid #2a323c; border-radius:12px; padding:10px 12px; font-weight:600; cursor:pointer; }
     button.primary { background:var(--accent); color:#08261b; border:none; }
     button.ghost { background:transparent; border-color:#2a323c; }
     .row { display:flex; gap:8px; align-items:center; flex-wrap:wrap; }
     .spacer { flex:1; }
     a { color:#93c5fd; }
+    details { margin-left:8px; }
+    .note { font-size:12px; color:var(--muted); max-width:720px; }
   </style>
 </head>
 <body>
   <header>
     <h1>Browser LLM</h1>
+    <span class="pill">WASM • CPU‑only</span>
     <span id="isoNote" class="tiny"></span>
   </header>
   <main>
     <div class="bar">
       <label>Model:</label>
+      <select id="model">
+        <!-- Each option's value is a JSON descriptor that the script parses with JSON.parse:
+             {"id": HF repo, "file": GGUF file name, "label": display text}, or {"custom": true}. -->
+        <!-- NEW: Gemma-3-270M from ggml-org (public GGUF) -->
+        <option value='{"id":"ggml-org/gemma-3-270m-GGUF","file":"gemma-3-270m-Q8_0.gguf","label":"Gemma‑3‑270M Q8_0 (≈292 MB)"}'>Gemma‑3‑270M Q8_0 (≈292 MB)</option>
+        <!-- Smallest RAM / fastest (good for phones) -->
+        <option value='{"id":"mradermacher/OpenELM-270M-Instruct-GGUF","file":"OpenELM-270M-Instruct.Q3_K_S.gguf","label":"OpenELM‑270M‑Instruct Q3_K_S (≈134 MB)"}'>OpenELM‑270M‑Instruct Q3_K_S (≈134 MB)</option>
+        <option value='{"id":"mradermacher/OpenELM-270M-Instruct-GGUF","file":"OpenELM-270M-Instruct.Q4_K_M.gguf","label":"OpenELM‑270M‑Instruct Q4_K_M (≈175 MB)"}'>OpenELM‑270M‑Instruct Q4_K_M (≈175 MB)</option>
+        <!-- Good quality while still small -->
+        <option value='{"id":"mav23/SmolLM-135M-Instruct-GGUF","file":"smollm-135m-instruct.Q3_K_S.gguf","label":"SmolLM‑135M‑Instruct Q3_K_S (≈88 MB)"}'>SmolLM‑135M‑Instruct Q3_K_S (≈88 MB)</option>
+        <option value='{"id":"QuantFactory/SmolLM-360M-Instruct-GGUF","file":"SmolLM-360M-Instruct.Q3_K_S.gguf","label":"SmolLM‑360M‑Instruct Q3_K_S (≈219 MB)"}'>SmolLM‑360M‑Instruct Q3_K_S (≈219 MB)</option>
+        <!-- Stronger tiny model (bigger, still phone‑possible on high‑end) -->
+        <option value='{"id":"Qwen/Qwen2.5-0.5B-Instruct-GGUF","file":"qwen2.5-0.5b-instruct-q3_k_m.gguf","label":"Qwen2.5‑0.5B‑Instruct Q3_K_M (≈432 MB)"}'>Qwen2.5‑0.5B‑Instruct Q3_K_M (≈432 MB)</option>
+        <option value='{"id":"Qwen/Qwen2.5-0.5B-Instruct-GGUF","file":"qwen2.5-0.5b-instruct-q4_k_m.gguf","label":"Qwen2.5‑0.5B‑Instruct Q4_K_M (≈491 MB)"}'>Qwen2.5‑0.5B‑Instruct Q4_K_M (≈491 MB)</option>
+        <!-- Optional: bigger but better; may be too heavy for some phones -->
+        <option value='{"id":"TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF","file":"tinyllama-1.1b-chat-v1.0.Q3_K_S.gguf","label":"TinyLlama‑1.1B‑Chat Q3_K_S (≈500 MB)"}'>TinyLlama‑1.1B‑Chat Q3_K_S (≈500 MB)</option>
+        <!-- Original SmolLM2 360M options (kept; repo id and file names match the labels) -->
+        <option value='{"id":"QuantFactory/SmolLM2-360M-GGUF","file":"SmolLM2-360M.Q4_0.gguf","label":"SmolLM2‑360M Q4_0 (≈229 MB)"}'>SmolLM2‑360M Q4_0 (≈229 MB)</option>
+        <option value='{"id":"QuantFactory/SmolLM2-360M-GGUF","file":"SmolLM2-360M.Q3_K_S.gguf","label":"SmolLM2‑360M Q3_K_S (≈219 MB, faster)"}'>SmolLM2‑360M Q3_K_S (≈219 MB, faster)</option>
+        <option value='{"id":"QuantFactory/SmolLM2-360M-GGUF","file":"SmolLM2-360M.Q2_K.gguf","label":"SmolLM2‑360M Q2_K (≈200 MB, min RAM / quality drop)"}'>SmolLM2‑360M Q2_K (≈200 MB, min RAM / quality drop)</option>
+        <!-- Custom: any public HF GGUF; repo id and file are read from the "Custom GGUF" inputs -->
+        <option value='{"custom":true,"label":"Custom HF GGUF (e.g., Gemma‑3‑270M)"}'>Custom HF GGUF (e.g., Gemma‑3‑270M)</option>
       </select>
+      <!-- Disclosure box for the "Custom HF GGUF" option: the script reads #customRepo and #customFile. -->
+      <details id="customBox">
+        <summary class="tiny">Custom GGUF (paste HF repo + file)</summary>
+        <div class="row">
+          <label class="tiny" for="customRepo">HF repo id</label>
+          <input id="customRepo" type="text" placeholder="e.g. ggml-org/gemma-3-270m-GGUF" style="width:280px" />
+          <label class="tiny" for="customFile">file</label>
+          <input id="customFile" type="text" placeholder="e.g. gemma-3-270m-Q8_0.gguf" style="width:240px" />
+        </div>
+        <div class="note">Note: official <a href="https://huggingface.co/google/gemma-3-270m" target="_blank" rel="noopener noreferrer">Gemma‑3‑270M</a> is the base HF repo. A ready‑to‑use public GGUF is now available at <a href="https://huggingface.co/ggml-org/gemma-3-270m-GGUF" target="_blank" rel="noopener noreferrer">ggml‑org/gemma‑3‑270m‑GGUF</a> (currently providing <code>gemma-3-270m-Q8_0.gguf</code> ≈292 MB). For maximum speed on low‑RAM phones, the OpenELM‑270M‑Instruct Q3_K_S option above is even lighter, but Gemma‑3‑270M offers strong quality for its size.</div>
+      </details>
       <div class="row">
         <label>Max new tokens</label>
         <input id="nPredict" type="number" min="1" max="512" step="1" value="128" />
       </div>
       <div class="row">
         <label>Temp</label><input id="temp" type="number" min="0" max="2" step="0.1" value="0.7" style="width:80px" />
+        <label>Top‑p</label><input id="topp" type="number" min="0" max="1" step="0.05" value="0.9" style="width:80px" />
+        <label>Top‑k</label><input id="topk" type="number" min="1" max="100" step="1" value="40" style="width:80px" />
       </div>
       <div class="spacer"></div>
     const $send  = document.getElementById('sendBtn');
     const $stop  = document.getElementById('stopBtn');
     const $iso   = document.getElementById('isoNote');
+    const $customBox = document.getElementById('customBox');
+    const $customRepo = document.getElementById('customRepo');
+    const $customFile = document.getElementById('customFile');
     // ——— State ———
     const decoder = new TextDecoder();
     let eotToken = -1;
     let sysPrompt = "You are a helpful, concise assistant. Keep answers short and clear.";
+    // Keep RAM low for mobile: small context + int4 K‑cache; V‑cache stays FP16 (WASM safe)
     const LOAD_CONFIG = {
       n_ctx: 768,
       n_batch: 48,
+      cache_type_k: "q4_0",   // int4 K cache: reduces RAM without flash_attn
+      cache_type_v: "f16",    // IMPORTANT: V cache quant requires flash_attn; not available in WASM
+      flash_attn: false,
       progressCallback: ({ loaded, total }) => {
         const pct = (total && total > 0) ? Math.round(loaded / total * 100) : 0;
         $prog.style.width = pct + '%';
     function noteIsolation() {
       if (!crossOriginIsolated) {
+        $iso.innerHTML = 'Single‑thread mode (serve with COOP/COEP for multithread)';
       } else {
+        $iso.textContent = 'Cross‑origin isolated: multithread on';
       }
     }
     noteIsolation();
       messages.length = 0; messages.push(...kept);
     }
+    function getSelectedModel() {
+      const parsed = JSON.parse($model.value);
+      if (parsed.custom) {
+        const id = ($customRepo.value || '').trim();
+        const file = ($customFile.value || '').trim();
+        if (!id || !file) throw new Error('Enter HF repo id and GGUF file for custom model.');
+        return { id, file, label: `Custom: ${id}/${file}` };
+      }
+      return parsed;
+    }
     async function ensureLoaded() {
       if (loaded) return;
       $prog.style.width = '0%';
+      const choice = getSelectedModel();
+      ui.setStats(`Fetching ${choice.file}…`);
+      try {
+        await wllama.loadModelFromHF(choice.id, choice.file, LOAD_CONFIG);
+      } catch (e) {
+        // Common causes: gated repo, missing file, or CORS
+        throw new Error(`Load failed for ${choice.id}/${choice.file}. If the repo is gated or lacks CORS, try a public mirror / different quant. Details: ${e?.message || e}`);
+      }
       loaded = true;
       eotToken = wllama.getEOT();
       const meta = await wllama.getModelMetadata();
       const ctx = wllama.getLoadedContextInfo();
       const thr = wllama.getNumThreads?.() ?? 1;
+      ui.setStats(`Loaded ${choice.file} • ${meta.n_params?.toLocaleString?.() || ''} params • ctx ${ctx.n_ctx} • threads ${thr}`);
       $load.disabled = true; $unload.disabled = false;
     }
     document.getElementById('unloadBtn').addEventListener('click', unloadModel);
     document.getElementById('stopBtn').addEventListener('click', () => aborter?.abort());
+    $model.addEventListener('change', () => {
+      const isCustom = JSON.parse($model.value).custom === true;
+      $customBox.open = isCustom;
+    });
     $form.addEventListener('submit', async (ev) => {
       ev.preventDefault();
       const text = ($input.value || '').trim();
       }
     });
+    // Enter‑to‑send on mobile; Shift+Enter for newline
     $input.addEventListener('keydown', (e) => {
       if (e.key === 'Enter' && !e.shiftKey) {
         e.preventDefault();
   </script>
   <!--
+    What changed:
+    • Added multiple small, publicly downloadable GGUFs with CORS‑friendly repos:
+      - OpenELM‑270M‑Instruct (Q3_K_S / Q4_K_M) — ~134‑175 MB
+      - SmolLM‑135M‑Instruct (Q3_K_S) — ~88 MB
+      - SmolLM‑360M‑Instruct (Q3_K_S) — ~219 MB
+      - Qwen2.5‑0.5B‑Instruct (Q3_K_M / Q4_K_M) — ~432/491 MB
+      - TinyLlama‑1.1B‑Chat (Q3_K_S) — ~500 MB (optional, heavier but strong)
+      - Kept your SmolLM2‑360M options
+    • Added a "Custom HF GGUF" path so you can paste any public HF repo id + GGUF file name. The official google/gemma‑3‑270m repo is gated and not GGUF, so direct download from it is unlikely to work in‑browser; use the public ggml‑org/gemma‑3‑270m‑GGUF mirror (Q8_0, ≈292 MB) listed in the dropdown instead.
+    • Kept V‑cache at f16 and disabled flash_attn to avoid the llama.cpp error "V cache quantization requires flash_attn" in WASM.
+    • Modest n_ctx (768) and n_batch (48) to keep RAM low on phones.
+    • Pinned @wllama/wllama to 2.3.1 and referenced explicit .wasm URLs (no +esm).
+    Tips:
+    • For fastest sampling on low‑RAM phones: prefer Q3_K_S quants (OpenELM‑270M‑Instruct Q3_K_S or SmolLM‑135M‑Instruct Q3_K_S).
+    • For a nice quality bump still under ~500 MB: Qwen2.5‑0.5B‑Instruct Q3_K_M.
+    • Serve with COOP/COEP headers if you want multi‑threading.
   -->
 </body>
 </html>