Spaces:
Running
Running
Enable speculative decoding
Browse files
app.py
CHANGED
|
@@ -1,5 +1,6 @@
|
|
| 1 |
import streamlit as st
|
| 2 |
from llama_cpp import Llama
|
|
|
|
| 3 |
from huggingface_hub import hf_hub_download
|
| 4 |
import os, gc, shutil, re
|
| 5 |
from itertools import islice
|
|
@@ -139,13 +140,15 @@ def try_load_model(path):
|
|
| 139 |
return Llama(
|
| 140 |
model_path=path,
|
| 141 |
n_ctx=512, # Reduced context window to save memory
|
| 142 |
-
n_threads=
|
| 143 |
n_threads_batch=1,
|
| 144 |
-
n_batch=
|
| 145 |
n_gpu_layers=0,
|
| 146 |
use_mlock=False,
|
| 147 |
use_mmap=True,
|
| 148 |
verbose=False,
|
|
|
|
|
|
|
| 149 |
)
|
| 150 |
except Exception as e:
|
| 151 |
return str(e)
|
|
|
|
| 1 |
import streamlit as st
|
| 2 |
from llama_cpp import Llama
|
| 3 |
+
from llama_cpp.llama_speculative import LlamaPromptLookupDecoding
|
| 4 |
from huggingface_hub import hf_hub_download
|
| 5 |
import os, gc, shutil, re
|
| 6 |
from itertools import islice
|
|
|
|
| 140 |
return Llama(
|
| 141 |
model_path=path,
|
| 142 |
n_ctx=512, # Reduced context window to save memory
|
| 143 |
+
n_threads=2, # Fewer threads for resource-constrained environments
|
| 144 |
n_threads_batch=1,
|
| 145 |
+
n_batch=64, # Lower batch size to conserve memory
|
| 146 |
n_gpu_layers=0,
|
| 147 |
use_mlock=False,
|
| 148 |
use_mmap=True,
|
| 149 |
verbose=False,
|
| 150 |
+
logits_all=True,
|
| 151 |
+
draft_model=LlamaPromptLookupDecoding(num_pred_tokens=2),
|
| 152 |
)
|
| 153 |
except Exception as e:
|
| 154 |
return str(e)
|