Spaces:
Running
Running
Enable speculative decoding
Browse files
app.py
CHANGED
|
@@ -1,5 +1,6 @@
|
|
| 1 |
import streamlit as st
|
| 2 |
from llama_cpp import Llama
|
|
|
|
| 3 |
from huggingface_hub import hf_hub_download
|
| 4 |
import os, gc, shutil, re
|
| 5 |
from itertools import islice
|
|
@@ -139,13 +140,15 @@ def try_load_model(path):
|
|
| 139 |
return Llama(
|
| 140 |
model_path=path,
|
| 141 |
n_ctx=512, # Reduced context window to save memory
|
| 142 |
-
n_threads=
|
| 143 |
n_threads_batch=1,
|
| 144 |
-
n_batch=
|
| 145 |
n_gpu_layers=0,
|
| 146 |
use_mlock=False,
|
| 147 |
use_mmap=True,
|
| 148 |
verbose=False,
|
|
|
|
|
|
|
| 149 |
)
|
| 150 |
except Exception as e:
|
| 151 |
return str(e)
|
|
|
|
| 1 |
import streamlit as st
|
| 2 |
from llama_cpp import Llama
|
| 3 |
+
from llama_cpp.llama_speculative import LlamaPromptLookupDecoding
|
| 4 |
from huggingface_hub import hf_hub_download
|
| 5 |
import os, gc, shutil, re
|
| 6 |
from itertools import islice
|
|
|
|
| 140 |
return Llama(
|
| 141 |
model_path=path,
|
| 142 |
n_ctx=512, # Reduced context window to save memory
|
| 143 |
+
n_threads=2, # Fewer threads for resource-constrained environments
|
| 144 |
n_threads_batch=1,
|
| 145 |
+
n_batch=64, # Lower batch size to conserve memory
|
| 146 |
n_gpu_layers=0,
|
| 147 |
use_mlock=False,
|
| 148 |
use_mmap=True,
|
| 149 |
verbose=False,
|
| 150 |
+
logits_all=True,
|
| 151 |
+
draft_model=LlamaPromptLookupDecoding(num_pred_tokens=2),
|
| 152 |
)
|
| 153 |
except Exception as e:
|
| 154 |
return str(e)
|