Spaces: Running on A10G
Update app.py
app.py CHANGED
@@ -19,7 +19,7 @@ from textwrap import dedent
 HF_TOKEN = os.environ.get("HF_TOKEN")
 
 def generate_importance_matrix(model_path, train_data_path):
-    imatrix_command = f"./imatrix -m ../{model_path} -f {train_data_path} -ngl 99 --output-frequency 10"
+    imatrix_command = f"./llama-imatrix -m ../{model_path} -f {train_data_path} -ngl 99 --output-frequency 10"
 
     os.chdir("llama.cpp")
 
@@ -146,9 +146,9 @@ def process_model(model_id, q_method, use_imatrix, imatrix_q_method, private_rep
     quantized_gguf_name = f"{model_name.lower()}-{imatrix_q_method.lower()}-imat.gguf" if use_imatrix else f"{model_name.lower()}-{q_method.lower()}.gguf"
     quantized_gguf_path = quantized_gguf_name
     if use_imatrix:
-        quantise_ggml = f"./llama.cpp/quantize --imatrix {imatrix_path} {fp16} {quantized_gguf_path} {imatrix_q_method}"
+        quantise_ggml = f"./llama.cpp/llama-quantize --imatrix {imatrix_path} {fp16} {quantized_gguf_path} {imatrix_q_method}"
     else:
-        quantise_ggml = f"./llama.cpp/quantize {fp16} {quantized_gguf_path} {q_method}"
+        quantise_ggml = f"./llama.cpp/llama-quantize {fp16} {quantized_gguf_path} {q_method}"
     result = subprocess.run(quantise_ggml, shell=True, capture_output=True)
     if result.returncode != 0:
         raise Exception(f"Error quantizing: {result.stderr}")
@@ -186,7 +186,7 @@ def process_model(model_id, q_method, use_imatrix, imatrix_q_method, private_rep
 
 ### CLI:
 ```bash
-llama --hf-repo {new_repo_id} --hf-file {quantized_gguf_name} -p "The meaning to life and the universe is"
+llama-cli --hf-repo {new_repo_id} --hf-file {quantized_gguf_name} -p "The meaning to life and the universe is"
 ```
 
 ### Server:
@@ -208,11 +208,11 @@ def process_model(model_id, q_method, use_imatrix, imatrix_q_method, private_rep
 
 Step 3: Run inference through the main binary.
 ```
-./
+./llama-cli --hf-repo {new_repo_id} --hf-file {quantized_gguf_name} -p "The meaning to life and the universe is"
 ```
 or
 ```
-./llama-server --hf-repo {new_repo_id} --hf-file {quantized_gguf_name} -c 2048
+./llama-server --hf-repo {new_repo_id} --hf-file {quantized_gguf_name} -c 2048
 ```
 """
 )
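The change is mechanical: llama.cpp renamed its tool binaries under a `llama-` prefix (`imatrix` → `llama-imatrix`, `quantize` → `llama-quantize`, `llama` → `llama-cli`, `server` → `llama-server`), so every command the Space shells out to, and every usage snippet it writes into the generated model card, moves to the new names. For a checkout that might predate the rename, a resolver like the hypothetical sketch below could pick whichever binary exists; `resolve_llama_binary` is an illustration, not part of app.py.

```python
import os

def resolve_llama_binary(tool: str, build_dir: str = "llama.cpp") -> str:
    """Hypothetical helper: return the path to a llama.cpp tool binary.

    Tries the post-rename name first (e.g. "llama-quantize"), then falls
    back to the pre-rename name ("quantize") for older checkouts.
    """
    for name in (f"llama-{tool}", tool):
        path = os.path.join(build_dir, name)
        if os.path.isfile(path) and os.access(path, os.X_OK):
            return path
    raise FileNotFoundError(f"no '{tool}' binary found in {build_dir}")

# Usage: resolve_llama_binary("quantize") -> "llama.cpp/llama-quantize"
```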
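The quantize step itself keeps the app's pattern: build the command as an f-string, run it with `subprocess.run(..., shell=True, capture_output=True)`, and raise on a non-zero return code. Below is a minimal sketch of the same call under the renamed binary, using an argument list instead of a shell string; the function name and file paths are illustrative placeholders, not the app's actual code.

```python
import subprocess

def run_quantize(fp16_path, out_path, method, imatrix_path=None):
    # Same invocation the app builds as an f-string, expressed as an
    # argument list so no shell quoting is involved.
    cmd = ["./llama.cpp/llama-quantize"]
    if imatrix_path:
        cmd += ["--imatrix", imatrix_path]
    cmd += [fp16_path, out_path, method]
    result = subprocess.run(cmd, capture_output=True, text=True)
    if result.returncode != 0:
        # Mirrors the app's error handling: surface stderr on failure.
        raise Exception(f"Error quantizing: {result.stderr}")

# e.g. run_quantize("model-f16.gguf", "model-q4_k_m.gguf", "Q4_K_M",
#                   imatrix_path="imatrix.dat")
```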
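The model-card snippets rely on `llama-cli` and `llama-server` fetching the GGUF straight from the Hub via `--hf-repo`/`--hf-file`, as the diff shows. An equivalent two-step flow, assuming the `huggingface_hub` client is available, downloads the file first and passes a local path; the repo and file names below are placeholders.

```python
import subprocess
from huggingface_hub import hf_hub_download

# Placeholder repo/file: download the quantized GGUF once, then serve it
# locally instead of using llama-server's --hf-repo/--hf-file flags.
gguf_path = hf_hub_download(repo_id="your-username/model-GGUF",
                            filename="model-q4_k_m.gguf")
subprocess.run(["./llama.cpp/llama-server", "-m", gguf_path, "-c", "2048"])
```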