########################
#install nginx
#sudo apt update
#sudo apt install nginx
#sudo vi /etc/nginx/sites-available/default   #edit the default server block (sketch below)
#sudo systemctl start nginx
###########################
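#a minimal sketch of the edit above, assuming nginx should reverse-proxy
#port 80 to one of the local servers; the upstream port (8765) is an
#assumption, not taken from this script:
#sudo tee /etc/nginx/sites-available/default >/dev/null <<'EOF'
#server {
#    listen 80;
#    location / {
#        proxy_pass http://127.0.0.1:8765;
#        proxy_set_header Host $host;
#        proxy_set_header X-Real-IP $remote_addr;
#    }
#}
#EOF
#sudo nginx -t && sudo systemctl reload nginx   #validate config, then reload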
#launch local servers
cd /data/2024/1018chatbotarena/llama.cpp/download
#vllm calm3-22b-chat
export CUDA_VISIBLE_DEVICES=0
python -m vllm.entrypoints.openai.api_server --model cyberagent/calm3-22b-chat \
--max-model-len 4096 --port 8011 \
--gpu-memory-utilization 0.4 --trust-remote-code \
--quantization bitsandbytes --load-format bitsandbytes \
--api-key $VLLM_API_KEY
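#smoke test for the server above: a minimal OpenAI-compatible chat request
#(prompt and max_tokens are just example values):
curl http://localhost:8011/v1/chat/completions \
-H "Content-Type: application/json" \
-H "Authorization: Bearer $VLLM_API_KEY" \
-d '{"model": "cyberagent/calm3-22b-chat", "messages": [{"role": "user", "content": "Hello"}], "max_tokens": 32}'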
#vllm tanuki 8b
export CUDA_VISIBLE_DEVICES=0
python -m vllm.entrypoints.openai.api_server --model weblab-GENIAC/Tanuki-8B-dpo-v1.0 \
--max-model-len 4096 --port 8012 \
--gpu-memory-utilization 0.2 --trust-remote-code \
--quantization bitsandbytes --load-format bitsandbytes \
--api-key $VLLM_API_KEY
#llama.cpp swallow 8b
export CUDA_VISIBLE_DEVICES=0
../llama-server -m tokyotech-llm-Llama-3.1-Swallow-8B-Instruct-v0.1-Q8_0.gguf --n-gpu-layers 100 --port 8010
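#llama-server serves an OpenAI-compatible API without an API key by default;
#quick liveness check (the /health endpoint is built into llama.cpp's server):
curl http://localhost:8010/health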
#llama.cpp llm-jp-3 13b
export CUDA_VISIBLE_DEVICES=0
../llama-server -m llm-jp-3-13b-instruct-Q8_0.gguf --n-gpu-layers 100 --port 8016
#vllm swallow 70b
export CUDA_VISIBLE_DEVICES=1
python -m vllm.entrypoints.openai.api_server --model tokyotech-llm/Llama-3.1-Swallow-70B-Instruct-v0.1 \
--max-model-len 4096 --port 8019 \
--gpu-memory-utilization 0.6 --trust-remote-code \
--quantization bitsandbytes --load-format bitsandbytes \
--api-key $VLLM_API_KEY
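#several servers share each GPU via --gpu-memory-utilization, so check the
#remaining headroom before launching the next model (sketch):
nvidia-smi --query-gpu=index,memory.used,memory.total --format=csv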
#gemma (disabled)
#export CUDA_VISIBLE_DEVICES=1
#../llama-server -m gemma-2-2B-jpn-it-BF16.gguf --n-gpu-layers 100 --port 8020
#vllm tanuki 8x8b (AWQ)
export CUDA_VISIBLE_DEVICES=1
python -m vllm.entrypoints.openai.api_server --model team-hatakeyama-phase2/Tanuki-8x8B-dpo-v1.0-AWQ \
--max-model-len 4096 --port 8020 \
--gpu-memory-utilization 0.35 --trust-remote-code \
--quantization awq --api-key $VLLM_API_KEY
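#AWQ checkpoints are pre-quantized, so no --load-format flag is needed here.
#to confirm the server is up, list the served models (sketch):
curl http://localhost:8020/v1/models -H "Authorization: Bearer $VLLM_API_KEY"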
###################
#server2: vllm granite 3.0 8b (fp8)
export CUDA_VISIBLE_DEVICES=0
python -m vllm.entrypoints.openai.api_server --model ibm-granite/granite-3.0-8b-instruct --max-model-len 4096 \
--port 8020 --gpu-memory-utilization 0.4 --trust-remote-code \
--quantization fp8 \
--api-key $VLLM_API_KEY
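#optional sketch: poll every endpoint launched above; both vllm and
#llama-server expose /health (in recent vllm versions it is exempt from the
#API-key check):
for port in 8010 8011 8012 8016 8019 8020; do
  echo -n "port $port: "
  curl -s -o /dev/null -w "%{http_code}\n" http://localhost:$port/health
done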
#########################
#launch ngrok
ngrok http http://localhost:8765
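#the assigned public URL can be read back from ngrok's local inspection API
#(default port 4040; the agent above must be running):
curl -s http://127.0.0.1:4040/api/tunnels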