Files changed (1)
  1. README.md +39 -20
README.md CHANGED
@@ -187,14 +187,22 @@ More usage can be found [here](https://docs.sglang.ai/basic_usage/send_request.h

#### Environment Preparation

- Since the Pull Request (PR) has not been submitted to the vLLM community at this stage, please prepare the environment by following the steps below:
+ Since the Pull Request (PR) has not yet been submitted to the vLLM community, please prepare the environment by following the steps below.
+
+ First, create a Conda environment with Python 3.10 and CUDA 12.8:
```shell
- pip install torch==2.7.0 torchvision==0.22.0
+ conda create -n vllm python=3.10
+ conda activate vllm
```

- Then you should install our vLLM wheel package:
+ Next, install our vLLM wheel package:
```shell
- pip install https://media.githubusercontent.com/media/inclusionAI/Ring-V2/refs/heads/main/hybrid_linear/whls/vllm-0.8.5%2Bcuda12_8_gcc10_2_1-cp310-cp310-linux_x86_64.whl --no-deps --force-reinstall
+ pip install https://media.githubusercontent.com/media/zheyishine/vllm_whl/refs/heads/main/vllm-0.8.5.post2.dev28%2Bgd327eed71.cu128-cp310-cp310-linux_x86_64.whl --force-reinstall
+ ```
+
+ Finally, install a compatible version of transformers after vLLM is installed:
+ ```shell
+ pip install transformers==4.51.1
```

#### Offline Inference
@@ -203,28 +211,39 @@ pip install https://media.githubusercontent.com/media/inclusionAI/Ring-V2/refs/h
from transformers import AutoTokenizer
from vllm import LLM, SamplingParams

- tokenizer = AutoTokenizer.from_pretrained("inclusionAI/Ring-flash-linear-2.0")
-
- sampling_params = SamplingParams(temperature=0.6, top_p=1.0, max_tokens=8192)
+ if __name__ == '__main__':
+     tokenizer = AutoTokenizer.from_pretrained("inclusionAI/Ring-flash-linear-2.0", trust_remote_code=True)
+
+     sampling_params = SamplingParams(temperature=0.6, top_p=1.0, max_tokens=16384)

- llm = LLM(model="inclusionAI/Ring-flash-linear-2.0", dtype='bfloat16', enable_prefix_caching=False)
- prompt = "Give me a short introduction to large language models."
- messages = [
-     {"role": "user", "content": prompt}
- ]
-
- text = tokenizer.apply_chat_template(
-     messages,
-     tokenize=False,
-     add_generation_prompt=True
- )
- outputs = llm.generate([text], sampling_params)
+     # use `max_num_seqs=1` if requests are not sent concurrently
+     llm = LLM(model="inclusionAI/Ring-flash-linear-2.0", dtype='auto', enable_prefix_caching=False, max_num_seqs=128)
+
+     prompt = "Give me a short introduction to large language models."
+     messages = [
+         {"role": "user", "content": prompt}
+     ]
+
+     text = tokenizer.apply_chat_template(
+         messages,
+         tokenize=False,
+         add_generation_prompt=True
+     )
+     outputs = llm.generate([text], sampling_params)
+     for output in outputs:
+         print(output.outputs[0].text)
```

#### Online Inference
```shell
vllm serve inclusionAI/Ring-flash-linear-2.0 \
-     --tensor-parallel-size 4 \
+     --tensor-parallel-size 2 \
+     --pipeline-parallel-size 1 \
    --gpu-memory-utilization 0.90 \
+     --max-num-seqs 128 \
+     --api-key your-api-key \
    --no-enable-prefix-caching
```
248
+
249
+
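
Once the server above is running, the model is exposed through vLLM's OpenAI-compatible API. Below is a minimal client sketch, assuming the default port 8000, the served model name `inclusionAI/Ring-flash-linear-2.0`, and the `--api-key` value from the serve command; adjust these to match your deployment.

```python
# Minimal client sketch: query the vLLM OpenAI-compatible server started above.
# Assumptions: server on http://localhost:8000 (vLLM's default) and --api-key "your-api-key".
from openai import OpenAI

client = OpenAI(
    base_url="http://localhost:8000/v1",  # vLLM's OpenAI-compatible endpoint
    api_key="your-api-key",               # must match the --api-key passed to `vllm serve`
)

response = client.chat.completions.create(
    model="inclusionAI/Ring-flash-linear-2.0",  # served model name defaults to the model path
    messages=[
        {"role": "user", "content": "Give me a short introduction to large language models."}
    ],
    temperature=0.6,
    top_p=1.0,
    max_tokens=16384,
)
print(response.choices[0].message.content)
```

A plain `curl` POST to `http://localhost:8000/v1/chat/completions` with the same JSON fields works as well.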