vLLM docker-compose setup
version: '3.3'
services:
  pdf-parser:
    image: vllm/vllm-openai:v0.7.0
    container_name: pdf-parser
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              device_ids: ['3']
              capabilities: [gpu]
    # environment:
    #   - CUDA_VISIBLE_DEVICES=0
    #   - VLLM_ATTENTION_BACKEND=FLASHINFER
    ports:
      - "${PARSER_PORT}:${PARSER_PORT}"
    volumes:
      - type: bind
        source: ${HF_CACHE_DIR}
        target: /root/.cache/huggingface
    command: --served-model-name ${PARSER_MODEL_NAME} --model ds4sd/SmolDocling-256M-preview --dtype bfloat16 --host 0.0.0.0 --port ${PARSER_PORT} --api-key ${PARSER_API_KEY} --max-model-len 8192 --gpu-memory-utilization 0.5
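Before pointing a client at the parser, it helps to confirm the container is actually serving the model. A minimal sketch, assuming the server is reachable at http://localhost:8082 and PARSER_API_KEY is "dummy" (substitute the host, port, and key your .env defines):

```
from openai import OpenAI

# Hypothetical values: replace the host, PARSER_PORT, and PARSER_API_KEY with
# whatever your .env defines for the compose file above.
client = OpenAI(api_key="dummy", base_url="http://localhost:8082/v1")

# vLLM's OpenAI-compatible server lists the served model under /v1/models;
# the id should match --served-model-name (PARSER_MODEL_NAME).
for model in client.models.list().data:
    print(model.id)
```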
from openai import OpenAI
import base64
from PIL import Image
import io
# Initialize the client
client = OpenAI(
    api_key="dummy",
    base_url="http://210.211.99.4:8082/v1"
)
model_name = "SmolDocling"
PROMPT_TEXT = "Convert page to Docling."
prompt = f"<|im_start|>User:<image>{PROMPT_TEXT}<end_of_utterance>\nAssistant:"
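# Note: this hand-written prompt string would only be needed with the plain
# /v1/completions endpoint; the chat.completions call below lets the vLLM
# server apply the model's chat template itself, so `prompt` goes unused here.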
# Function to encode an image to base64, downscaling large pages first
def encode_image(image_path, max_size=512):
    # Open the image and remember its original format
    # (resize() returns a new image whose .format is None)
    img = Image.open(image_path)
    original_format = img.format

    # Resize if larger than max_size, preserving the aspect ratio
    width, height = img.size
    if max(width, height) > max_size:
        scale = max_size / max(width, height)
        new_width = int(width * scale)
        new_height = int(height * scale)
        img = img.resize((new_width, new_height), Image.LANCZOS)
    print(f"Image size: {img.size}")

    # Convert to bytes: keep JPEG input as JPEG, save everything else as PNG
    buffer = io.BytesIO()
    img.save(buffer, format="JPEG" if original_format == "JPEG" else "PNG")
    buffer.seek(0)

    # Encode to base64
    return base64.b64encode(buffer.read()).decode("utf-8")
# Path to your image
image_path = "./pdf_images/page_1.png"
# Get the base64 string
base64_image = encode_image(image_path)
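# Note: base64_image is not used in the request below, which sends a hosted
# URL instead; a data-URL variant is shown after this script.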
# Create the payload
response = client.chat.completions.create(
    model=model_name,
    messages=[
        {
            "role": "user",
            "content": [
                {"type": "image_url", "image_url": {"url": "https://stg-s3.distilled.ai/distilled/images/page_1.png"}},
                {"type": "text", "text": f"{PROMPT_TEXT}"}
            ]
        }
    ],
    max_tokens=300
)
# Print the response
print(response.choices[0].message.content)
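The script above computes base64_image but then sends a hosted URL in the payload. If the server should receive the locally resized page instead, the OpenAI-compatible chat API also accepts base64 data URLs. A minimal sketch reusing the client, helper, and prompt text defined above ("image/png" is an assumption; match it to the format encode_image actually wrote):

```
# Variant of the request that sends the locally encoded image as a data URL
# instead of a remote URL. "image/png" is assumed because encode_image writes
# non-JPEG input as PNG.
response = client.chat.completions.create(
    model=model_name,
    messages=[
        {
            "role": "user",
            "content": [
                {
                    "type": "image_url",
                    "image_url": {"url": f"data:image/png;base64,{base64_image}"},
                },
                {"type": "text", "text": PROMPT_TEXT},
            ],
        }
    ],
    max_tokens=300,
)
print(response.choices[0].message.content)
```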
Logs
```
	
BadRequestError                           Traceback (most recent call last)
Cell In[4], line 50
     47 base64_image = encode_image(image_path)
     49 # Create the payload
---> 50 response = client.chat.completions.create(
     51     model=model_name,
     52     messages=[
     53         {
     54             "role": "user",
     55             "content": [
     56                 {"type": "image_url", "image_url": {"url": "https://stg-s3.distilled.ai/distilled/images/page_1.png"}},
     57                 {"type": "text", "text": f"{PROMPT_TEXT}"}
     58             ]
     59         }
     60     ],
     61     max_tokens=300
     62 )
     64 # Print the response
     65 print(response.choices[0].message.content)
File ~/.venv/lib/python3.10/site-packages/openai/_utils/_utils.py:279, in required_args..inner..wrapper(*args, **kwargs)
    277             msg = f"Missing required argument: {quote(missing[0])}"
    278     raise TypeError(msg)
--> 279 return func(*args, **kwargs)
File ~/.venv/lib/python3.10/site-packages/openai/resources/chat/completions/completions.py:879, in Completions.create(self, messages, model, audio, frequency_penalty, function_call, functions, logit_bias, logprobs, max_completion_tokens, max_tokens, metadata, modalities, n, parallel_tool_calls, prediction, presence_penalty, reasoning_effort, response_format, seed, service_tier, stop, store, stream, stream_options, temperature, tool_choice, tools, top_logprobs, top_p, user, extra_headers, extra_query, extra_body, timeout)
    837 @required_args(["messages", "model"], ["messages", "model", "stream"])
    838 def create(
    839     self,
   (...)
    876     timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
    877 ) -> ChatCompletion | Stream[ChatCompletionChunk]:
    878     validate_response_format(response_format)
--> 879     return self._post(
    880         "/chat/completions",
    881         body=maybe_transform(
    882             {
    883                 "messages": messages,
    884                 "model": model,
    885                 "audio": audio,
    886                 "frequency_penalty": frequency_penalty,
    887                 "function_call": function_call,
    888                 "functions": functions,
    889                 "logit_bias": logit_bias,
    890                 "logprobs": logprobs,
    891                 "max_completion_tokens": max_completion_tokens,
    892                 "max_tokens": max_tokens,
    893                 "metadata": metadata,
    894                 "modalities": modalities,
    895                 "n": n,
    896                 "parallel_tool_calls": parallel_tool_calls,
    897                 "prediction": prediction,
    898                 "presence_penalty": presence_penalty,
    899                 "reasoning_effort": reasoning_effort,
    900                 "response_format": response_format,
    901                 "seed": seed,
    902                 "service_tier": service_tier,
    903                 "stop": stop,
    904                 "store": store,
    905                 "stream": stream,
    906                 "stream_options": stream_options,
    907                 "temperature": temperature,
    908                 "tool_choice": tool_choice,
    909                 "tools": tools,
    910                 "top_logprobs": top_logprobs,
    911                 "top_p": top_p,
    912                 "user": user,
    913             },
    914             completion_create_params.CompletionCreateParams,
    915         ),
    916         options=make_request_options(
    917             extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout
    918         ),
    919         cast_to=ChatCompletion,
    920         stream=stream or False,
    921         stream_cls=Stream[ChatCompletionChunk],
    922     )
File ~/.venv/lib/python3.10/site-packages/openai/_base_client.py:1296, in SyncAPIClient.post(self, path, cast_to, body, options, files, stream, stream_cls)
   1282 def post(
   1283     self,
   1284     path: str,
   (...)
   1291     stream_cls: type[_StreamT] | None = None,
   1292 ) -> ResponseT | _StreamT:
   1293     opts = FinalRequestOptions.construct(
   1294         method="post", url=path, json_data=body, files=to_httpx_files(files), **options
   1295     )
-> 1296     return cast(ResponseT, self.request(cast_to, opts, stream=stream, stream_cls=stream_cls))
File ~/.venv/lib/python3.10/site-packages/openai/_base_client.py:973, in SyncAPIClient.request(self, cast_to, options, remaining_retries, stream, stream_cls)
    970 else:
    971     retries_taken = 0
--> 973 return self._request(
    974     cast_to=cast_to,
    975     options=options,
    976     stream=stream,
    977     stream_cls=stream_cls,
    978     retries_taken=retries_taken,
    979 )
File ~/.venv/lib/python3.10/site-packages/openai/_base_client.py:1077, in SyncAPIClient._request(self, cast_to, options, retries_taken, stream, stream_cls)
   1074         err.response.read()
   1076     log.debug("Re-raising status error")
-> 1077     raise self._make_status_error_from_response(err.response) from None
   1079 return self._process_response(
   1080     cast_to=cast_to,
   1081     options=options,
   (...)
   1085     retries_taken=retries_taken,
   1086 )
BadRequestError: Error code: 400 - {'object': 'error', 'message': 'resolution_max_side cannot be larger than max_image_size', 'type': 'BadRequestError', 'param': None, 'code': 400}
```
The problem is that the vLLM version (the v0.7.0 image) is too old. I used Python 3.11 to install the latest vLLM, which solved it.
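For reference, a quick way to confirm which vLLM build the new Python 3.11 environment actually picked up (a minimal sketch; run it inside that environment):

```
# Print the installed vLLM version to confirm the upgrade from the v0.7.0
# image used in the compose file above.
import vllm
print(vllm.__version__)
```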