vLLM docker-compose setup
version: '3.3'
services:
  pdf-parser:
    image: vllm/vllm-openai:v0.7.0
    container_name: pdf-parser
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              device_ids: ['3']
              capabilities: [gpu]
    # environment:
    #   - CUDA_VISIBLE_DEVICES=0
    #   - VLLM_ATTENTION_BACKEND=FLASHINFER
    ports:
      - "${PARSER_PORT}:${PARSER_PORT}"
    volumes:
      - type: bind
        source: ${HF_CACHE_DIR}
        target: /root/.cache/huggingface
    command: --served-model-name ${PARSER_MODEL_NAME} --model ds4sd/SmolDocling-256M-preview --dtype bfloat16 --host 0.0.0.0 --port ${PARSER_PORT} --api-key ${PARSER_API_KEY} --max-model-len 8192 --gpu-memory-utilization 0.5
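Before pointing a client at the parser, it helps to confirm the container is actually serving the model. A minimal sketch, assuming the server is reachable at http://localhost:8082 and PARSER_API_KEY is "dummy" (substitute the host, port, and key your .env defines):

```
from openai import OpenAI

# Hypothetical values: replace the host, PARSER_PORT, and PARSER_API_KEY with
# whatever your .env defines for the compose file above.
client = OpenAI(api_key="dummy", base_url="http://localhost:8082/v1")

# vLLM's OpenAI-compatible server lists the served model under /v1/models;
# the id should match --served-model-name (PARSER_MODEL_NAME).
for model in client.models.list().data:
    print(model.id)
```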
from openai import OpenAI
import base64
from PIL import Image
import io
# Initialize the client
client = OpenAI(
    api_key="dummy",
    base_url="http://210.211.99.4:8082/v1"
)
model_name = "SmolDocling"
PROMPT_TEXT = "Convert page to Docling."
prompt = f"<|im_start|>User:<image>{PROMPT_TEXT}<end_of_utterance>\nAssistant:"
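# Note: this hand-written prompt string would only be needed with the plain
# /v1/completions endpoint; the chat.completions call below lets the vLLM
# server apply the model's chat template itself, so `prompt` goes unused here.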
# Function to encode an image to base64, downscaling large pages first
def encode_image(image_path, max_size=512):
    # Open the image and remember its original format
    # (resize() returns a new image whose .format is None)
    img = Image.open(image_path)
    original_format = img.format

    # Resize if larger than max_size, preserving the aspect ratio
    width, height = img.size
    if max(width, height) > max_size:
        scale = max_size / max(width, height)
        new_width = int(width * scale)
        new_height = int(height * scale)
        img = img.resize((new_width, new_height), Image.LANCZOS)
    print(f"Image size: {img.size}")

    # Convert to bytes: keep JPEG input as JPEG, save everything else as PNG
    buffer = io.BytesIO()
    img.save(buffer, format="JPEG" if original_format == "JPEG" else "PNG")
    buffer.seek(0)

    # Encode to base64
    return base64.b64encode(buffer.read()).decode("utf-8")
# Path to your image
image_path = "./pdf_images/page_1.png"
# Get the base64 string
base64_image = encode_image(image_path)
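# Note: base64_image is not used in the request below, which sends a hosted
# URL instead; a data-URL variant is shown after this script.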
# Create the payload
response = client.chat.completions.create(
    model=model_name,
    messages=[
        {
            "role": "user",
            "content": [
                {"type": "image_url", "image_url": {"url": "https://stg-s3.distilled.ai/distilled/images/page_1.png"}},
                {"type": "text", "text": f"{PROMPT_TEXT}"}
            ]
        }
    ],
    max_tokens=300
)
# Print the response
print(response.choices[0].message.content)
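The script above computes base64_image but then sends a hosted URL in the payload. If the server should receive the locally resized page instead, the OpenAI-compatible chat API also accepts base64 data URLs. A minimal sketch reusing the client, helper, and prompt text defined above ("image/png" is an assumption; match it to the format encode_image actually wrote):

```
# Variant of the request that sends the locally encoded image as a data URL
# instead of a remote URL. "image/png" is assumed because encode_image writes
# non-JPEG input as PNG.
response = client.chat.completions.create(
    model=model_name,
    messages=[
        {
            "role": "user",
            "content": [
                {
                    "type": "image_url",
                    "image_url": {"url": f"data:image/png;base64,{base64_image}"},
                },
                {"type": "text", "text": PROMPT_TEXT},
            ],
        }
    ],
    max_tokens=300,
)
print(response.choices[0].message.content)
```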
Logs
```
	
BadRequestError                           Traceback (most recent call last)
Cell In[4], line 50
     47 base64_image = encode_image(image_path)
     49 # Create the payload
---> 50 response = client.chat.completions.create(
     51     model=model_name,
     52     messages=[
     53         {
     54             "role": "user",
     55             "content": [
     56                 {"type": "image_url", "image_url": {"url": "https://stg-s3.distilled.ai/distilled/images/page_1.png"}},
     57                 {"type": "text", "text": f"{PROMPT_TEXT}"}
     58             ]
     59         }
     60     ],
     61     max_tokens=300
     62 )
     64 # Print the response
     65 print(response.choices[0].message.content)
File ~/.venv/lib/python3.10/site-packages/openai/_utils/_utils.py:279, in required_args..inner..wrapper(*args, **kwargs)
    277             msg = f"Missing required argument: {quote(missing[0])}"
    278     raise TypeError(msg)
--> 279 return func(*args, **kwargs)
File ~/.venv/lib/python3.10/site-packages/openai/resources/chat/completions/completions.py:879, in Completions.create(self, messages, model, audio, frequency_penalty, function_call, functions, logit_bias, logprobs, max_completion_tokens, max_tokens, metadata, modalities, n, parallel_tool_calls, prediction, presence_penalty, reasoning_effort, response_format, seed, service_tier, stop, store, stream, stream_options, temperature, tool_choice, tools, top_logprobs, top_p, user, extra_headers, extra_query, extra_body, timeout)
    837 @required_args(["messages", "model"], ["messages", "model", "stream"])
    838 def create(
    839     self,
   (...)
    876     timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
    877 ) -> ChatCompletion | Stream[ChatCompletionChunk]:
    878     validate_response_format(response_format)
--> 879     return self._post(
    880         "/chat/completions",
    881         body=maybe_transform(
    882             {
    883                 "messages": messages,
    884                 "model": model,
    885                 "audio": audio,
    886                 "frequency_penalty": frequency_penalty,
    887                 "function_call": function_call,
    888                 "functions": functions,
    889                 "logit_bias": logit_bias,
    890                 "logprobs": logprobs,
    891                 "max_completion_tokens": max_completion_tokens,
    892                 "max_tokens": max_tokens,
    893                 "metadata": metadata,
    894                 "modalities": modalities,
    895                 "n": n,
    896                 "parallel_tool_calls": parallel_tool_calls,
    897                 "prediction": prediction,
    898                 "presence_penalty": presence_penalty,
    899                 "reasoning_effort": reasoning_effort,
    900                 "response_format": response_format,
    901                 "seed": seed,
    902                 "service_tier": service_tier,
    903                 "stop": stop,
    904                 "store": store,
    905                 "stream": stream,
    906                 "stream_options": stream_options,
    907                 "temperature": temperature,
    908                 "tool_choice": tool_choice,
    909                 "tools": tools,
    910                 "top_logprobs": top_logprobs,
    911                 "top_p": top_p,
    912                 "user": user,
    913             },
    914             completion_create_params.CompletionCreateParams,
    915         ),
    916         options=make_request_options(
    917             extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout
    918         ),
    919         cast_to=ChatCompletion,
    920         stream=stream or False,
    921         stream_cls=Stream[ChatCompletionChunk],
    922     )
File ~/.venv/lib/python3.10/site-packages/openai/_base_client.py:1296, in SyncAPIClient.post(self, path, cast_to, body, options, files, stream, stream_cls)
   1282 def post(
   1283     self,
   1284     path: str,
   (...)
   1291     stream_cls: type[_StreamT] | None = None,
   1292 ) -> ResponseT | _StreamT:
   1293     opts = FinalRequestOptions.construct(
   1294         method="post", url=path, json_data=body, files=to_httpx_files(files), **options
   1295     )
-> 1296     return cast(ResponseT, self.request(cast_to, opts, stream=stream, stream_cls=stream_cls))
File ~/.venv/lib/python3.10/site-packages/openai/_base_client.py:973, in SyncAPIClient.request(self, cast_to, options, remaining_retries, stream, stream_cls)
    970 else:
    971     retries_taken = 0
--> 973 return self._request(
    974     cast_to=cast_to,
    975     options=options,
    976     stream=stream,
    977     stream_cls=stream_cls,
    978     retries_taken=retries_taken,
    979 )
File ~/.venv/lib/python3.10/site-packages/openai/_base_client.py:1077, in SyncAPIClient._request(self, cast_to, options, retries_taken, stream, stream_cls)
   1074         err.response.read()
   1076     log.debug("Re-raising status error")
-> 1077     raise self._make_status_error_from_response(err.response) from None
   1079 return self._process_response(
   1080     cast_to=cast_to,
   1081     options=options,
   (...)
   1085     retries_taken=retries_taken,
   1086 )
BadRequestError: Error code: 400 - {'object': 'error', 'message': 'resolution_max_side cannot be larger than max_image_size', 'type': 'BadRequestError', 'param': None, 'code': 400}
```
The problem is that the vLLM version (the v0.7.0 image) is too old. I used Python 3.11 to install the latest vLLM, which solved it.
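For reference, a quick way to confirm which vLLM build the new Python 3.11 environment actually picked up (a minimal sketch; run it inside that environment):

```
# Print the installed vLLM version to confirm the upgrade from the v0.7.0
# image used in the compose file above.
import vllm
print(vllm.__version__)
```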