vllm.entrypoints.chat_utils ¶

PROMPT_EMBEDS_PLACEHOLDER_TOKEN `module-attribute` ¶

# Sentinel emitted once per `prompt_embeds` content part while the chat template
# is rendered; the renderer later expands each sentinel into one placeholder
# token per embedding position.
PROMPT_EMBEDS_PLACEHOLDER_TOKEN: Final[str] = (
    "<prompt_embeds>"
)

The special token used as a placeholder for each embedding position during chat template rendering.

Registered as an additional special token when --enable-prompt-embeds is set. See _ensure_prompt_embeds_placeholder_token in vllm/renderers/hf.py.

AsyncMultiModalContentParser ¶

Bases: BaseMultiModalContentParser

Source code in vllm/entrypoints/chat_utils.py

class AsyncMultiModalContentParser(BaseMultiModalContentParser):
    """Content parser that registers media with an async item tracker.

    URL-based media (image, audio, video) and prompt embeds are handed to the
    tracker as un-awaited coroutines. Embeddings and PIL images are handed over
    as `asyncio.Future` objects whose result is set before registration. Every
    registered awaitable yields an `(item, uuid)` tuple.
    """

    def __init__(
        self,
        tracker: AsyncMultiModalItemTracker,
        mm_processor_kwargs: dict[str, Any] | None = None,
    ) -> None:
        """Bind the parser to `tracker` and load the media connector.

        Args:
            tracker: Receives every parsed item. Its `media_io_kwargs`,
                `allowed_local_media_path` and `allowed_media_domains` are
                forwarded to the media connector.
            mm_processor_kwargs: Optional processor kwargs. Within this class
                only `use_audio_in_video` is read (by `parse_video`).
        """
        super().__init__()

        self._tracker = tracker
        self._connector: MediaConnector = MEDIA_CONNECTOR_REGISTRY.load(
            envs.VLLM_MEDIA_CONNECTOR,
            media_io_kwargs=tracker.media_io_kwargs,
            allowed_local_media_path=tracker.allowed_local_media_path,
            allowed_media_domains=tracker.allowed_media_domains,
        )
        self._mm_processor_kwargs: dict[str, Any] | None = mm_processor_kwargs

    @property
    def model_config(self) -> ModelConfig:
        """Model config of the underlying tracker."""
        return self._tracker.model_config

    @override
    def parse_prompt_embeds(self, data: str) -> None:
        """Schedule async prompt embeds decode and store the coroutine in the tracker.

        Like the sync variant, emits a single sentinel `PROMPT_EMBEDS_PLACEHOLDER_TOKEN`
        per content part. Unlike the sync variant, the tensor decode is deferred to a
        thread-pool executor via `safe_load_prompt_embeds_async`.

        Raises:
            ValueError: If prompt embeds are not enabled in the model config.
        """
        if not self.model_config.enable_prompt_embeds:
            raise ValueError(_ENABLE_PROMPT_EMBEDS_ERROR)

        coro = self._load_prompt_embeds_async(data.encode())
        self._tracker.add("prompt_embeds", coro)
        self._add_placeholder("prompt_embeds", PROMPT_EMBEDS_PLACEHOLDER_TOKEN)

    async def _load_prompt_embeds_async(
        self, data_bytes: bytes
    ) -> tuple[torch.Tensor, None]:
        """Decode `data_bytes` into a tensor and return `(tensor, None)`."""
        # Second tuple slot fills the tracker's generic `(item, uuid | None)`
        # contract. prompt_embeds has no UUID concept, so it's always `None`.
        tensor = await safe_load_prompt_embeds_async(self.model_config, data_bytes)
        return tensor, None

    @staticmethod
    def _resolved_embeds_future(
        embeds: str | dict[str, str] | None,
        uuid: str | None,
        fetch_embedding: Any,
        field_name: str,
    ) -> "asyncio.Future[tuple[Any, str | None]]":
        """Return a future already resolved to `(decoded_embeds, uuid)`.

        Args:
            embeds: A single encoded embedding, a dict of encoded embeddings,
                or `None`.
            uuid: Passed through unchanged as the second tuple element.
            fetch_embedding: Callable applied to each encoded string.
            field_name: Request field name, used only in the error message.

        Raises:
            ValueError: If `embeds` is not a `str`, `dict` or `None`. Without
                this check the future would never be resolved and whoever
                awaits it would wait forever.
        """
        future = asyncio.Future[
            tuple[torch.Tensor | dict[str, torch.Tensor] | None, str | None]
        ]()

        if embeds is None:
            future.set_result((None, uuid))
        elif isinstance(embeds, dict):
            decoded = {k: fetch_embedding(v) for k, v in embeds.items()}
            future.set_result((decoded, uuid))
        elif isinstance(embeds, str):
            future.set_result((fetch_embedding(embeds), uuid))
        else:
            raise ValueError(
                f"`{field_name}` must be a string, a dict of strings, or None, "
                f"but got {type(embeds).__name__}"
            )

        return future

    async def _image_with_uuid_async(self, image_url: str | None, uuid: str | None):
        """Fetch the image when `image_url` is truthy; return `(image, uuid)`."""
        image = (
            await self._connector.fetch_image_async(image_url) if image_url else None
        )
        return image, uuid

    def parse_image(self, image_url: str | None, uuid: str | None = None) -> None:
        """Register an image fetch coroutine and add its placeholder."""
        coro = self._image_with_uuid_async(image_url, uuid)

        placeholder = self._tracker.add("image", coro)
        self._add_placeholder("image", placeholder)

    def parse_image_embeds(
        self,
        image_embeds: str | dict[str, str] | None,
        uuid: str | None = None,
    ) -> None:
        """Decode image embeddings and register them under `image_embeds`.

        The embeddings are fetched synchronously while parsing; the tracker
        receives an already-resolved future.

        Raises:
            ValueError: If `--enable-mm-embeds` is not set, or if
                `image_embeds` has an unsupported type.
        """
        mm_config = self.model_config.get_multimodal_config()
        if not mm_config.enable_mm_embeds:
            raise ValueError(
                "You must set `--enable-mm-embeds` to input `image_embeds`"
            )

        future = self._resolved_embeds_future(
            image_embeds,
            uuid,
            self._connector.fetch_image_embedding,
            "image_embeds",
        )

        placeholder = self._tracker.add("image_embeds", future)
        self._add_placeholder("image", placeholder)

    def parse_audio_embeds(
        self,
        audio_embeds: str | dict[str, str] | None,
        uuid: str | None = None,
    ) -> None:
        """Decode audio embeddings and register them under `audio_embeds`.

        The embeddings are fetched synchronously while parsing; the tracker
        receives an already-resolved future.

        Raises:
            ValueError: If `--enable-mm-embeds` is not set, or if
                `audio_embeds` has an unsupported type.
        """
        mm_config = self.model_config.get_multimodal_config()
        if not mm_config.enable_mm_embeds:
            raise ValueError(
                "You must set `--enable-mm-embeds` to input `audio_embeds`"
            )

        future = self._resolved_embeds_future(
            audio_embeds,
            uuid,
            self._connector.fetch_audio_embedding,
            "audio_embeds",
        )

        placeholder = self._tracker.add("audio_embeds", future)
        self._add_placeholder("audio", placeholder)

    def parse_image_pil(
        self,
        image_pil: Image.Image | None,
        uuid: str | None = None,
    ) -> None:
        """Register an in-memory PIL image (or `None`) as an image item."""
        future = asyncio.Future[tuple[Image.Image | None, str | None]]()
        # A falsy image is normalized to None, exactly as before.
        future.set_result((image_pil or None, uuid))

        placeholder = self._tracker.add("image", future)
        self._add_placeholder("image", placeholder)

    async def _audio_with_uuid_async(self, audio_url: str | None, uuid: str | None):
        """Fetch the audio when `audio_url` is truthy; return `(audio, uuid)`."""
        audio = (
            await self._connector.fetch_audio_async(audio_url) if audio_url else None
        )
        return audio, uuid

    def parse_audio(self, audio_url: str | None, uuid: str | None = None) -> None:
        """Register an audio fetch coroutine and add its placeholder."""
        coro = self._audio_with_uuid_async(audio_url, uuid)

        placeholder = self._tracker.add("audio", coro)
        self._add_placeholder("audio", placeholder)

    def parse_input_audio(
        self, input_audio: InputAudio | None, uuid: str | None = None
    ) -> None:
        """Convert an `input_audio` payload to a data URL and parse it as audio.

        When `input_audio` is missing or its `data` is empty, `parse_audio` is
        called with `None` as the URL.
        """
        if input_audio:
            audio_data = input_audio.get("data", "")
            audio_format = input_audio.get("format", "")
            if audio_data:
                audio_url = f"data:audio/{audio_format};base64,{audio_data}"
            else:
                # If a UUID is provided, audio data may be empty.
                audio_url = None
        else:
            audio_url = None

        return self.parse_audio(audio_url, uuid)

    async def _video_with_uuid_async(self, video_url: str | None, uuid: str | None):
        """Fetch the video when `video_url` is truthy; return `(video, uuid)`."""
        video = (
            await self._connector.fetch_video_async(video_url) if video_url else None
        )
        return video, uuid

    def parse_video(self, video_url: str | None, uuid: str | None = None) -> None:
        """Register a video fetch coroutine and add its placeholder.

        When `use_audio_in_video` is truthy in the processor kwargs and
        `video_url` is non-empty, the same URL is additionally registered as
        an audio item with the same `uuid`.
        """
        coro = self._video_with_uuid_async(video_url, uuid)

        placeholder = self._tracker.add("video", coro)
        self._add_placeholder("video", placeholder)

        # Extract audio from video if use_audio_in_video is True
        if (
            video_url
            and self._mm_processor_kwargs
            and self._mm_processor_kwargs.get("use_audio_in_video", False)
        ):
            audio_coro = self._audio_with_uuid_async(video_url, uuid)
            audio_placeholder = self._tracker.add("audio", audio_coro)
            self._add_placeholder("audio", audio_placeholder)

parse_prompt_embeds ¶

parse_prompt_embeds(data: str) -> None

Schedule async prompt embeds decode and store the coroutine in the tracker.

Like the sync variant, emits a single sentinel PROMPT_EMBEDS_PLACEHOLDER_TOKEN per content part. Unlike the sync variant, the tensor decode is deferred to a thread-pool executor via safe_load_prompt_embeds_async.

Source code in vllm/entrypoints/chat_utils.py

@override
def parse_prompt_embeds(self, data: str) -> None:
    """Queue an asynchronous decode of one `prompt_embeds` content part.

    Raises `ValueError` when prompt embeds are not enabled in the model config.
    Otherwise the un-awaited decode coroutine is registered with the tracker
    under the `"prompt_embeds"` modality, and exactly one
    `PROMPT_EMBEDS_PLACEHOLDER_TOKEN` is added as this part's placeholder.
    """
    prompt_embeds_enabled = self.model_config.enable_prompt_embeds
    if not prompt_embeds_enabled:
        raise ValueError(_ENABLE_PROMPT_EMBEDS_ERROR)

    encoded_payload = data.encode()
    pending_decode = self._load_prompt_embeds_async(encoded_payload)
    self._tracker.add("prompt_embeds", pending_decode)
    self._add_placeholder("prompt_embeds", PROMPT_EMBEDS_PLACEHOLDER_TOKEN)

AudioURL ¶

Bases: TypedDict

Source code in vllm/entrypoints/chat_utils.py

class AudioURL(TypedDict, total=False):
    """Audio reference consisting of a single required `url` key."""

    url: Required[str]
    """
    Either a URL of the audio or a data URL with base64 encoded audio data.
    """

url `instance-attribute` ¶

url: Required[str]

Either a URL of the audio or a data URL with base64 encoded audio data.

BaseMultiModalItemTracker ¶

Bases: ABC, Generic[_T]

Tracks multi-modal items in a given request and ensures that the number of multi-modal items in a given request does not exceed the configured maximum per prompt.

Source code in vllm/entrypoints/chat_utils.py

class BaseMultiModalItemTracker(ABC, Generic[_T]):
    """
    Per-request registry of multi-modal items, grouped by modality.

    Each item passed to `add` is checked against the per-prompt item limit of
    its modality before it is recorded, so a request cannot exceed the
    configured maximum.
    """

    def __init__(
        self,
        model_config: ModelConfig,
        media_io_kwargs: dict[str, dict[str, Any]] | None = None,
    ):
        super().__init__()

        self._media_io_kwargs = media_io_kwargs
        self._model_config = model_config

        # For vision_chunk items: the modality (image or video) each item was
        # originally added as, in insertion order.
        self._modality_order = defaultdict[str, list[str]](list)
        self._items_by_modality = defaultdict[str, list[_T]](list)

    @cached_property
    def use_unified_vision_chunk_modality(self) -> bool:
        """Whether the HF config opts into a unified vision_chunk modality."""
        hf_config = self._model_config.hf_config
        return getattr(hf_config, "use_unified_vision_chunk", False)

    @property
    def model_config(self) -> ModelConfig:
        return self._model_config

    @cached_property
    def model_cls(self) -> type[SupportsMultiModal]:
        # Imported lazily, inside the property, as in the original code.
        from vllm.model_executor.model_loader import get_model_cls

        return cast(type[SupportsMultiModal], get_model_cls(self.model_config))

    @property
    def media_io_kwargs(self) -> dict[str, dict[str, Any]] | None:
        # Kwargs given to the constructor win whenever they are truthy;
        # otherwise fall back to the multimodal config, if there is one.
        explicit_kwargs = self._media_io_kwargs
        if explicit_kwargs:
            return explicit_kwargs

        mm_config = self._model_config.multimodal_config
        if mm_config:
            return mm_config.media_io_kwargs
        return None

    @property
    def allowed_local_media_path(self):
        return self._model_config.allowed_local_media_path

    @property
    def allowed_media_domains(self):
        return self._model_config.allowed_media_domains

    @property
    def mm_registry(self):
        return MULTIMODAL_REGISTRY

    @cached_property
    def mm_processor(self):
        registry = self.mm_registry
        return registry.create_processor(self.model_config)

    def add(self, modality: ModalityStr, item: _T) -> str | None:
        """
        Record `item` under `modality` and return the placeholder string the
        prompt should use for it, if any.

        Note:
            `prompt_embeds` items are stored without MM-processor validation
            and yield `None`: they are pre-computed embeddings that do not go
            through any HF processor, encoder, or model-specific placeholder
            logic, and the parser manages their placeholder itself via
            `_add_placeholder`.
        """
        if modality == "prompt_embeds":
            self._items_by_modality["prompt_embeds"].append(item)
            return None

        use_vision_chunk = (
            self.use_unified_vision_chunk_modality
            and modality in ("video", "image")
        )

        if use_vision_chunk:
            # Models with a unified vision_chunk modality only accept
            # "vision_chunk", so image/video items are validated and stored
            # under that name to keep validation from failing.
            input_modality = "vision_chunk"
            storage_key = input_modality
        else:
            input_modality = modality.replace("_embeds", "")
            storage_key = modality
        num_items = len(self._items_by_modality[storage_key]) + 1

        mm_config = self.model_config.multimodal_config
        # Embedding inputs bypass the limit check when enable_mm_embeds=True
        # and the limit configured for the base modality is 0.
        embeds_bypass_limit = (
            mm_config is not None
            and mm_config.enable_mm_embeds
            and mm_config.get_limit_per_prompt(input_modality) == 0
            and modality.endswith("_embeds")
        )
        if not embeds_bypass_limit:
            self.mm_processor.info.validate_num_items(input_modality, num_items)

        self._items_by_modality[storage_key].append(item)  # type: ignore
        if use_vision_chunk:
            # Remember which modality each vision_chunk item came from.
            self._modality_order["vision_chunk"].append(modality)

        return self.model_cls.get_placeholder_str(modality, num_items)

    @abstractmethod
    def create_parser(
        self, mm_processor_kwargs: dict[str, Any] | None = None
    ) -> "BaseMultiModalContentParser":
        raise NotImplementedError

use_unified_vision_chunk_modality `cached` `property` ¶

use_unified_vision_chunk_modality: bool

Check if model uses unified vision_chunk modality for images/videos.

add ¶

add(modality: ModalityStr, item: _T) -> str | None

Adds a multi-modal item to the current prompt and returns the placeholder string to use, if any.

An optional uuid can be added which serves as a unique identifier of the media.

Note

prompt_embeds bypass MM-processor validation because they are pre-computed embeddings that do not go through any HF processor, encoder, or model-specific placeholder logic. The corresponding placeholder string is managed by the parser via _add_placeholder, so we return None here.

Source code in vllm/entrypoints/chat_utils.py

def add(self, modality: ModalityStr, item: _T) -> str | None:
    """
    Record `item` under `modality` and return the placeholder string the
    prompt should use for it, if any.

    Note:
        `prompt_embeds` items are stored without MM-processor validation
        and yield `None`: they are pre-computed embeddings that do not go
        through any HF processor, encoder, or model-specific placeholder
        logic, and the parser manages their placeholder itself via
        `_add_placeholder`.
    """
    if modality == "prompt_embeds":
        self._items_by_modality["prompt_embeds"].append(item)
        return None

    use_vision_chunk = (
        self.use_unified_vision_chunk_modality
        and modality in ("video", "image")
    )

    if use_vision_chunk:
        # Models with a unified vision_chunk modality only accept
        # "vision_chunk", so image/video items are validated and stored
        # under that name to keep validation from failing.
        input_modality = "vision_chunk"
        storage_key = input_modality
    else:
        input_modality = modality.replace("_embeds", "")
        storage_key = modality
    num_items = len(self._items_by_modality[storage_key]) + 1

    mm_config = self.model_config.multimodal_config
    # Embedding inputs bypass the limit check when enable_mm_embeds=True
    # and the limit configured for the base modality is 0.
    embeds_bypass_limit = (
        mm_config is not None
        and mm_config.enable_mm_embeds
        and mm_config.get_limit_per_prompt(input_modality) == 0
        and modality.endswith("_embeds")
    )
    if not embeds_bypass_limit:
        self.mm_processor.info.validate_num_items(input_modality, num_items)

    self._items_by_modality[storage_key].append(item)  # type: ignore
    if use_vision_chunk:
        # Remember which modality each vision_chunk item came from.
        self._modality_order["vision_chunk"].append(modality)

    return self.model_cls.get_placeholder_str(modality, num_items)

ChatCompletionContentPartAudioEmbedsParam ¶

Bases: TypedDict

Source code in vllm/entrypoints/chat_utils.py

class ChatCompletionContentPartAudioEmbedsParam(TypedDict, total=False):
    """Content part of type `audio_embeds` carrying audio embeddings."""

    audio_embeds: str | dict[str, str] | None
    """
    The audio embeddings. It can be either:
    - A single base64 string representing a serialized torch tensor.
    - A dictionary where each value is a base64 string.
    """
    type: Required[Literal["audio_embeds"]]
    """The type of the content part."""
    uuid: str | None
    """
    User-provided UUID of a media. User must guarantee that it is properly
    generated and unique for different medias.
    """

audio_embeds `instance-attribute` ¶

audio_embeds: str | dict[str, str] | None

The audio embeddings. It can be either a single base64 string representing a serialized torch tensor, or a dictionary where each value is a base64 string.

type `instance-attribute` ¶

type: Required[Literal['audio_embeds']]

The type of the content part.

uuid `instance-attribute` ¶

uuid: str | None

User-provided UUID of a media. User must guarantee that it is properly generated and unique for different medias.

ChatCompletionContentPartAudioParam ¶

Bases: TypedDict

Source code in vllm/entrypoints/chat_utils.py

class ChatCompletionContentPartAudioParam(TypedDict, total=False):
    """Content part of type `audio_url` that references audio by URL."""

    # Required. See `AudioURL`: its `url` is either a URL of the audio or a
    # data URL with base64 encoded audio data.
    audio_url: Required[AudioURL]

    type: Required[Literal["audio_url"]]
    """The type of the content part."""

type `instance-attribute` ¶

type: Required[Literal['audio_url']]

The type of the content part.

ChatCompletionContentPartImageEmbedsParam ¶

Bases: TypedDict

Source code in vllm/entrypoints/chat_utils.py

class ChatCompletionContentPartImageEmbedsParam(TypedDict, total=False):
    """Content part of type `image_embeds` carrying image embeddings."""

    image_embeds: str | dict[str, str] | None
    """
    The image embeddings. It can be either:
    - A single base64 string.
    - A dictionary where each value is a base64 string.
    """
    type: Required[Literal["image_embeds"]]
    """The type of the content part."""
    uuid: str | None
    """
    User-provided UUID of a media. User must guarantee that it is properly
    generated and unique for different medias.
    """

image_embeds `instance-attribute` ¶

image_embeds: str | dict[str, str] | None

The image embeddings. It can be either a single base64 string, or a dictionary where each value is a base64 string.

type `instance-attribute` ¶

type: Required[Literal['image_embeds']]

The type of the content part.

uuid `instance-attribute` ¶

uuid: str | None

User-provided UUID of a media. User must guarantee that it is properly generated and unique for different medias.

ChatCompletionContentPartPromptEmbedsParam ¶

Bases: TypedDict

Source code in vllm/entrypoints/chat_utils.py

class ChatCompletionContentPartPromptEmbedsParam(TypedDict, total=False):
    """Content part of type `prompt_embeds` carrying a serialized tensor."""

    data: Required[str]
    """
    Base64-encoded bytes of a serialized `torch.Tensor` of shape
    `(num_tokens, hidden_size)`. The tensor's `dtype` and `hidden_size` must
    match the model's input embedding layer.
    """
    type: Required[Literal["prompt_embeds"]]
    """The type of the content part."""

data `instance-attribute` ¶

data: Required[str]

Base64-encoded bytes of a serialized torch.Tensor of shape (num_tokens, hidden_size). The tensor's dtype and hidden_size must match the model's input embedding layer.

type `instance-attribute` ¶

type: Required[Literal['prompt_embeds']]

The type of the content part.

ChatCompletionContentPartVideoParam ¶

Bases: TypedDict

Source code in vllm/entrypoints/chat_utils.py

class ChatCompletionContentPartVideoParam(TypedDict, total=False):
    """Content part of type `video_url` that references a video."""

    # Required video reference; its structure is defined by `VideoURL`.
    video_url: Required[VideoURL]

    type: Required[Literal["video_url"]]
    """The type of the content part."""

type `instance-attribute` ¶

type: Required[Literal['video_url']]

The type of the content part.

ChatTemplateResolutionError ¶

Bases: ValueError

Raised when chat template resolution fails.

This is a subclass of ValueError for backward compatibility with existing exception handlers.

Source code in vllm/entrypoints/chat_utils.py

class ChatTemplateResolutionError(ValueError):
    """Raised when chat template resolution fails.

    This is a subclass of ValueError for backward compatibility with
    existing exception handlers.
    """

    # Adds no attributes or methods of its own; only the type is distinct.

ConversationMessage ¶

Bases: TypedDict

Source code in vllm/entrypoints/chat_utils.py

class ConversationMessage(TypedDict, total=False):
    """A single conversation message; only `role` is a required key."""

    role: Required[str]
    """The role of the message's author."""

    content: str | None | list[dict[str, str]]
    """The contents of the message"""

    tool_call_id: str | None
    """Tool call that this message is responding to."""

    name: str | None
    """The name of the function to call"""

    tool_calls: list[ChatCompletionMessageToolCallParam] | None
    """The tool calls generated by the model, such as function calls."""

    reasoning: str | None
    """The reasoning content for interleaved thinking."""

    reasoning_content: str | None
    """Deprecated: The reasoning content for interleaved thinking."""

    tools: list[ChatCompletionFunctionToolParam] | None
    """The tools for developer role."""

    task: str | None
    """Model-specific task marker. Currently passed through for DeepSeek V4."""

content `instance-attribute` ¶

content: str | None | list[dict[str, str]]

The contents of the message

name `instance-attribute` ¶

name: str | None

The name of the function to call

reasoning `instance-attribute` ¶

reasoning: str | None

The reasoning content for interleaved thinking.

reasoning_content `instance-attribute` ¶

reasoning_content: str | None

Deprecated: The reasoning content for interleaved thinking.

role `instance-attribute` ¶

role: Required[str]

The role of the message's author.

task `instance-attribute` ¶

task: str | None

Model-specific task marker. Currently passed through for DeepSeek V4.

tool_call_id `instance-attribute` ¶

tool_call_id: str | None

Tool call that this message is responding to.

tool_calls `instance-attribute` ¶

tool_calls: list[ChatCompletionMessageToolCallParam] | None

The tool calls generated by the model, such as function calls.

tools `instance-attribute` ¶

tools: list[ChatCompletionFunctionToolParam] | None

The tools for developer role.

CustomChatCompletionContentPILImageParam ¶

Bases: TypedDict

A simpler version of the param that only accepts a PIL image.

Example: { "image_pil": ImageAsset('cherry_blossom').pil_image }

Source code in vllm/entrypoints/chat_utils.py

class CustomChatCompletionContentPILImageParam(TypedDict, total=False):
    """A simpler version of the param that only accepts a PIL image.

    Example:
    {
        "image_pil": ImageAsset('cherry_blossom').pil_image
    }
    """

    # The PIL image itself; the annotation also permits None.
    image_pil: PILImage | None
    uuid: str | None
    """
    User-provided UUID of a media. User must guarantee that it is properly
    generated and unique for different medias.
    """

uuid `instance-attribute` ¶

uuid: str | None

User-provided UUID of a media. User must guarantee that it is properly generated and unique for different medias.

CustomChatCompletionContentSimpleAudioParam ¶

Bases: TypedDict

A simpler version of the param that only accepts a plain audio_url.

Example: { "audio_url": "https://example.com/audio.mp3" }

Source code in vllm/entrypoints/chat_utils.py

class CustomChatCompletionContentSimpleAudioParam(TypedDict, total=False):
    """A simpler version of the param that only accepts a plain audio_url.

    Example:
    {
        "audio_url": "https://example.com/audio.mp3"
    }
    """

    # Plain URL string of the audio; the annotation also permits None.
    audio_url: str | None

CustomChatCompletionContentSimpleImageParam ¶

Bases: TypedDict

A simpler version of the param that only accepts a plain image_url. This is supported by OpenAI API, although it is not documented.

Example: { "image_url": "https://example.com/image.jpg" }

Source code in vllm/entrypoints/chat_utils.py

class CustomChatCompletionContentSimpleImageParam(TypedDict, total=False):
    """A simpler version of the param that only accepts a plain image_url.
    This is supported by OpenAI API, although it is not documented.

    Example:
    {
        "image_url": "https://example.com/image.jpg"
    }
    """

    # Plain URL string of the image; the annotation also permits None.
    image_url: str | None
    uuid: str | None
    """
    User-provided UUID of a media. User must guarantee that it is properly
    generated and unique for different medias.
    """

uuid `instance-attribute` ¶

uuid: str | None

User-provided UUID of a media. User must guarantee that it is properly generated and unique for different medias.

CustomChatCompletionContentSimpleVideoParam ¶

Bases: TypedDict

A simpler version of the param that only accepts a plain video_url.

Example: { "video_url": "https://example.com/video.mp4" }

Source code in vllm/entrypoints/chat_utils.py

class CustomChatCompletionContentSimpleVideoParam(TypedDict, total=False):
    """A simpler version of the param that only accepts a plain audio_url.

    Example:
    {
        "video_url": "https://example.com/video.mp4"
    }
    """

    video_url: str | None
    uuid: str | None
    """
    User-provided UUID of a media. User must guarantee that it is properly
    generated and unique for different medias.
    """

uuid `instance-attribute` ¶

uuid: str | None

User-provided UUID of a media. User must guarantee that it is properly generated and unique for different medias.

CustomChatCompletionContentToolReferenceParam ¶

Bases: TypedDict

A tool reference content param that only accepts a plain tool name.

Example: { "name": "get_weather", "type": "tool_reference" }

Source code in vllm/entrypoints/chat_utils.py

class CustomChatCompletionContentToolReferenceParam(TypedDict, total=False):
    """A tool reference content param that only accepts a plain tool name.

    Example:
    {
        "name": "get_weather",
        "type": "tool_reference"
    }
    """

    # With total=False and no Required[...] marker, both keys are optional
    # as far as static type checking is concerned.
    name: str
    """The name of the tool being referenced."""

    type: Literal["tool_reference"]
    """The content type."""

name `instance-attribute` ¶

name: str

The name of the tool being referenced.

type `instance-attribute` ¶

type: Literal['tool_reference']

The content type.

CustomChatCompletionMessageParam ¶

Bases: TypedDict

Enables custom roles in the Chat Completion API.

Source code in vllm/entrypoints/chat_utils.py

class CustomChatCompletionMessageParam(TypedDict, total=False):
    """Enables custom roles in the Chat Completion API."""

    # total=False: `role` is the only key marked Required; all others may be
    # omitted.
    role: Required[str]
    """The role of the message's author."""

    content: str | list[ChatCompletionContentPartParam]
    """The contents of the message."""

    name: str
    """An optional name for the participant.

    Provides the model information to differentiate between participants of the
    same role.
    """

    tool_call_id: str | None
    """Tool call that this message is responding to."""

    tool_calls: list[ChatCompletionMessageToolCallParam] | None
    """The tool calls generated by the model, such as function calls."""

    reasoning: str | None
    """The reasoning content for interleaved thinking."""

    tools: list[ChatCompletionFunctionToolParam] | None
    """The tools for developer role."""

    task: str | None
    """Model-specific task marker. Currently passed through for DeepSeek V4."""

content `instance-attribute` ¶

content: str | list[ChatCompletionContentPartParam]

The contents of the message.

name `instance-attribute` ¶

name: str

An optional name for the participant.

Provides the model information to differentiate between participants of the same role.

reasoning `instance-attribute` ¶

reasoning: str | None

The reasoning content for interleaved thinking.

role `instance-attribute` ¶

role: Required[str]

The role of the message's author.

task `instance-attribute` ¶

task: str | None

Model-specific task marker. Currently passed through for DeepSeek V4.

tool_call_id `instance-attribute` ¶

tool_call_id: str | None

Tool call that this message is responding to.

tool_calls `instance-attribute` ¶

tool_calls: list[ChatCompletionMessageToolCallParam] | None

The tool calls generated by the model, such as function calls.

tools `instance-attribute` ¶

tools: list[ChatCompletionFunctionToolParam] | None

The tools for developer role.

CustomThinkCompletionContentParam ¶

Bases: TypedDict

A Think Completion Content Param that accepts a plain text and a boolean.

Example: { "thinking": "I am thinking about the answer", "closed": True, "type": "thinking" }

Source code in vllm/entrypoints/chat_utils.py

class CustomThinkCompletionContentParam(TypedDict, total=False):
    """A Think Completion Content Param that accepts a plain text and a boolean.

    Example:
    {
        "thinking": "I am thinking about the answer",
        "closed": True,
        "type": "thinking"
    }
    """

    thinking: Required[str]
    """The thinking content."""

    # Not marked Required, so this key may be omitted (total=False).
    closed: bool
    """Whether the thinking is closed."""

    type: Required[Literal["thinking"]]
    """The thinking type."""

closed `instance-attribute` ¶

closed: bool

Whether the thinking is closed.

thinking `instance-attribute` ¶

thinking: Required[str]

The thinking content.

type `instance-attribute` ¶

type: Required[Literal['thinking']]

The thinking type.

MultiModalContentParser ¶

Bases: BaseMultiModalContentParser

Source code in vllm/entrypoints/chat_utils.py

class MultiModalContentParser(BaseMultiModalContentParser):
    """Content parser that resolves every media part immediately.

    Each `parse_*` method obtains its payload (through the configured media
    connector, or directly from the caller), records it on the
    `MultiModalItemTracker` and registers the placeholder for that item.
    """

    def __init__(
        self,
        tracker: MultiModalItemTracker,
        mm_processor_kwargs: dict[str, Any] | None = None,
    ) -> None:
        """Bind the parser to a tracker and build its media connector.

        Args:
            tracker: Receives every parsed item; also supplies the media IO
                settings and the local-path / domain allow-lists used to
                configure the connector.
            mm_processor_kwargs: Optional processor kwargs. Only
                `use_audio_in_video` is read by this class (in `parse_video`).
        """
        super().__init__()

        self._tracker = tracker

        self._connector: MediaConnector = MEDIA_CONNECTOR_REGISTRY.load(
            envs.VLLM_MEDIA_CONNECTOR,
            media_io_kwargs=tracker.media_io_kwargs,
            allowed_local_media_path=tracker.allowed_local_media_path,
            allowed_media_domains=tracker.allowed_media_domains,
        )

        self._mm_processor_kwargs = mm_processor_kwargs

    @property
    def model_config(self) -> ModelConfig:
        """The model configuration held by the underlying tracker."""
        return self._tracker.model_config

    @override
    def parse_prompt_embeds(self, data: str) -> None:
        """Decode a base64 prompt embeds tensor and store it in the tracker.

        Emits a single `PROMPT_EMBEDS_PLACEHOLDER_TOKEN` sentinel per
        content part. The renderer later expands each sentinel to a span of
        `tensor.shape[0]` placeholder tokens after tokenization.

        Raises:
            ValueError: If prompt embeds are not enabled in the model config.
        """
        if not self.model_config.enable_prompt_embeds:
            raise ValueError(_ENABLE_PROMPT_EMBEDS_ERROR)

        tensor = safe_load_prompt_embeds(self.model_config, data.encode())
        self._tracker.add("prompt_embeds", (tensor, None))
        self._add_placeholder("prompt_embeds", PROMPT_EMBEDS_PLACEHOLDER_TOKEN)

    def parse_image(self, image_url: str | None, uuid: str | None = None) -> None:
        """Fetch an image by URL and track it.

        A falsy `image_url` is tracked as `None` together with `uuid`.
        """
        image = self._connector.fetch_image(image_url) if image_url else None

        placeholder = self._tracker.add("image", (image, uuid))
        self._add_placeholder("image", placeholder)

    def parse_image_embeds(
        self,
        image_embeds: str | dict[str, str] | None,
        uuid: str | None = None,
    ) -> None:
        """Load image embeddings and track them under `image_embeds`.

        Args:
            image_embeds: One encoded embedding, a dict of encoded embeddings
                (each value is loaded separately), or `None` to track an
                empty entry alongside `uuid`.
            uuid: Optional identifier stored with the item.

        Raises:
            ValueError: If multimodal embedding inputs are not enabled.
            TypeError: If `image_embeds` is none of the supported types.
        """
        mm_config = self.model_config.get_multimodal_config()
        if not mm_config.enable_mm_embeds:
            raise ValueError(
                "You must set `--enable-mm-embeds` to input `image_embeds`"
            )

        # The branches are mutually exclusive. Using one chain with an
        # explicit fallback guarantees `placeholder` is always bound instead
        # of surfacing an UnboundLocalError for an unsupported input type.
        if isinstance(image_embeds, dict):
            embeds = {
                k: self._connector.fetch_image_embedding(v)
                for k, v in image_embeds.items()
            }
            placeholder = self._tracker.add("image_embeds", (embeds, uuid))
        elif isinstance(image_embeds, str):
            embedding = self._connector.fetch_image_embedding(image_embeds)
            placeholder = self._tracker.add("image_embeds", (embedding, uuid))
        elif image_embeds is None:
            placeholder = self._tracker.add("image_embeds", (None, uuid))
        else:
            raise TypeError(
                "`image_embeds` must be a str, a dict of str, or None, "
                f"got {type(image_embeds).__name__}"
            )

        self._add_placeholder("image", placeholder)

    def parse_audio_embeds(
        self,
        audio_embeds: str | dict[str, str] | None,
        uuid: str | None = None,
    ) -> None:
        """Load audio embeddings and track them under `audio_embeds`.

        Args:
            audio_embeds: One encoded embedding, a dict of encoded embeddings
                (each value is loaded separately), or any other value, which
                is tracked as `None` alongside `uuid`.
            uuid: Optional identifier stored with the item.

        Raises:
            ValueError: If multimodal embedding inputs are not enabled.
        """
        mm_config = self.model_config.get_multimodal_config()
        if not mm_config.enable_mm_embeds:
            raise ValueError(
                "You must set `--enable-mm-embeds` to input `audio_embeds`"
            )

        if isinstance(audio_embeds, dict):
            embeds = {
                k: self._connector.fetch_audio_embedding(v)
                for k, v in audio_embeds.items()
            }
            placeholder = self._tracker.add("audio_embeds", (embeds, uuid))
        elif isinstance(audio_embeds, str):
            embedding = self._connector.fetch_audio_embedding(audio_embeds)
            placeholder = self._tracker.add("audio_embeds", (embedding, uuid))
        else:
            placeholder = self._tracker.add("audio_embeds", (None, uuid))

        self._add_placeholder("audio", placeholder)

    def parse_image_pil(
        self, image_pil: Image.Image | None, uuid: str | None = None
    ) -> None:
        """Track an already-loaded PIL image (or `None`) as an image item."""
        placeholder = self._tracker.add("image", (image_pil, uuid))
        self._add_placeholder("image", placeholder)

    def parse_audio(self, audio_url: str | None, uuid: str | None = None) -> None:
        """Fetch audio by URL and track it.

        A falsy `audio_url` is tracked as `None` together with `uuid`.
        """
        audio = self._connector.fetch_audio(audio_url) if audio_url else None

        placeholder = self._tracker.add("audio", (audio, uuid))
        self._add_placeholder("audio", placeholder)

    def parse_input_audio(
        self, input_audio: InputAudio | None, uuid: str | None = None
    ) -> None:
        """Convert an `input_audio` payload to a data URL and parse it as audio.

        The `data` and `format` entries are combined into a
        `data:audio/<format>;base64,<data>` URL. When the payload is missing
        or carries no data, `parse_audio` is called with `None`.
        """
        if input_audio:
            audio_data = input_audio.get("data", "")
            audio_format = input_audio.get("format", "")
            if audio_data:
                audio_url = f"data:audio/{audio_format};base64,{audio_data}"
            else:
                # If a UUID is provided, audio data may be empty.
                audio_url = None
        else:
            audio_url = None

        return self.parse_audio(audio_url, uuid)

    def parse_video(self, video_url: str | None, uuid: str | None = None) -> None:
        """Fetch a video by URL and track it, optionally with its audio.

        When `use_audio_in_video` is truthy in the processor kwargs and a URL
        was given, audio is additionally fetched from the same URL and
        tracked as a separate audio item that shares `uuid`.
        """
        video = self._connector.fetch_video(video_url=video_url) if video_url else None

        placeholder = self._tracker.add("video", (video, uuid))
        self._add_placeholder("video", placeholder)

        # Extract audio from video if use_audio_in_video is True
        if (
            video_url
            and self._mm_processor_kwargs
            and self._mm_processor_kwargs.get("use_audio_in_video", False)
        ):
            # `video_url` is known to be truthy here, so fetch unconditionally.
            audio = self._connector.fetch_audio(video_url)
            audio_placeholder = self._tracker.add("audio", (audio, uuid))
            self._add_placeholder("audio", audio_placeholder)

parse_prompt_embeds ¶

parse_prompt_embeds(data: str) -> None

Decode a base64 prompt embeds tensor and store it in the tracker.

Emits a single PROMPT_EMBEDS_PLACEHOLDER_TOKEN sentinel per content part. The renderer later expands each sentinel to a span of tensor.shape[0] placeholder tokens after tokenization.

Source code in vllm/entrypoints/chat_utils.py

@override
def parse_prompt_embeds(self, data: str) -> None:
    """Load base64-encoded prompt embeddings and register them with the tracker.

    Exactly one `PROMPT_EMBEDS_PLACEHOLDER_TOKEN` sentinel is emitted for the
    content part; after tokenization the renderer widens that sentinel into
    `tensor.shape[0]` placeholder tokens.

    Raises:
        ValueError: If prompt embeds are not enabled in the model config.
    """
    config = self.model_config
    if not config.enable_prompt_embeds:
        raise ValueError(_ENABLE_PROMPT_EMBEDS_ERROR)

    # Prompt embeds carry no uuid, hence the `None` in the tracked pair.
    decoded = safe_load_prompt_embeds(config, data.encode())
    self._tracker.add("prompt_embeds", (decoded, None))
    self._add_placeholder("prompt_embeds", PROMPT_EMBEDS_PLACEHOLDER_TOKEN)

PILImage ¶

Bases: BaseModel

A PIL.Image.Image object.

Source code in vllm/entrypoints/chat_utils.py

class PILImage(BaseModel):
    """
    A PIL.Image.Image object.

    Pydantic model with a single field, `image_pil`, holding the image.
    """

    image_pil: Image.Image
    # `Image.Image` is not a pydantic-native type, so arbitrary types must be
    # allowed for the field above to be accepted.
    model_config = ConfigDict(arbitrary_types_allowed=True)

VideoURL ¶

Bases: TypedDict

Source code in vllm/entrypoints/chat_utils.py

class VideoURL(TypedDict, total=False):
    """Dictionary form of a video reference; `url` is its only declared key."""

    url: Required[str]
    """
    Either a URL of the video or a data URL with base64 encoded video data.
    """

url `instance-attribute` ¶

url: Required[str]

Either a URL of the video or a data URL with base64 encoded video data.

_get_full_multimodal_text_prompt ¶

_get_full_multimodal_text_prompt(
    placeholder_storage: dict[str, list],
    texts: list[str],
    interleave_strings: bool,
    multimodal_content_part_separator: str = "\n",
) -> str

Combine multimodal prompts for a multimodal language model.

Source code in vllm/entrypoints/chat_utils.py

def _get_full_multimodal_text_prompt(
    placeholder_storage: dict[str, list],
    texts: list[str],
    interleave_strings: bool,
    multimodal_content_part_separator: str = "\n",
) -> str:
    """Combine multimodal prompts for a multimodal language model."""

    # flatten storage to make it looks like
    # {
    #   "<|image|>": 2,
    #   "<|audio|>": 1
    # }
    placeholder_counts = Counter(
        [v for elem in placeholder_storage.values() for v in elem]
    )

    if interleave_strings:
        text_prompt = _get_interleaved_text_prompt(placeholder_storage, texts)
    else:
        text_prompt = "\n".join(texts)

    # Pass interleaved text further in case the user used image placeholders
    # himself, but forgot to disable the 'interleave_strings' flag

    # Look through the text prompt to check for missing placeholders
    missing_placeholders: list[str] = []
    for placeholder in placeholder_counts:
        # For any existing placeholder in the text prompt, we leave it as is
        placeholder_counts[placeholder] -= text_prompt.count(placeholder)

        if placeholder_counts[placeholder] < 0:
            logger.error(
                "Placeholder count is negative! "
                "Ensure that the 'interleave_strings' flag is disabled "
                "(current value: %s) "
                "when manually placing image placeholders.",
                interleave_strings,
            )
            logger.debug("Input prompt: %s", text_prompt)
            raise ValueError(
                f"Found more '{placeholder}' placeholders in input prompt than "
                "actual multimodal data items."
            )

        missing_placeholders.extend([placeholder] * placeholder_counts[placeholder])

    # NOTE: Default behaviour: we always add missing placeholders
    # at the front of the prompt, if interleave_strings=False
    if text_prompt:
        return multimodal_content_part_separator.join(
            missing_placeholders + [text_prompt]
        )
    else:
        return multimodal_content_part_separator.join(missing_placeholders)

_parse_chat_message_content_mm_part ¶

_parse_chat_message_content_mm_part(
    part: ChatCompletionContentPartParam,
) -> tuple[str, _ContentPart]

Parses a given multi-modal content part based on its type.

Parameters:

Name	Type	Description	Default
`part`	`ChatCompletionContentPartParam`	A dict containing the content part, with a potential 'type' field.	required

Returns:

Type	Description
`tuple[str, _ContentPart]`	A tuple (part_type, content) where part_type is the type of the part (e.g., 'text', 'image_url') and content is the parsed content (e.g., text, image URL).

Raises:

Type	Description
`ValueError`	If the 'type' field is missing and no direct URL is found.

Source code in vllm/entrypoints/chat_utils.py

def _parse_chat_message_content_mm_part(
    part: ChatCompletionContentPartParam,
) -> tuple[str, _ContentPart]:
    """
    Parses a given multi-modal content part based on its type.

    Args:
        part: A dict containing the content part, with a potential 'type' field.

    Returns:
        A tuple (part_type, content) where:
        - part_type: Type of the part (e.g., 'text', 'image_url').
        - content: Parsed content (e.g., text, image URL).

    Raises:
        ValueError: If the 'type' field is missing and no direct URL is found.
    """
    assert isinstance(
        part, dict
    )  # This is needed to avoid mypy errors: part.get() from str
    part_type = part.get("type", None)
    uuid = part.get("uuid", None)

    if isinstance(part_type, str) and part_type in MM_PARSER_MAP and uuid is None:  # noqa: E501
        content = MM_PARSER_MAP[part_type](part)

        # Special case for 'image_url.detail'
        # We only support 'auto', which is the default
        if part_type == "image_url" and part.get("detail", "auto") != "auto":
            logger.warning(
                "'image_url.detail' is currently not supported and will be ignored."
            )

        return part_type, content

    # Handle missing 'type' but provided direct URL fields.
    # 'type' is required field by pydantic
    if part_type is None or uuid is not None:
        if "image_url" in part:
            image_params = cast(CustomChatCompletionContentSimpleImageParam, part)
            image_url = image_params.get("image_url", None)
            if isinstance(image_url, dict):
                # Can potentially happen if user provides a uuid
                # with url as a dict of {"url": url}
                image_url = image_url.get("url", None)
            return "image_url", image_url
        if "image_pil" in part:
            # "image_pil" could be None if UUID is provided.
            image_params = cast(  # type: ignore
                CustomChatCompletionContentPILImageParam, part
            )
            image_pil = image_params.get("image_pil", None)
            return "image_pil", image_pil
        if "image_embeds" in part:
            # "image_embeds" could be None if UUID is provided.
            image_params = cast(  # type: ignore
                ChatCompletionContentPartImageEmbedsParam, part
            )
            image_embeds = image_params.get("image_embeds", None)
            return "image_embeds", image_embeds
        if "audio_embeds" in part:
            # "audio_embeds" could be None if UUID is provided.
            audio_params = cast(  # type: ignore[assignment]
                ChatCompletionContentPartAudioEmbedsParam, part
            )
            audio_embeds = audio_params.get("audio_embeds", None)
            return "audio_embeds", audio_embeds
        if "prompt_embeds" in part:
            prompt_embeds_params = cast(  # type: ignore[assignment]
                ChatCompletionContentPartPromptEmbedsParam, part
            )
            return "prompt_embeds", prompt_embeds_params.get("data", None)
        if "audio_url" in part:
            audio_params = cast(  # type: ignore[assignment]
                CustomChatCompletionContentSimpleAudioParam, part
            )
            audio_url = audio_params.get("audio_url", None)
            if isinstance(audio_url, dict):
                # Can potentially happen if user provides a uuid
                # with url as a dict of {"url": url}
                audio_url = audio_url.get("url", None)
            return "audio_url", audio_url
        if part.get("input_audio") is not None:
            input_audio_params = cast(dict[str, str], part)
            return "input_audio", input_audio_params
        if "video_url" in part:
            video_params = cast(CustomChatCompletionContentSimpleVideoParam, part)
            video_url = video_params.get("video_url", None)
            if isinstance(video_url, dict):
                # Can potentially happen if user provides a uuid
                # with url as a dict of {"url": url}
                video_url = video_url.get("url", None)
            return "video_url", video_url
        if "tool_reference" in part:
            tool_reference_params = cast(
                CustomChatCompletionContentToolReferenceParam, part
            )
            tool_reference = tool_reference_params.get("name", None)
            return "tool_reference", tool_reference
        # Raise an error if no 'type' or direct URL is found.
        raise ValueError("Missing 'type' field in multimodal part.")

    if not isinstance(part_type, str):
        raise ValueError("Invalid 'type' field in multimodal part.")
    return part_type, "unknown part_type content"

_parse_chat_message_content_part ¶

_parse_chat_message_content_part(
    part: ChatCompletionContentPartParam,
    mm_parser: BaseMultiModalContentParser,
    *,
    wrap_dicts: bool,
    interleave_strings: bool,
) -> _ContentPart | None

Parses a single part of a conversation. If wrap_dicts is True, structured dictionary pieces for texts and images will be wrapped in dictionaries, i.e., {"type": "text", "text": ...} and {"type": "image"}, respectively. Otherwise, multimodal data will be handled by mm_parser, and texts will be returned as strings to be joined with multimodal placeholders.

Source code in vllm/entrypoints/chat_utils.py

def _parse_chat_message_content_part(
    part: ChatCompletionContentPartParam,
    mm_parser: BaseMultiModalContentParser,
    *,
    wrap_dicts: bool,
    interleave_strings: bool,
) -> _ContentPart | None:
    """Parse one content part of a conversation message.

    With `wrap_dicts=True`, text comes back as `{"type": "text", "text": ...}`
    and media as `{"type": <modality>}`. Otherwise text comes back as a plain
    string, media is handed to `mm_parser`, and the modality placeholder is
    returned only when `interleave_strings` is set (else `None`).

    Prompt-embeds parts always yield `PROMPT_EMBEDS_PLACEHOLDER_TOKEN` (wrapped
    as text when `wrap_dicts` is set), and tool references are passed through
    without touching `mm_parser`.

    Raises:
        ValueError: If a text part contains the reserved placeholder while
            prompt embeds are enabled, or a prompt-embeds part has no data.
        VLLMValidationError: If the part type is not supported.
    """
    # Bare strings are plain text parts.
    if isinstance(part, str):
        _reject_reserved_placeholder_in_text(part, mm_parser.model_config)
        return {"type": "text", "text": part} if wrap_dicts else part

    # Everything else is a structured dictionary part.
    part_type, content = _parse_chat_message_content_mm_part(part)

    # Types such as text/refusal/image_url/audio_url/video_url/input_audio
    # are dropped with a warning when nothing could be extracted from them.
    if part_type in PART_TYPES_TO_SKIP_NONE_CONTENT and content is None:
        logger.warning(
            "Skipping multimodal part '%s' (type: '%s') "
            "with empty / unparsable content.",
            part,
            part_type,
        )
        return None

    if part_type in ("text", "input_text", "output_text", "refusal", "thinking"):
        text = cast(str, content)
        _reject_reserved_placeholder_in_text(text, mm_parser.model_config)
        return {"type": "text", "text": text} if wrap_dicts else text

    # A caller-provided uuid is normalised to `str`; otherwise it stays None.
    item_uuid = part.get("uuid", None)
    if item_uuid is not None:
        item_uuid = str(item_uuid)

    if part_type == "tool_reference":
        # Tool references are not multimodal data — they reference deferred
        # tools and are passed through as-is for the chat template to expand.
        tool_name = cast(str, content)
        return {"type": "tool_reference", "name": tool_name} if wrap_dicts else tool_name

    if part_type == "prompt_embeds":
        if not content:
            raise ValueError(_PROMPT_EMBEDS_MISSING_DATA_ERROR)
        mm_parser.parse_prompt_embeds(cast(str, content))
        if wrap_dicts:
            # Chat templates don't know about the "prompt_embeds" modality,
            # so the single sentinel is emitted as text and rendered inline.
            # The renderer later expands it to N tokens post-tokenize.
            return {"type": "text", "text": PROMPT_EMBEDS_PLACEHOLDER_TOKEN}
        # The sentinel is returned inline regardless of `interleave_strings`:
        # prompt_embeds are spliced at the token offset, so falling back to
        # front-padding would reorder them relative to surrounding text.
        return PROMPT_EMBEDS_PLACEHOLDER_TOKEN

    # Remaining supported types are media handled by the parser.
    if part_type == "image_pil":
        pil_image = None if content is None else cast(Image.Image, content)
        mm_parser.parse_image_pil(pil_image, item_uuid)
        modality = "image"
    elif part_type in ("image_url", "input_image"):
        mm_parser.parse_image(cast(str, content), item_uuid)
        modality = "image"
    elif part_type == "image_embeds":
        image_embeds = (
            None if content is None else cast(str | dict[str, str], content)
        )
        mm_parser.parse_image_embeds(image_embeds, item_uuid)
        modality = "image"
    elif part_type == "audio_embeds":
        audio_embeds = (
            None if content is None else cast(str | dict[str, str], content)
        )
        mm_parser.parse_audio_embeds(audio_embeds, item_uuid)
        modality = "audio"
    elif part_type == "audio_url":
        mm_parser.parse_audio(cast(str, content), item_uuid)
        modality = "audio"
    elif part_type == "input_audio":
        mm_parser.parse_input_audio(cast(InputAudio, content), item_uuid)
        modality = "audio"
    elif part_type == "video_url":
        mm_parser.parse_video(cast(str, content), item_uuid)
        modality = "video"
    else:
        supported = sorted(MM_PARSER_MAP.keys() | set(PART_TYPES_TO_SKIP_NONE_CONTENT))
        raise VLLMValidationError(
            f"Unsupported chat content part type: {part_type!r}. "
            f"Supported types: {', '.join(supported)}.",
            parameter="type",
            value=part_type,
        )

    if wrap_dicts:
        return {"type": modality}
    if not interleave_strings:
        return None
    return MODALITY_PLACEHOLDERS_MAP[modality]

_reject_reserved_placeholder_in_text ¶

_reject_reserved_placeholder_in_text(
    text: str, model_config: ModelConfig
) -> None

Reject user-supplied text parts that contain the reserved prompt_embeds placeholder sentinel.

When the server accepts prompt_embeds, the placeholder token is registered as a single unsplittable special token on the tokenizer. Any user text that happens to contain the literal sequence would tokenize to the same ID and be mistaken for a splice point by the renderer, letting a caller move or inject splice positions via plain text content.

Source code in vllm/entrypoints/chat_utils.py

def _reject_reserved_placeholder_in_text(text: str, model_config: ModelConfig) -> None:
    """Reject user-supplied text parts that contains the reserved `prompt_embeds`
    placeholder sentinel.

    When the server accepts `prompt_embeds`, the placeholder token is
    registered as a single unsplittable special token on the tokenizer. Any
    user text that happens to contain the literal sequence would tokenize to
    the same ID and be mistaken for a splice point by the renderer, letting a
    caller move or inject splice positions via plain text content.
    """
    if model_config.enable_prompt_embeds and PROMPT_EMBEDS_PLACEHOLDER_TOKEN in text:
        raise ValueError(
            _RESERVED_PLACEHOLDER_IN_TEXT_ERROR.format(
                token=PROMPT_EMBEDS_PLACEHOLDER_TOKEN
            )
        )

_resolve_items ¶

_resolve_items(
    items_by_modality: dict[
        str, list[tuple[object, str | None]]
    ],
    mm_processor: BaseMultiModalProcessor | None,
    modality_order: dict[str, list[str]],
) -> tuple[MultiModalDataDict, MultiModalUUIDDict]

Materialize the tracker's per-modality items into mm_data / mm_uuids.

Note

mm_processor is None for text-only models (no registered HF processor) whose only modality is prompt_embeds. Every other modality requires a processor, enforced by the guard below.

Source code in vllm/entrypoints/chat_utils.py

def _resolve_items(
    items_by_modality: dict[str, list[tuple[object, str | None]]],
    mm_processor: BaseMultiModalProcessor | None,
    modality_order: dict[str, list[str]],
) -> tuple[MultiModalDataDict, MultiModalUUIDDict]:
    """
    Materialize the tracker's per-modality items into `mm_data` / `mm_uuids`.

    Note:
        `mm_processor` is `None` for text-only models (no registered HF
        processor) whose only modality is `prompt_embeds`. Every other
        modality requires a processor, enforced by the guard below.
    """
    if "image" in items_by_modality and "image_embeds" in items_by_modality:
        raise ValueError("Mixing raw image and embedding inputs is not allowed")
    if "audio" in items_by_modality and "audio_embeds" in items_by_modality:
        raise ValueError("Mixing raw audio and embedding inputs is not allowed")
    # `prompt_embeds` bypasses HF MM processors. Every other modality requires one.
    processor_modalities = items_by_modality.keys() - {"prompt_embeds"}
    if processor_modalities and mm_processor is None:
        raise RuntimeError(
            _REQUIRE_MM_PROCESSOR_ERROR.format(modality=processor_modalities)
        )

    mm_data = {}
    mm_uuids = {}
    if "image_embeds" in items_by_modality:
        assert mm_processor is not None
        mm_data["image"] = _get_embeds_data(
            "image",
            [data for data, uuid in items_by_modality["image_embeds"]],
            mm_processor,
        )
        mm_uuids["image"] = [uuid for data, uuid in items_by_modality["image_embeds"]]
    if "image" in items_by_modality:
        mm_data["image"] = [data for data, uuid in items_by_modality["image"]]
        mm_uuids["image"] = [uuid for data, uuid in items_by_modality["image"]]
    if "audio_embeds" in items_by_modality:
        assert mm_processor is not None
        mm_data["audio"] = _get_embeds_data(
            "audio",
            [data for data, uuid in items_by_modality["audio_embeds"]],
            mm_processor,
        )
        mm_uuids["audio"] = [uuid for data, uuid in items_by_modality["audio_embeds"]]
    if "audio" in items_by_modality:
        mm_data["audio"] = [data for data, uuid in items_by_modality["audio"]]
        mm_uuids["audio"] = [uuid for data, uuid in items_by_modality["audio"]]
    if "video" in items_by_modality:
        mm_data["video"] = [data for data, uuid in items_by_modality["video"]]
        mm_uuids["video"] = [uuid for data, uuid in items_by_modality["video"]]
    if "vision_chunk" in items_by_modality:
        assert mm_processor is not None
        # Process vision_chunk items - extract from (data, modality) tuples
        # and convert to VisionChunk types with proper UUID handling
        processed_chunks, vision_chunk_uuids = _resolve_vision_chunk_items(
            items_by_modality["vision_chunk"],
            mm_processor,
            modality_order.get("vision_chunk", []),
        )
        mm_data["vision_chunk"] = processed_chunks
        mm_uuids["vision_chunk"] = vision_chunk_uuids
    if "prompt_embeds" in items_by_modality:
        mm_data["prompt_embeds"] = [
            data for data, _uuid in items_by_modality["prompt_embeds"]
        ]

    return mm_data, mm_uuids

get_tool_call_id_type ¶

get_tool_call_id_type(model_config: ModelConfig) -> str

Return the tool-call ID type for a given model configuration.

Source code in vllm/entrypoints/chat_utils.py

def get_tool_call_id_type(model_config: ModelConfig) -> str:
    """Pick the tool-call ID type for a model configuration.

    Returns `"kimi_k2"` when the model type is one of `_KIMI_MODEL_TYPES`,
    either in the HF text config or in a dict-valued `hf_overrides`;
    otherwise returns `"random"`.
    """
    overrides = getattr(model_config, "hf_overrides", None)

    if model_config.hf_text_config.model_type in _KIMI_MODEL_TYPES:
        return "kimi_k2"

    # Overrides are only consulted when they are a plain dict.
    if isinstance(overrides, dict) and (
        overrides.get("model_type") in _KIMI_MODEL_TYPES
    ):
        return "kimi_k2"

    return "random"

validate_chat_template ¶

validate_chat_template(chat_template: Path | str | None)

Raises if the provided chat template appears invalid.

Source code in vllm/entrypoints/chat_utils.py

def validate_chat_template(chat_template: Path | str | None):
    """Raises if the provided chat template appears invalid."""
    if chat_template is None:
        return

    elif isinstance(chat_template, Path) and not chat_template.exists():
        raise FileNotFoundError("the supplied chat template path doesn't exist")

    elif isinstance(chat_template, str):
        JINJA_CHARS = "{}\n"
        if (
            not any(c in chat_template for c in JINJA_CHARS)
            and not Path(chat_template).exists()
        ):
            # Try to find the template in the built-in templates directory
            from vllm.transformers_utils.chat_templates.registry import (
                CHAT_TEMPLATES_DIR,
            )

            builtin_template_path = CHAT_TEMPLATES_DIR / chat_template
            if not builtin_template_path.exists():
                raise ValueError(
                    f"The supplied chat template string ({chat_template}) "
                    f"appears path-like, but doesn't exist! "
                    f"Tried: {chat_template} and {builtin_template_path}"
                )

    else:
        raise TypeError(f"{type(chat_template)} is not a valid chat template type")

vllm.entrypoints.chat_utils ¶

PROMPT_EMBEDS_PLACEHOLDER_TOKEN module-attribute ¶

AsyncMultiModalContentParser ¶

parse_prompt_embeds ¶

AudioURL ¶

url instance-attribute ¶

BaseMultiModalItemTracker ¶

use_unified_vision_chunk_modality cached property ¶

add ¶

ChatCompletionContentPartAudioEmbedsParam ¶

audio_embeds instance-attribute ¶

type instance-attribute ¶

uuid instance-attribute ¶

ChatCompletionContentPartAudioParam ¶

type instance-attribute ¶

ChatCompletionContentPartImageEmbedsParam ¶

image_embeds instance-attribute ¶

type instance-attribute ¶

uuid instance-attribute ¶

ChatCompletionContentPartPromptEmbedsParam ¶

data instance-attribute ¶

type instance-attribute ¶

ChatCompletionContentPartVideoParam ¶

type instance-attribute ¶

ChatTemplateResolutionError ¶

ConversationMessage ¶

content instance-attribute ¶

name instance-attribute ¶

reasoning instance-attribute ¶

reasoning_content instance-attribute ¶

role instance-attribute ¶

task instance-attribute ¶

tool_call_id instance-attribute ¶

tool_calls instance-attribute ¶

tools instance-attribute ¶

CustomChatCompletionContentPILImageParam ¶

uuid instance-attribute ¶

CustomChatCompletionContentSimpleAudioParam ¶

CustomChatCompletionContentSimpleImageParam ¶

uuid instance-attribute ¶

CustomChatCompletionContentSimpleVideoParam ¶

uuid instance-attribute ¶

CustomChatCompletionContentToolReferenceParam ¶

name instance-attribute ¶

type instance-attribute ¶

CustomChatCompletionMessageParam ¶

content instance-attribute ¶

name instance-attribute ¶

reasoning instance-attribute ¶

role instance-attribute ¶

task instance-attribute ¶

tool_call_id instance-attribute ¶

tool_calls instance-attribute ¶

tools instance-attribute ¶

CustomThinkCompletionContentParam ¶

closed instance-attribute ¶

thinking instance-attribute ¶

type instance-attribute ¶

MultiModalContentParser ¶

parse_prompt_embeds ¶

PILImage ¶

VideoURL ¶

url instance-attribute ¶

_get_full_multimodal_text_prompt ¶

_parse_chat_message_content_mm_part ¶

_parse_chat_message_content_part ¶

_reject_reserved_placeholder_in_text ¶

_resolve_items ¶

get_tool_call_id_type ¶

validate_chat_template ¶

PROMPT_EMBEDS_PLACEHOLDER_TOKEN `module-attribute` ¶

url `instance-attribute` ¶

use_unified_vision_chunk_modality `cached` `property` ¶

audio_embeds `instance-attribute` ¶

type `instance-attribute` ¶

uuid `instance-attribute` ¶

type `instance-attribute` ¶

image_embeds `instance-attribute` ¶

type `instance-attribute` ¶

uuid `instance-attribute` ¶

data `instance-attribute` ¶

type `instance-attribute` ¶

type `instance-attribute` ¶

content `instance-attribute` ¶

name `instance-attribute` ¶

reasoning `instance-attribute` ¶

reasoning_content `instance-attribute` ¶

role `instance-attribute` ¶

task `instance-attribute` ¶

tool_call_id `instance-attribute` ¶

tool_calls `instance-attribute` ¶

tools `instance-attribute` ¶

uuid `instance-attribute` ¶

uuid `instance-attribute` ¶

uuid `instance-attribute` ¶

name `instance-attribute` ¶

type `instance-attribute` ¶

content `instance-attribute` ¶

name `instance-attribute` ¶

reasoning `instance-attribute` ¶

role `instance-attribute` ¶

task `instance-attribute` ¶

tool_call_id `instance-attribute` ¶

tool_calls `instance-attribute` ¶

tools `instance-attribute` ¶

closed `instance-attribute` ¶

thinking `instance-attribute` ¶

type `instance-attribute` ¶

url `instance-attribute` ¶