
vllm.renderers.hf

_PROCESSOR_CHAT_TEMPLATES module-attribute

_PROCESSOR_CHAT_TEMPLATES = dict[
    tuple[str, bool], str | None
]()

Used in _try_get_processor_chat_template to avoid calling cached_get_processor again if the processor previously failed to load.

This is needed because lru_cache does not cache calls that raise an exception.
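
A minimal sketch of the pattern (helper names here are illustrative, not the real vLLM internals): a module-level dict in front of the lru_cache-backed loader remembers failures as well as successes.

from functools import lru_cache

_TEMPLATE_CACHE: dict[str, str | None] = {}

@lru_cache
def _load_processor(name: str):
    ...  # may raise; lru_cache will NOT memoize a call that raises

def _try_get_template(name: str) -> str | None:
    # Check our own cache first so a known-bad processor is not
    # re-loaded (and re-failed) on every request.
    if name in _TEMPLATE_CACHE:
        return _TEMPLATE_CACHE[name]
    try:
        template = _load_processor(name).chat_template
    except Exception:
        template = None
    _TEMPLATE_CACHE[name] = template
    return template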

HfRenderer

Bases: BaseRenderer[HfTokenizer]

Source code in vllm/renderers/hf.py
class HfRenderer(BaseRenderer[HfTokenizer]):
    def __init__(
        self,
        config: VllmConfig,
        tokenizer: HfTokenizer | None,
    ) -> None:
        super().__init__(config, tokenizer)

        self.use_unified_vision_chunk = getattr(
            config.model_config.hf_config, "use_unified_vision_chunk", False
        )

        self._apply_chat_template_async = make_async(
            safe_apply_chat_template, executor=self._executor
        )

    def render_messages(
        self,
        messages: list[ChatCompletionMessageParam],
        params: ChatParams,
    ) -> tuple[list[ConversationMessage], DictPrompt]:
        model_config = self.model_config
        tokenizer = self.get_tokenizer()

        prompt_embeds_placeholder_token_id: int | None = None
        if model_config.enable_prompt_embeds:
            prompt_embeds_placeholder_token_id = (
                _ensure_prompt_embeds_placeholder_token(tokenizer)
            )

        conversation, mm_data, mm_uuids = parse_chat_messages(
            messages,
            model_config,
            content_format=resolve_chat_template_content_format(
                chat_template=params.chat_template,
                tools=params.chat_template_kwargs.get("tools"),
                given_format=params.chat_template_content_format,
                tokenizer=tokenizer,
                model_config=model_config,
            ),
            media_io_kwargs=params.media_io_kwargs,
            mm_processor_kwargs=params.mm_processor_kwargs,
        )

        # prompt_embeds tensors are carried by the tracker through mm_data,
        # but they must NOT be fed to the MM processor (which would reject
        # the unknown key). Extract them here.
        prompt_embeds_tensors: list[torch.Tensor] | None = None
        if mm_data is not None and "prompt_embeds" in mm_data:
            prompt_embeds_tensors = list(
                cast(Sequence[torch.Tensor], mm_data["prompt_embeds"])
            )
            mm_data = {k: v for k, v in mm_data.items() if k != "prompt_embeds"}
            if not mm_data:
                mm_data = None

        chat_template_kwargs = params.get_apply_chat_template_kwargs()
        if prompt_embeds_tensors:
            # prompt_embeds post-processing requires prompt_token_ids.
            if chat_template_kwargs.get("tokenize") is False:
                logger.warning_once(_TOKENIZE_OVERRIDE_WARNING)
            chat_template_kwargs["tokenize"] = True

        prompt_raw = safe_apply_chat_template(
            model_config,
            tokenizer,
            conversation,
            **chat_template_kwargs,
        )

        # NOTE: use_unified_vision_chunk is currently specific to the Kimi-K2.5
        # model, which uses unified vision chunks for both images and videos.
        if (
            self.use_unified_vision_chunk
            and mm_uuids is not None
            and mm_data is not None
        ):
            mm_uuids = rebuild_mm_uuids_from_mm_data(mm_uuids, mm_data)

            # get video placeholder, replace it with runtime video-chunk prompts
            video_placeholder = getattr(
                model_config.hf_config, "video_placeholder", None
            )
            prompt_raw = cast(
                list[int],
                replace_vision_chunk_video_placeholder(
                    prompt_raw,
                    mm_data,
                    video_placeholder,
                ),
            )

        prompt = parse_dec_only_prompt(prompt_raw)

        # When `prompt_embeds` is mixed with other modality data,
        # `_process_tokens` runs `_process_multimodal` first (expanding
        # `<|AUDIO|>` / `<|IMAGE|>` placeholders) and then
        # `_apply_prompt_embeds_to_engine_input` augments the result.
        # Stash the tensors and placeholder ID for that override to consume.
        if prompt_embeds_tensors and mm_data:
            assert prompt_embeds_placeholder_token_id is not None
            cast(dict, prompt)["_prompt_embeds"] = (
                prompt_embeds_tensors,
                prompt_embeds_placeholder_token_id,
            )
            if params.mm_processor_kwargs:
                cast(dict, prompt)["mm_processor_kwargs"] = params.mm_processor_kwargs
        elif prompt_embeds_tensors:
            # Pure mode: no other MM data, mutate prompt to EmbedsPrompt shape.
            assert prompt_embeds_placeholder_token_id is not None
            self._apply_prompt_embeds_to_prompt(
                prompt,
                prompt_embeds_tensors,
                prompt_embeds_placeholder_token_id,
            )

        if mm_data is not None:
            prompt["multi_modal_data"] = mm_data
        if mm_uuids is not None:
            prompt["multi_modal_uuids"] = mm_uuids

        return conversation, prompt

    async def render_messages_async(
        self,
        messages: list[ChatCompletionMessageParam],
        params: ChatParams,
    ) -> tuple[list[ConversationMessage], DictPrompt]:
        model_config = self.model_config
        tokenizer = self.get_tokenizer()

        prompt_embeds_placeholder_token_id: int | None = None
        if model_config.enable_prompt_embeds:
            prompt_embeds_placeholder_token_id = (
                _ensure_prompt_embeds_placeholder_token(tokenizer)
            )

        conversation, mm_data, mm_uuids = await parse_chat_messages_async(
            messages,
            model_config,
            content_format=resolve_chat_template_content_format(
                chat_template=params.chat_template,
                tools=params.chat_template_kwargs.get("tools"),
                given_format=params.chat_template_content_format,
                tokenizer=tokenizer,
                model_config=model_config,
            ),
            media_io_kwargs=params.media_io_kwargs,
            mm_processor_kwargs=params.mm_processor_kwargs,
        )

        prompt_embeds_tensors: list[torch.Tensor] | None = None
        if mm_data is not None and "prompt_embeds" in mm_data:
            prompt_embeds_tensors = list(
                cast(Sequence[torch.Tensor], mm_data["prompt_embeds"])
            )
            mm_data = {k: v for k, v in mm_data.items() if k != "prompt_embeds"}
            if not mm_data:
                mm_data = None

        chat_template_kwargs = params.get_apply_chat_template_kwargs()
        if prompt_embeds_tensors:
            # prompt_embeds post-processing requires prompt_token_ids.
            if chat_template_kwargs.get("tokenize") is False:
                logger.warning_once(_TOKENIZE_OVERRIDE_WARNING)
            chat_template_kwargs["tokenize"] = True

        prompt_raw = await self._apply_chat_template_async(
            model_config,
            tokenizer,
            conversation,
            **chat_template_kwargs,
        )

        # NOTE: use_unified_vision_chunk is currently specific to the Kimi-K2.5
        # model, which uses unified vision chunks for both images and videos.
        if (
            self.use_unified_vision_chunk
            and mm_uuids is not None
            and mm_data is not None
        ):
            mm_uuids = rebuild_mm_uuids_from_mm_data(mm_uuids, mm_data)

            # get video placeholder, replace it with runtime video-chunk prompts
            video_placeholder = getattr(
                model_config.hf_config, "video_placeholder", None
            )
            prompt_raw = cast(
                list[int],
                replace_vision_chunk_video_placeholder(
                    prompt_raw,
                    mm_data,
                    video_placeholder,
                ),
            )

        prompt = parse_dec_only_prompt(prompt_raw)

        # See `render_messages` for the rationale.
        if prompt_embeds_tensors and mm_data:
            assert prompt_embeds_placeholder_token_id is not None
            cast(dict, prompt)["_prompt_embeds"] = (
                prompt_embeds_tensors,
                prompt_embeds_placeholder_token_id,
            )
            if params.mm_processor_kwargs:
                cast(dict, prompt)["mm_processor_kwargs"] = params.mm_processor_kwargs
        elif prompt_embeds_tensors:
            assert prompt_embeds_placeholder_token_id is not None
            self._apply_prompt_embeds_to_prompt(
                prompt,
                prompt_embeds_tensors,
                prompt_embeds_placeholder_token_id,
            )

        if mm_data is not None:
            prompt["multi_modal_data"] = mm_data
        if mm_uuids is not None:
            prompt["multi_modal_uuids"] = mm_uuids

        return conversation, prompt

    @override
    def _process_tokens(
        self,
        prompt: TokensPrompt,
        *,
        skip_mm_cache: bool = False,
    ) -> TokensInput | MultiModalInput:
        """Pre-expand `prompt_embeds` sentinels before delegating to the MM
        processor, then attach `prompt_embeds` modality data to the result.

        Mixed mode only: the `_prompt_embeds` stash is set by
        `render_messages` when `prompt_embeds` co-exist with other MM data
        (images, audio, …).  We expand each 1-token sentinel to an N-token
        span *before* calling `super()._process_tokens()` so the MM
        processor records all placeholder offsets in the final (post-expansion)
        coordinate space, so no offset shifting is needed afterwards.
        """
        prompt_embeds_info = cast(dict, prompt).pop("_prompt_embeds", None)
        if prompt_embeds_info is not None:
            tensors, placeholder_token_id = prompt_embeds_info
            mm_updates = _build_prompt_embeds_updates(tensors, placeholder_token_id)
            cast(dict, prompt)["prompt_token_ids"] = _expand_prompt_embeds_placeholders(
                list(prompt["prompt_token_ids"]), mm_updates
            )
        engine_input = super()._process_tokens(prompt, skip_mm_cache=skip_mm_cache)
        if prompt_embeds_info is not None:
            tensors, _ = prompt_embeds_info
            self._apply_prompt_embeds_to_engine_input(
                cast(MultiModalInput, engine_input),
                tensors,
                mm_updates,
            )
        return engine_input

    @override
    async def _process_tokens_async(
        self,
        prompt: TokensPrompt,
        *,
        skip_mm_cache: bool = False,
    ) -> TokensInput | MultiModalInput:
        """Async equivalent of `_process_tokens`."""
        prompt_embeds_info = cast(dict, prompt).pop("_prompt_embeds", None)
        if prompt_embeds_info is not None:
            tensors, placeholder_token_id = prompt_embeds_info
            mm_updates = _build_prompt_embeds_updates(tensors, placeholder_token_id)
            cast(dict, prompt)["prompt_token_ids"] = _expand_prompt_embeds_placeholders(
                list(prompt["prompt_token_ids"]), mm_updates
            )
        engine_input = await super()._process_tokens_async(
            prompt, skip_mm_cache=skip_mm_cache
        )
        if prompt_embeds_info is not None:
            tensors, _ = prompt_embeds_info
            self._apply_prompt_embeds_to_engine_input(
                cast(MultiModalInput, engine_input),
                tensors,
                mm_updates,
            )
        return engine_input

    @staticmethod
    def _apply_prompt_embeds_to_prompt(
        prompt: DictPrompt,
        prompt_embeds_tensors: list[torch.Tensor],
        placeholder_token_id: int,
    ) -> None:
        """Mutate `prompt` from `TokensPrompt` to `EmbedsPrompt` shape.

        Pure `prompt_embeds` path only (no other MM modalities).  Expands
        each `<prompt_embeds>` sentinel token into an N-token span and builds
        the full-length `prompt_embeds` tensor + `prompt_is_token_ids` mask
        that the engine's `enable_prompt_embeds` worker branch consumes.
        """
        token_ids = cast(list[int] | None, prompt.get("prompt_token_ids"))
        if token_ids is None:
            raise RuntimeError(_MISSING_PROMPT_TOKEN_IDS_ERROR)

        embeds_orig_positions: list[int] = [
            i for i, tok in enumerate(token_ids) if tok == placeholder_token_id
        ]
        if len(embeds_orig_positions) != len(prompt_embeds_tensors):
            raise ValueError(
                f"Expected {len(prompt_embeds_tensors)} prompt_embeds "
                f"placeholder tokens in the rendered prompt, found "
                f"{len(embeds_orig_positions)}."
            )

        mm_updates = _build_prompt_embeds_updates(
            prompt_embeds_tensors, placeholder_token_id
        )
        expanded = _expand_prompt_embeds_placeholders(token_ids, mm_updates)
        positions = _build_prompt_embeds_positions(
            expanded, len(prompt_embeds_tensors), mm_updates
        )

        embeds_prompt = cast(EmbedsPrompt, prompt)
        embeds_prompt["prompt_token_ids"] = expanded
        full_embeds, is_token_ids_mask = _build_mixed_prompt_embeds(
            expanded, prompt_embeds_tensors, positions
        )
        embeds_prompt["prompt_embeds"] = full_embeds
        embeds_prompt["prompt_is_token_ids"] = is_token_ids_mask

    @staticmethod
    def _apply_prompt_embeds_to_engine_input(
        engine_input: MultiModalInput,
        prompt_embeds_tensors: list[torch.Tensor],
        mm_updates: MultiModalPromptUpdates,
    ) -> None:
        """Augment `engine_input` in-place with a `prompt_embeds` modality.

        Mixed mode: called after `_process_multimodal` has already run on the
        pre-expanded token IDs (expansion was done in `_process_tokens` before
        calling `super()`).  Locates the already-expanded `prompt_embeds` spans
        and adds `prompt_embeds` entries to `mm_kwargs`, `mm_hashes`, and
        `mm_placeholders`.
        """
        # token_ids already contain the pre-expanded N-token spans.
        token_ids = list(engine_input["prompt_token_ids"])

        positions = _build_prompt_embeds_positions(
            token_ids, len(prompt_embeds_tensors), mm_updates
        )

        pe_kwargs_items: list[MultiModalKwargsItem] = []
        pe_hashes: list[str] = []
        pe_placeholders: list[PlaceholderRange] = []
        for tensor, (start, length) in zip(
            prompt_embeds_tensors, positions, strict=True
        ):
            pe_kwargs_items.append(
                MultiModalKwargsItem(
                    {
                        "embedding": MultiModalFieldElem(
                            data=tensor,
                            field=MultiModalSharedField(batch_size=1),
                        )
                    }
                )
            )
            pe_hashes.append(MultiModalHasher.hash_kwargs(prompt_embeds=tensor))
            # `is_embed=None` matches the existing image_embeds-style
            # "no encoder, just splice the tensor directly" semantics.
            pe_placeholders.append(
                PlaceholderRange(offset=start, length=length, is_embed=None)
            )

        cast(
            MultiModalKwargsItems[MultiModalKwargsItem | None],
            engine_input["mm_kwargs"],
        )["prompt_embeds"] = pe_kwargs_items
        engine_input["mm_hashes"] = {
            **engine_input["mm_hashes"],
            "prompt_embeds": pe_hashes,
        }
        cast(dict, engine_input["mm_placeholders"])["prompt_embeds"] = pe_placeholders

_apply_prompt_embeds_to_engine_input staticmethod

_apply_prompt_embeds_to_engine_input(
    engine_input: MultiModalInput,
    prompt_embeds_tensors: list[Tensor],
    mm_updates: MultiModalPromptUpdates,
) -> None

Augment engine_input in-place with a prompt_embeds modality.

Mixed mode: called after _process_multimodal has already run on the pre-expanded token IDs (expansion was done in _process_tokens before calling super()). Locates the already-expanded prompt_embeds spans and adds prompt_embeds entries to mm_kwargs, mm_hashes, and mm_placeholders.
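
An illustrative sketch of the outcome (offsets and hashes are hypothetical): after this call, each of the three per-modality maps gains a prompt_embeds entry.

engine_input["mm_kwargs"]["prompt_embeds"]        # one MultiModalKwargsItem per tensor
engine_input["mm_hashes"]["prompt_embeds"]        # e.g. ["c0ffee...", "deadbe..."]
engine_input["mm_placeholders"]["prompt_embeds"]  # e.g. [PlaceholderRange(offset=4, length=3, is_embed=None)]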

Source code in vllm/renderers/hf.py
@staticmethod
def _apply_prompt_embeds_to_engine_input(
    engine_input: MultiModalInput,
    prompt_embeds_tensors: list[torch.Tensor],
    mm_updates: MultiModalPromptUpdates,
) -> None:
    """Augment `engine_input` in-place with a `prompt_embeds` modality.

    Mixed mode: called after `_process_multimodal` has already run on the
    pre-expanded token IDs (expansion was done in `_process_tokens` before
    calling `super()`).  Locates the already-expanded `prompt_embeds` spans
    and adds `prompt_embeds` entries to `mm_kwargs`, `mm_hashes`, and
    `mm_placeholders`.
    """
    # token_ids already contain the pre-expanded N-token spans.
    token_ids = list(engine_input["prompt_token_ids"])

    positions = _build_prompt_embeds_positions(
        token_ids, len(prompt_embeds_tensors), mm_updates
    )

    pe_kwargs_items: list[MultiModalKwargsItem] = []
    pe_hashes: list[str] = []
    pe_placeholders: list[PlaceholderRange] = []
    for tensor, (start, length) in zip(
        prompt_embeds_tensors, positions, strict=True
    ):
        pe_kwargs_items.append(
            MultiModalKwargsItem(
                {
                    "embedding": MultiModalFieldElem(
                        data=tensor,
                        field=MultiModalSharedField(batch_size=1),
                    )
                }
            )
        )
        pe_hashes.append(MultiModalHasher.hash_kwargs(prompt_embeds=tensor))
        # `is_embed=None` matches the existing image_embeds-style
        # "no encoder, just splice the tensor directly" semantics.
        pe_placeholders.append(
            PlaceholderRange(offset=start, length=length, is_embed=None)
        )

    cast(
        MultiModalKwargsItems[MultiModalKwargsItem | None],
        engine_input["mm_kwargs"],
    )["prompt_embeds"] = pe_kwargs_items
    engine_input["mm_hashes"] = {
        **engine_input["mm_hashes"],
        "prompt_embeds": pe_hashes,
    }
    cast(dict, engine_input["mm_placeholders"])["prompt_embeds"] = pe_placeholders

_apply_prompt_embeds_to_prompt staticmethod

_apply_prompt_embeds_to_prompt(
    prompt: DictPrompt,
    prompt_embeds_tensors: list[Tensor],
    placeholder_token_id: int,
) -> None

Mutate prompt from TokensPrompt to EmbedsPrompt shape.

Pure prompt_embeds path only (no other MM modalities). Expands each <prompt_embeds> sentinel token into an N-token span and builds the full-length prompt_embeds tensor + prompt_is_token_ids mask that the engine's enable_prompt_embeds worker branch consumes.
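
A worked sketch, assuming a hypothetical placeholder ID of 9999, hidden size 4096, and a single 3-row embedding tensor:

import torch

prompt = {"prompt_token_ids": [1, 9999, 2]}  # one 1-token sentinel
embeds = [torch.randn(3, 4096)]              # N = 3 embedding rows

HfRenderer._apply_prompt_embeds_to_prompt(prompt, embeds, 9999)
# prompt["prompt_token_ids"]    == [1, 9999, 9999, 9999, 2]
# prompt["prompt_is_token_ids"] == [True, False, False, False, True]
# prompt["prompt_embeds"]       is a (5, 4096) tensor: zeros except rows 1:4,
#                               which hold the rows of embeds[0]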

Source code in vllm/renderers/hf.py
@staticmethod
def _apply_prompt_embeds_to_prompt(
    prompt: DictPrompt,
    prompt_embeds_tensors: list[torch.Tensor],
    placeholder_token_id: int,
) -> None:
    """Mutate `prompt` from `TokensPrompt` to `EmbedsPrompt` shape.

    Pure `prompt_embeds` path only (no other MM modalities).  Expands
    each `<prompt_embeds>` sentinel token into an N-token span and builds
    the full-length `prompt_embeds` tensor + `prompt_is_token_ids` mask
    that the engine's `enable_prompt_embeds` worker branch consumes.
    """
    token_ids = cast(list[int] | None, prompt.get("prompt_token_ids"))
    if token_ids is None:
        raise RuntimeError(_MISSING_PROMPT_TOKEN_IDS_ERROR)

    embeds_orig_positions: list[int] = [
        i for i, tok in enumerate(token_ids) if tok == placeholder_token_id
    ]
    if len(embeds_orig_positions) != len(prompt_embeds_tensors):
        raise ValueError(
            f"Expected {len(prompt_embeds_tensors)} prompt_embeds "
            f"placeholder tokens in the rendered prompt, found "
            f"{len(embeds_orig_positions)}."
        )

    mm_updates = _build_prompt_embeds_updates(
        prompt_embeds_tensors, placeholder_token_id
    )
    expanded = _expand_prompt_embeds_placeholders(token_ids, mm_updates)
    positions = _build_prompt_embeds_positions(
        expanded, len(prompt_embeds_tensors), mm_updates
    )

    embeds_prompt = cast(EmbedsPrompt, prompt)
    embeds_prompt["prompt_token_ids"] = expanded
    full_embeds, is_token_ids_mask = _build_mixed_prompt_embeds(
        expanded, prompt_embeds_tensors, positions
    )
    embeds_prompt["prompt_embeds"] = full_embeds
    embeds_prompt["prompt_is_token_ids"] = is_token_ids_mask

_process_tokens

_process_tokens(
    prompt: TokensPrompt, *, skip_mm_cache: bool = False
) -> TokensInput | MultiModalInput

Pre-expand prompt_embeds sentinels before delegating to the MM processor, then attach prompt_embeds modality data to the result.

Mixed mode only: the _prompt_embeds stash is set by render_messages when prompt_embeds co-exist with other MM data (images, audio, …). We expand each 1-token sentinel to an N-token span before calling super()._process_tokens() so the MM processor records all placeholder offsets in the final (post-expansion) coordinate space, so no offset shifting is needed afterwards.
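
A token-level sketch of the ordering, using hypothetical IDs (9999 = prompt_embeds sentinel, 8888 = image placeholder, N = 3):

# before _process_tokens:    [1, 9999, 8888, 2]
# after pre-expansion:       [1, 9999, 9999, 9999, 8888, 2]
# super()._process_tokens() then expands 8888 in place and records every
# placeholder offset against this final sequence, so the prompt_embeds span
# at offset 1 and the image span already share one coordinate space.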

Source code in vllm/renderers/hf.py
@override
def _process_tokens(
    self,
    prompt: TokensPrompt,
    *,
    skip_mm_cache: bool = False,
) -> TokensInput | MultiModalInput:
    """Pre-expand `prompt_embeds` sentinels before delegating to the MM
    processor, then attach `prompt_embeds` modality data to the result.

    Mixed mode only: the `_prompt_embeds` stash is set by
    `render_messages` when `prompt_embeds` co-exist with other MM data
    (images, audio, …).  We expand each 1-token sentinel to an N-token
    span *before* calling `super()._process_tokens()` so the MM
    processor records all placeholder offsets in the final (post-expansion)
    coordinate space, so no offset shifting is needed afterwards.
    """
    prompt_embeds_info = cast(dict, prompt).pop("_prompt_embeds", None)
    if prompt_embeds_info is not None:
        tensors, placeholder_token_id = prompt_embeds_info
        mm_updates = _build_prompt_embeds_updates(tensors, placeholder_token_id)
        cast(dict, prompt)["prompt_token_ids"] = _expand_prompt_embeds_placeholders(
            list(prompt["prompt_token_ids"]), mm_updates
        )
    engine_input = super()._process_tokens(prompt, skip_mm_cache=skip_mm_cache)
    if prompt_embeds_info is not None:
        tensors, _ = prompt_embeds_info
        self._apply_prompt_embeds_to_engine_input(
            cast(MultiModalInput, engine_input),
            tensors,
            mm_updates,
        )
    return engine_input

_process_tokens_async async

_process_tokens_async(
    prompt: TokensPrompt, *, skip_mm_cache: bool = False
) -> TokensInput | MultiModalInput

Async equivalent of _process_tokens.

Source code in vllm/renderers/hf.py
@override
async def _process_tokens_async(
    self,
    prompt: TokensPrompt,
    *,
    skip_mm_cache: bool = False,
) -> TokensInput | MultiModalInput:
    """Async equivalent of `_process_tokens`."""
    prompt_embeds_info = cast(dict, prompt).pop("_prompt_embeds", None)
    if prompt_embeds_info is not None:
        tensors, placeholder_token_id = prompt_embeds_info
        mm_updates = _build_prompt_embeds_updates(tensors, placeholder_token_id)
        cast(dict, prompt)["prompt_token_ids"] = _expand_prompt_embeds_placeholders(
            list(prompt["prompt_token_ids"]), mm_updates
        )
    engine_input = await super()._process_tokens_async(
        prompt, skip_mm_cache=skip_mm_cache
    )
    if prompt_embeds_info is not None:
        tensors, _ = prompt_embeds_info
        self._apply_prompt_embeds_to_engine_input(
            cast(MultiModalInput, engine_input),
            tensors,
            mm_updates,
        )
    return engine_input

_build_mixed_prompt_embeds

_build_mixed_prompt_embeds(
    token_ids: list[int],
    prompt_embeds_tensors: Sequence[Tensor],
    positions: list[tuple[int, int]],
) -> tuple[Tensor, list[bool]]

Build the full-length prompt_embeds tensor and the is_token_ids mask aligned to token_ids.
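
For example (shapes and values are illustrative), splicing one 2-row tensor at position 1 of a 6-token prompt:

import torch

full_embeds, is_token_ids = _build_mixed_prompt_embeds(
    token_ids=[1, 0, 0, 2, 3, 4],
    prompt_embeds_tensors=[torch.randn(2, 4096)],
    positions=[(1, 2)],
)
# full_embeds.shape == (6, 4096); rows 1:3 hold the tensor, the rest are zeros
# is_token_ids == [True, False, False, True, True, True]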

Source code in vllm/renderers/hf.py
def _build_mixed_prompt_embeds(
    token_ids: list[int],
    prompt_embeds_tensors: Sequence[torch.Tensor],
    positions: list[tuple[int, int]],
) -> tuple[torch.Tensor, list[bool]]:
    """Build the full-length `prompt_embeds` tensor and the `is_token_ids`
    mask aligned to `token_ids`."""
    total_len = len(token_ids)
    hidden_size = prompt_embeds_tensors[0].shape[1]
    dtype = prompt_embeds_tensors[0].dtype

    full_embeds = torch.zeros(total_len, hidden_size, dtype=dtype)
    is_token_ids = torch.ones(total_len, dtype=torch.bool)

    for (start, length), tensor in zip(positions, prompt_embeds_tensors, strict=True):
        full_embeds[start : start + length] = tensor
        is_token_ids[start : start + length] = False

    return full_embeds, is_token_ids.tolist()

_build_prompt_embeds_positions

_build_prompt_embeds_positions(
    token_ids: list[int],
    num_tensors: int,
    mm_prompt_updates: MultiModalPromptUpdates,
) -> list[tuple[int, int]]

Locate each prompt_embeds placeholder span in token_ids.

Expects token_ids to already contain expanded N-token spans. Returns [(start_idx, length), ...] aligned with the tensors.
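
For instance (hypothetical placeholder ID 9999), after two sentinels have been expanded into spans of 2 and 3 tokens, the expected result per this docstring is:

updates = _build_prompt_embeds_updates(
    [torch.randn(2, 4096), torch.randn(3, 4096)], placeholder_token_id=9999
)
_build_prompt_embeds_positions([1, 9999, 9999, 2, 9999, 9999, 9999], 2, updates)
# -> [(1, 2), (4, 3)]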

Source code in vllm/renderers/hf.py
def _build_prompt_embeds_positions(
    token_ids: list[int],
    num_tensors: int,
    mm_prompt_updates: MultiModalPromptUpdates,
) -> list[tuple[int, int]]:
    """Locate each prompt_embeds placeholder span in `token_ids`.

    Expects `token_ids` to already contain expanded N-token spans.
    Returns `[(start_idx, length), ...]` aligned with the tensors.
    """
    placeholders = find_mm_placeholders(
        prompt=token_ids,
        mm_prompt_updates=mm_prompt_updates,
        tokenizer=None,
    )
    features = placeholders.get("prompt_embeds", [])

    if len(features) != num_tensors:
        raise ValueError(
            _PROMPT_EMBEDS_PLACEHOLDER_SPAN_MISMATCH_ERROR.format(
                expected=num_tensors,
                actual=len(features),
            )
        )

    return [(f.start_idx, f.length) for f in features]

_build_prompt_embeds_updates

_build_prompt_embeds_updates(
    prompt_embeds_tensors: Sequence[Tensor],
    placeholder_token_id: int,
) -> MultiModalPromptUpdates

Build MultiModalPromptUpdates for prompt_embeds expansion.

Each tensor produces a PromptReplacement that maps [placeholder_token_id] -> [placeholder_token_id] x N (where N = tensor.shape[0]).
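
For two tensors of 2 and 3 rows (placeholder ID 9999 is hypothetical), the resulting replacements read:

# item 0: [9999] -> [9999, 9999]
# item 1: [9999] -> [9999, 9999, 9999]
updates = _build_prompt_embeds_updates(
    [torch.randn(2, 4096), torch.randn(3, 4096)], placeholder_token_id=9999
)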

Source code in vllm/renderers/hf.py
def _build_prompt_embeds_updates(
    prompt_embeds_tensors: Sequence[torch.Tensor],
    placeholder_token_id: int,
) -> MultiModalPromptUpdates:
    """Build `MultiModalPromptUpdates` for `prompt_embeds` expansion.

    Each tensor produces a `PromptReplacement` that maps
    `[placeholder_token_id]` -> `[placeholder_token_id] x N`
    (where `N = tensor.shape[0]`).
    """
    updates: list[Sequence[ResolvedPromptUpdate]] = []
    for i, tensor in enumerate(prompt_embeds_tensors):
        update = PromptReplacement(
            modality="prompt_embeds",
            target=[placeholder_token_id],
            replacement=[placeholder_token_id] * tensor.shape[0],
        )
        updates.append([update.resolve(item_idx=i)])
    return {"prompt_embeds": updates}

_ensure_prompt_embeds_placeholder_token

_ensure_prompt_embeds_placeholder_token(
    tokenizer: HfTokenizer,
) -> int

Register PROMPT_EMBEDS_PLACEHOLDER_TOKEN as a special token and return its token ID.
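
Usage sketch (tokenizer is any HfTokenizer instance; results are cached per tokenizer, so repeat calls are idempotent):

token_id = _ensure_prompt_embeds_placeholder_token(tokenizer)
assert _ensure_prompt_embeds_placeholder_token(tokenizer) == token_id
# `token_id` is the single ID the renderer later expands into N-token spans.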

Source code in vllm/renderers/hf.py
def _ensure_prompt_embeds_placeholder_token(tokenizer: HfTokenizer) -> int:
    """Register `PROMPT_EMBEDS_PLACEHOLDER_TOKEN` as a special token and return
    its token ID."""
    cached = _PROMPT_EMBEDS_PLACEHOLDER_TOKEN_ID_CACHE.get(tokenizer)
    if cached is not None:
        return cached

    tokenizer.add_special_tokens(
        {"additional_special_tokens": [PROMPT_EMBEDS_PLACEHOLDER_TOKEN]}
    )

    ids = tokenizer.encode(PROMPT_EMBEDS_PLACEHOLDER_TOKEN, add_special_tokens=False)
    if len(ids) != 1:
        raise RuntimeError(
            _PROMPT_EMBEDS_PLACEHOLDER_TOKEN_ID_ERROR.format(
                token=PROMPT_EMBEDS_PLACEHOLDER_TOKEN,
                num_ids=len(ids),
                ids=ids,
            )
        )

    token_id = ids[0]
    _PROMPT_EMBEDS_PLACEHOLDER_TOKEN_ID_CACHE[tokenizer] = token_id
    return token_id

_expand_prompt_embeds_placeholders

_expand_prompt_embeds_placeholders(
    token_ids: list[int],
    mm_prompt_updates: MultiModalPromptUpdates,
) -> list[int]

Expand each 1-token prompt_embeds sentinel into an N-token span.

Uses apply_token_matches. Each single placeholder token in token_ids is replaced with a consecutive span of tensor.shape[0] copies, following tensors in order.
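
Continuing the hypothetical updates from _build_prompt_embeds_updates above (placeholder ID 9999, tensors of 2 and 3 rows):

_expand_prompt_embeds_placeholders([1, 9999, 2, 9999], updates)
# -> [1, 9999, 9999, 2, 9999, 9999, 9999]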

Source code in vllm/renderers/hf.py
def _expand_prompt_embeds_placeholders(
    token_ids: list[int],
    mm_prompt_updates: MultiModalPromptUpdates,
) -> list[int]:
    """Expand each 1-token `prompt_embeds` sentinel into an N-token span.

    Uses `apply_token_matches`.  Each single placeholder token in
    `token_ids` is replaced with a consecutive span of
    `tensor.shape[0]` copies, following tensors in order.
    """
    expanded, _ = apply_token_matches(token_ids, mm_prompt_updates, tokenizer=None)
    return expanded

build_video_prompts_from_mm_data

build_video_prompts_from_mm_data(
    mm_data: MultiModalDataDict,
) -> list[str]

Build video prompts from vision_chunk data.

Collects prompts from video chunks and groups them by video_idx.

Parameters:

    mm_data (MultiModalDataDict, required):
        Processed multimodal data with vision_chunk items.

Returns:

    list[str]: List of video prompts, one per video.
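
A usage sketch; the vision_chunk item keys (type, video_idx, prompt) follow the source below, while the values are made up:

mm_data = {
    "vision_chunk": [
        {"type": "video_chunk", "video_idx": 0, "prompt": "<chunk0>"},
        {"type": "video_chunk", "video_idx": 0, "prompt": "<chunk1>"},
        {"type": "video_chunk", "video_idx": 1, "prompt": "<chunkA>"},
    ]
}
build_video_prompts_from_mm_data(mm_data)
# -> ["<chunk0><chunk1>", "<chunkA>"]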

Source code in vllm/renderers/hf.py
def build_video_prompts_from_mm_data(
    mm_data: MultiModalDataDict,
) -> list[str]:
    """Build video prompts from vision_chunk data.

    Collects prompts from video chunks and groups them by video_idx.

    Args:
        mm_data: Processed multimodal data with vision_chunk items

    Returns:
        List of video prompts, one per video.
    """
    vision_chunks = mm_data.get("vision_chunk")
    if vision_chunks is None:
        return []

    # Group chunks by video_idx
    video_prompts_dict: dict[int, list[str]] = defaultdict(list)

    for item in vision_chunks:
        # vision_chunk items are always dicts (VisionChunkImage/VisionChunkVideo)
        assert isinstance(item, dict)
        if item.get("type") == "video_chunk":
            video_idx = item.get("video_idx", 0)
            prompt = item.get("prompt", "")
            video_prompts_dict[video_idx].append(prompt)

    # Build prompts in video order
    video_prompts = [
        "".join(video_prompts_dict[video_idx])
        for video_idx in sorted(video_prompts_dict.keys())
    ]

    return video_prompts

rebuild_mm_uuids_from_mm_data

rebuild_mm_uuids_from_mm_data(
    mm_uuids: MultiModalUUIDDict,
    mm_data: MultiModalDataDict,
) -> MultiModalUUIDDict

Rebuild mm_uuids after vision_chunk processing.

When videos are split into chunks, the original UUIDs need to be updated to reflect the new UUIDs generated for each chunk.

Parameters:

    mm_uuids (MultiModalUUIDDict, required):
        Original UUIDs dictionary.
    mm_data (MultiModalDataDict, required):
        Processed multimodal data with vision_chunk items.

Returns:

    MultiModalUUIDDict: Updated UUIDs dictionary with chunk UUIDs.
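
A usage sketch with made-up UUIDs; per the source below, chunk UUIDs are collected in order and stored under the vision_chunk key without touching other entries:

mm_uuids = {"video": ["orig-video-uuid"]}
mm_data = {
    "vision_chunk": [
        {"type": "video_chunk", "uuid": "chunk-uuid-0"},
        {"type": "video_chunk", "uuid": "chunk-uuid-1"},
    ]
}
rebuild_mm_uuids_from_mm_data(mm_uuids, mm_data)
# -> {"video": ["orig-video-uuid"], "vision_chunk": ["chunk-uuid-0", "chunk-uuid-1"]}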

Source code in vllm/renderers/hf.py
def rebuild_mm_uuids_from_mm_data(
    mm_uuids: MultiModalUUIDDict,
    mm_data: MultiModalDataDict,
) -> MultiModalUUIDDict:
    """Rebuild mm_uuids after vision_chunk processing.

    When videos are split into chunks, the original UUIDs need to be updated
    to reflect the new UUIDs generated for each chunk.

    Args:
        mm_uuids: Original UUIDs dictionary
        mm_data: Processed multimodal data with vision_chunk items

    Returns:
        Updated UUIDs dictionary with chunk UUIDs
    """
    vision_chunks = mm_data.get("vision_chunk")
    if vision_chunks is None:
        return mm_uuids

    assert all(isinstance(item, dict) for item in vision_chunks), (
        "Expected all vision_chunk items to be dicts"
    )
    vision_chunks = cast(list[dict[str, Any]], vision_chunks)
    vision_chunk_uuids = [
        uuid_val for item in vision_chunks if (uuid_val := item.get("uuid")) is not None
    ]

    if vision_chunk_uuids:
        mm_uuids = dict(mm_uuids)
        mm_uuids["vision_chunk"] = vision_chunk_uuids

    return mm_uuids