
vllm.renderers.hf

_PROCESSOR_CHAT_TEMPLATES module-attribute

_PROCESSOR_CHAT_TEMPLATES = dict[
    tuple[str, bool], str | None
]()

Used in _try_get_processor_chat_template to avoid calling cached_get_processor again if the processor previously failed to load.

This is needed because lru_cache does not cache calls that raise an exception.
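
A minimal sketch of the pattern (helper names here are illustrative, not the real vLLM internals): a module-level dict in front of the lru_cache-backed loader remembers failures as well as successes.

from functools import lru_cache

_TEMPLATE_CACHE: dict[str, str | None] = {}

@lru_cache
def _load_processor(name: str):
    ...  # may raise; lru_cache will NOT memoize a call that raises

def _try_get_template(name: str) -> str | None:
    # Check our own cache first so a known-bad processor is not
    # re-loaded (and re-failed) on every request.
    if name in _TEMPLATE_CACHE:
        return _TEMPLATE_CACHE[name]
    try:
        template = _load_processor(name).chat_template
    except Exception:
        template = None
    _TEMPLATE_CACHE[name] = template
    return template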

HfRenderer

Bases: BaseRenderer[HfTokenizer]

Source code in vllm/renderers/hf.py
class HfRenderer(BaseRenderer[HfTokenizer]):
    def __init__(
        self,
        config: VllmConfig,
        tokenizer: HfTokenizer | None,
    ) -> None:
        super().__init__(config, tokenizer)

        self.use_unified_vision_chunk = getattr(
            config.model_config.hf_config, "use_unified_vision_chunk", False
        )

        self._apply_chat_template_async = make_async(
            safe_apply_chat_template, executor=self._executor
        )

    def render_messages(
        self,
        messages: list[ChatCompletionMessageParam],
        params: ChatParams,
    ) -> tuple[list[ConversationMessage], DictPrompt]:
        model_config = self.model_config
        tokenizer = self.get_tokenizer()

        prompt_embeds_placeholder_token_id: int | None = None
        if model_config.enable_prompt_embeds:
            prompt_embeds_placeholder_token_id = (
                _ensure_prompt_embeds_placeholder_token(tokenizer)
            )

        conversation, mm_data, mm_uuids = parse_chat_messages(
            messages,
            model_config,
            content_format=resolve_chat_template_content_format(
                chat_template=params.chat_template,
                tools=params.chat_template_kwargs.get("tools"),
                given_format=params.chat_template_content_format,
                tokenizer=tokenizer,
                model_config=model_config,
            ),
            media_io_kwargs=params.media_io_kwargs,
            mm_processor_kwargs=params.mm_processor_kwargs,
        )

        # prompt_embeds tensors are carried by the tracker through mm_data,
        # but they must NOT be fed to the MM processor (which would reject
        # the unknown key). Extract them here.
        prompt_embeds_tensors: list[torch.Tensor] | None = None
        if mm_data is not None and "prompt_embeds" in mm_data:
            prompt_embeds_tensors = list(
                cast(Sequence[torch.Tensor], mm_data["prompt_embeds"])
            )
            mm_data = {k: v for k, v in mm_data.items() if k != "prompt_embeds"}
            if not mm_data:
                mm_data = None

        chat_template_kwargs = params.get_apply_chat_template_kwargs()
        if prompt_embeds_tensors:
            # prompt_embeds post-processing requires prompt_token_ids.
            if chat_template_kwargs.get("tokenize") is False:
                logger.warning_once(_TOKENIZE_OVERRIDE_WARNING)
            chat_template_kwargs["tokenize"] = True

        prompt_raw = safe_apply_chat_template(
            model_config,
            tokenizer,
            conversation,
            **chat_template_kwargs,
        )

        # NOTE: use_unified_vision_chunk is currently specific to the Kimi-K2.5
        # model, which uses unified vision chunks for both images and videos.
        if (
            self.use_unified_vision_chunk
            and mm_uuids is not None
            and mm_data is not None
        ):
            mm_uuids = rebuild_mm_uuids_from_mm_data(mm_uuids, mm_data)

            # get video placeholder, replace it with runtime video-chunk prompts
            video_placeholder = getattr(
                model_config.hf_config, "video_placeholder", None
            )
            prompt_raw = cast(
                list[int],
                replace_vision_chunk_video_placeholder(
                    prompt_raw,
                    mm_data,
                    video_placeholder,
                ),
            )

        prompt = parse_dec_only_prompt(prompt_raw)

        # When `prompt_embeds` is mixed with other modality data,
        # `_process_tokens` runs `_process_multimodal` first (expanding
        # `<|AUDIO|>` / `<|IMAGE|>` placeholders) and then
        # `_apply_prompt_embeds_to_engine_input` augments the result.
        # Stash the tensors and placeholder ID for that override to consume.
        if prompt_embeds_tensors and mm_data:
            assert prompt_embeds_placeholder_token_id is not None
            cast(dict, prompt)["_prompt_embeds"] = (
                prompt_embeds_tensors,
                prompt_embeds_placeholder_token_id,
            )
            if params.mm_processor_kwargs:
                cast(dict, prompt)["mm_processor_kwargs"] = params.mm_processor_kwargs
        elif prompt_embeds_tensors:
            # Pure mode: no other MM data, mutate prompt to EmbedsPrompt shape.
            assert prompt_embeds_placeholder_token_id is not None
            self._apply_prompt_embeds_to_prompt(
                prompt,
                prompt_embeds_tensors,
                prompt_embeds_placeholder_token_id,
            )

        if mm_data is not None:
            prompt["multi_modal_data"] = mm_data
        if mm_uuids is not None:
            prompt["multi_modal_uuids"] = mm_uuids

        return conversation, prompt

    async def render_messages_async(
        self,
        messages: list[ChatCompletionMessageParam],
        params: ChatParams,
    ) -> tuple[list[ConversationMessage], DictPrompt]:
        model_config = self.model_config
        tokenizer = self.get_tokenizer()

        prompt_embeds_placeholder_token_id: int | None = None
        if model_config.enable_prompt_embeds:
            prompt_embeds_placeholder_token_id = (
                _ensure_prompt_embeds_placeholder_token(tokenizer)
            )

        conversation, mm_data, mm_uuids = await parse_chat_messages_async(
            messages,
            model_config,
            content_format=resolve_chat_template_content_format(
                chat_template=params.chat_template,
                tools=params.chat_template_kwargs.get("tools"),
                given_format=params.chat_template_content_format,
                tokenizer=tokenizer,
                model_config=model_config,
            ),
            media_io_kwargs=params.media_io_kwargs,
            mm_processor_kwargs=params.mm_processor_kwargs,
        )

        prompt_embeds_tensors: list[torch.Tensor] | None = None
        if mm_data is not None and "prompt_embeds" in mm_data:
            prompt_embeds_tensors = list(
                cast(Sequence[torch.Tensor], mm_data["prompt_embeds"])
            )
            mm_data = {k: v for k, v in mm_data.items() if k != "prompt_embeds"}
            if not mm_data:
                mm_data = None

        chat_template_kwargs = params.get_apply_chat_template_kwargs()
        if prompt_embeds_tensors:
            # prompt_embeds post-processing requires prompt_token_ids.
            if chat_template_kwargs.get("tokenize") is False:
                logger.warning_once(_TOKENIZE_OVERRIDE_WARNING)
            chat_template_kwargs["tokenize"] = True

        prompt_raw = await self._apply_chat_template_async(
            model_config,
            tokenizer,
            conversation,
            **chat_template_kwargs,
        )

        # NOTE: use_unified_vision_chunk is currently specific to the Kimi-K2.5
        # model, which uses unified vision chunks for both images and videos.
        if (
            self.use_unified_vision_chunk
            and mm_uuids is not None
            and mm_data is not None
        ):
            mm_uuids = rebuild_mm_uuids_from_mm_data(mm_uuids, mm_data)

            # get video placeholder, replace it with runtime video-chunk prompts
            video_placeholder = getattr(
                model_config.hf_config, "video_placeholder", None
            )
            prompt_raw = cast(
                list[int],
                replace_vision_chunk_video_placeholder(
                    prompt_raw,
                    mm_data,
                    video_placeholder,
                ),
            )

        prompt = parse_dec_only_prompt(prompt_raw)

        # See `render_messages` for the rationale.
        if prompt_embeds_tensors and mm_data:
            assert prompt_embeds_placeholder_token_id is not None
            cast(dict, prompt)["_prompt_embeds"] = (
                prompt_embeds_tensors,
                prompt_embeds_placeholder_token_id,
            )
            if params.mm_processor_kwargs:
                cast(dict, prompt)["mm_processor_kwargs"] = params.mm_processor_kwargs
        elif prompt_embeds_tensors:
            assert prompt_embeds_placeholder_token_id is not None
            self._apply_prompt_embeds_to_prompt(
                prompt,
                prompt_embeds_tensors,
                prompt_embeds_placeholder_token_id,
            )

        if mm_data is not None:
            prompt["multi_modal_data"] = mm_data
        if mm_uuids is not None:
            prompt["multi_modal_uuids"] = mm_uuids

        return conversation, prompt

    @override
    def _process_tokens(
        self,
        prompt: TokensPrompt,
        *,
        skip_mm_cache: bool = False,
    ) -> TokensInput | MultiModalInput:
        """Pre-expand `prompt_embeds` sentinels before delegating to the MM
        processor, then attach `prompt_embeds` modality data to the result.

        Mixed mode only: the `_prompt_embeds` stash is set by
        `render_messages` when `prompt_embeds` co-exist with other MM data
        (images, audio, …).  We expand each 1-token sentinel to an N-token
        span *before* calling `super()._process_tokens()` so the MM
        processor records all placeholder offsets in the final (post-expansion)
        coordinate space, so no offset shifting is needed afterwards.
        """
        prompt_embeds_info = cast(dict, prompt).pop("_prompt_embeds", None)
        if prompt_embeds_info is not None:
            tensors, placeholder_token_id = prompt_embeds_info
            mm_updates = _build_prompt_embeds_updates(tensors, placeholder_token_id)
            cast(dict, prompt)["prompt_token_ids"] = _expand_prompt_embeds_placeholders(
                list(prompt["prompt_token_ids"]), mm_updates
            )
        engine_input = super()._process_tokens(prompt, skip_mm_cache=skip_mm_cache)
        if prompt_embeds_info is not None:
            tensors, _ = prompt_embeds_info
            self._apply_prompt_embeds_to_engine_input(
                cast(MultiModalInput, engine_input),
                tensors,
                mm_updates,
            )
        return engine_input

    @override
    async def _process_tokens_async(
        self,
        prompt: TokensPrompt,
        *,
        skip_mm_cache: bool = False,
    ) -> TokensInput | MultiModalInput:
        """Async equivalent of `_process_tokens`."""
        prompt_embeds_info = cast(dict, prompt).pop("_prompt_embeds", None)
        if prompt_embeds_info is not None:
            tensors, placeholder_token_id = prompt_embeds_info
            mm_updates = _build_prompt_embeds_updates(tensors, placeholder_token_id)
            cast(dict, prompt)["prompt_token_ids"] = _expand_prompt_embeds_placeholders(
                list(prompt["prompt_token_ids"]), mm_updates
            )
        engine_input = await super()._process_tokens_async(
            prompt, skip_mm_cache=skip_mm_cache
        )
        if prompt_embeds_info is not None:
            tensors, _ = prompt_embeds_info
            self._apply_prompt_embeds_to_engine_input(
                cast(MultiModalInput, engine_input),
                tensors,
                mm_updates,
            )
        return engine_input

    @staticmethod
    def _apply_prompt_embeds_to_prompt(
        prompt: DictPrompt,
        prompt_embeds_tensors: list[torch.Tensor],
        placeholder_token_id: int,
    ) -> None:
        """Mutate `prompt` from `TokensPrompt` to `EmbedsPrompt` shape.

        Pure `prompt_embeds` path only (no other MM modalities).  Expands
        each `<prompt_embeds>` sentinel token into an N-token span and builds
        the full-length `prompt_embeds` tensor + `prompt_is_token_ids` mask
        that the engine's `enable_prompt_embeds` worker branch consumes.
        """
        token_ids = cast(list[int] | None, prompt.get("prompt_token_ids"))
        if token_ids is None:
            raise RuntimeError(_MISSING_PROMPT_TOKEN_IDS_ERROR)

        embeds_orig_positions: list[int] = [
            i for i, tok in enumerate(token_ids) if tok == placeholder_token_id
        ]
        if len(embeds_orig_positions) != len(prompt_embeds_tensors):
            raise ValueError(
                f"Expected {len(prompt_embeds_tensors)} prompt_embeds "
                f"placeholder tokens in the rendered prompt, found "
                f"{len(embeds_orig_positions)}."
            )

        mm_updates = _build_prompt_embeds_updates(
            prompt_embeds_tensors, placeholder_token_id
        )
        expanded = _expand_prompt_embeds_placeholders(token_ids, mm_updates)
        positions = _build_prompt_embeds_positions(
            expanded, len(prompt_embeds_tensors), mm_updates
        )

        embeds_prompt = cast(EmbedsPrompt, prompt)
        embeds_prompt["prompt_token_ids"] = expanded
        full_embeds, is_token_ids_mask = _build_mixed_prompt_embeds(
            expanded, prompt_embeds_tensors, positions
        )
        embeds_prompt["prompt_embeds"] = full_embeds
        embeds_prompt["prompt_is_token_ids"] = is_token_ids_mask

    @staticmethod
    def _apply_prompt_embeds_to_engine_input(
        engine_input: MultiModalInput,
        prompt_embeds_tensors: list[torch.Tensor],
        mm_updates: MultiModalPromptUpdates,
    ) -> None:
        """Augment `engine_input` in-place with a `prompt_embeds` modality.

        Mixed mode: called after `_process_multimodal` has already run on the
        pre-expanded token IDs (expansion was done in `_process_tokens` before
        calling `super()`).  Locates the already-expanded `prompt_embeds` spans
        and adds `prompt_embeds` entries to `mm_kwargs`, `mm_hashes`, and
        `mm_placeholders`.
        """
        # token_ids already contain the pre-expanded N-token spans.
        token_ids = list(engine_input["prompt_token_ids"])

        positions = _build_prompt_embeds_positions(
            token_ids, len(prompt_embeds_tensors), mm_updates
        )

        pe_kwargs_items: list[MultiModalKwargsItem] = []
        pe_hashes: list[str] = []
        pe_placeholders: list[PlaceholderRange] = []
        for tensor, (start, length) in zip(
            prompt_embeds_tensors, positions, strict=True
        ):
            pe_kwargs_items.append(
                MultiModalKwargsItem(
                    {
                        "embedding": MultiModalFieldElem(
                            data=tensor,
                            field=MultiModalSharedField(batch_size=1),
                        )
                    }
                )
            )
            pe_hashes.append(MultiModalHasher.hash_kwargs(prompt_embeds=tensor))
            # `is_embed=None` matches the existing image_embeds-style
            # "no encoder, just splice the tensor directly" semantics.
            pe_placeholders.append(
                PlaceholderRange(offset=start, length=length, is_embed=None)
            )

        cast(
            MultiModalKwargsItems[MultiModalKwargsItem | None],
            engine_input["mm_kwargs"],
        )["prompt_embeds"] = pe_kwargs_items
        engine_input["mm_hashes"] = {
            **engine_input["mm_hashes"],
            "prompt_embeds": pe_hashes,
        }
        cast(dict, engine_input["mm_placeholders"])["prompt_embeds"] = pe_placeholders

_apply_prompt_embeds_to_engine_input staticmethod

_apply_prompt_embeds_to_engine_input(
    engine_input: MultiModalInput,
    prompt_embeds_tensors: list[Tensor],
    mm_updates: MultiModalPromptUpdates,
) -> None

Augment engine_input in-place with a prompt_embeds modality.

Mixed mode: called after _process_multimodal has already run on the pre-expanded token IDs (expansion was done in _process_tokens before calling super()). Locates the already-expanded prompt_embeds spans and adds prompt_embeds entries to mm_kwargs, mm_hashes, and mm_placeholders.
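
An illustrative sketch of the outcome (offsets and hashes are hypothetical): after this call, each of the three per-modality maps gains a prompt_embeds entry.

engine_input["mm_kwargs"]["prompt_embeds"]        # one MultiModalKwargsItem per tensor
engine_input["mm_hashes"]["prompt_embeds"]        # e.g. ["c0ffee...", "deadbe..."]
engine_input["mm_placeholders"]["prompt_embeds"]  # e.g. [PlaceholderRange(offset=4, length=3, is_embed=None)]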

Source code in vllm/renderers/hf.py
@staticmethod
def _apply_prompt_embeds_to_engine_input(
    engine_input: MultiModalInput,
    prompt_embeds_tensors: list[torch.Tensor],
    mm_updates: MultiModalPromptUpdates,
) -> None:
    """Augment `engine_input` in-place with a `prompt_embeds` modality.

    Mixed mode: called after `_process_multimodal` has already run on the
    pre-expanded token IDs (expansion was done in `_process_tokens` before
    calling `super()`).  Locates the already-expanded `prompt_embeds` spans
    and adds `prompt_embeds` entries to `mm_kwargs`, `mm_hashes`, and
    `mm_placeholders`.
    """
    # token_ids already contain the pre-expanded N-token spans.
    token_ids = list(engine_input["prompt_token_ids"])

    positions = _build_prompt_embeds_positions(
        token_ids, len(prompt_embeds_tensors), mm_updates
    )

    pe_kwargs_items: list[MultiModalKwargsItem] = []
    pe_hashes: list[str] = []
    pe_placeholders: list[PlaceholderRange] = []
    for tensor, (start, length) in zip(
        prompt_embeds_tensors, positions, strict=True
    ):
        pe_kwargs_items.append(
            MultiModalKwargsItem(
                {
                    "embedding": MultiModalFieldElem(
                        data=tensor,
                        field=MultiModalSharedField(batch_size=1),
                    )
                }
            )
        )
        pe_hashes.append(MultiModalHasher.hash_kwargs(prompt_embeds=tensor))
        # `is_embed=None` matches the existing image_embeds-style
        # "no encoder, just splice the tensor directly" semantics.
        pe_placeholders.append(
            PlaceholderRange(offset=start, length=length, is_embed=None)
        )

    cast(
        MultiModalKwargsItems[MultiModalKwargsItem | None],
        engine_input["mm_kwargs"],
    )["prompt_embeds"] = pe_kwargs_items
    engine_input["mm_hashes"] = {
        **engine_input["mm_hashes"],
        "prompt_embeds": pe_hashes,
    }
    cast(dict, engine_input["mm_placeholders"])["prompt_embeds"] = pe_placeholders

_apply_prompt_embeds_to_prompt staticmethod

_apply_prompt_embeds_to_prompt(
    prompt: DictPrompt,
    prompt_embeds_tensors: list[Tensor],
    placeholder_token_id: int,
) -> None

Mutate prompt from TokensPrompt to EmbedsPrompt shape.

Pure prompt_embeds path only (no other MM modalities). Expands each <prompt_embeds> sentinel token into an N-token span and builds the full-length prompt_embeds tensor + prompt_is_token_ids mask that the engine's enable_prompt_embeds worker branch consumes.
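
A worked sketch, assuming a hypothetical placeholder ID of 9999, hidden size 4096, and a single 3-row embedding tensor:

import torch

prompt = {"prompt_token_ids": [1, 9999, 2]}  # one 1-token sentinel
embeds = [torch.randn(3, 4096)]              # N = 3 embedding rows

HfRenderer._apply_prompt_embeds_to_prompt(prompt, embeds, 9999)
# prompt["prompt_token_ids"]    == [1, 9999, 9999, 9999, 2]
# prompt["prompt_is_token_ids"] == [True, False, False, False, True]
# prompt["prompt_embeds"]       is a (5, 4096) tensor: zeros except rows 1:4,
#                               which hold the rows of embeds[0]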

Source code in vllm/renderers/hf.py
@staticmethod
def _apply_prompt_embeds_to_prompt(
    prompt: DictPrompt,
    prompt_embeds_tensors: list[torch.Tensor],
    placeholder_token_id: int,
) -> None:
    """Mutate `prompt` from `TokensPrompt` to `EmbedsPrompt` shape.

    Pure `prompt_embeds` path only (no other MM modalities).  Expands
    each `<prompt_embeds>` sentinel token into an N-token span and builds
    the full-length `prompt_embeds` tensor + `prompt_is_token_ids` mask
    that the engine's `enable_prompt_embeds` worker branch consumes.
    """
    token_ids = cast(list[int] | None, prompt.get("prompt_token_ids"))
    if token_ids is None:
        raise RuntimeError(_MISSING_PROMPT_TOKEN_IDS_ERROR)

    embeds_orig_positions: list[int] = [
        i for i, tok in enumerate(token_ids) if tok == placeholder_token_id
    ]
    if len(embeds_orig_positions) != len(prompt_embeds_tensors):
        raise ValueError(
            f"Expected {len(prompt_embeds_tensors)} prompt_embeds "
            f"placeholder tokens in the rendered prompt, found "
            f"{len(embeds_orig_positions)}."
        )

    mm_updates = _build_prompt_embeds_updates(
        prompt_embeds_tensors, placeholder_token_id
    )
    expanded = _expand_prompt_embeds_placeholders(token_ids, mm_updates)
    positions = _build_prompt_embeds_positions(
        expanded, len(prompt_embeds_tensors), mm_updates
    )

    embeds_prompt = cast(EmbedsPrompt, prompt)
    embeds_prompt["prompt_token_ids"] = expanded
    full_embeds, is_token_ids_mask = _build_mixed_prompt_embeds(
        expanded, prompt_embeds_tensors, positions
    )
    embeds_prompt["prompt_embeds"] = full_embeds
    embeds_prompt["prompt_is_token_ids"] = is_token_ids_mask

_process_tokens

_process_tokens(
    prompt: TokensPrompt, *, skip_mm_cache: bool = False
) -> TokensInput | MultiModalInput

Pre-expand prompt_embeds sentinels before delegating to the MM processor, then attach prompt_embeds modality data to the result.

Mixed mode only: the _prompt_embeds stash is set by render_messages when prompt_embeds co-exist with other MM data (images, audio, …). We expand each 1-token sentinel to an N-token span before calling super()._process_tokens() so the MM processor records all placeholder offsets in the final (post-expansion) coordinate space, so no offset shifting is needed afterwards.
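
A token-level sketch of the ordering, using hypothetical IDs (9999 = prompt_embeds sentinel, 8888 = image placeholder, N = 3):

# before _process_tokens:    [1, 9999, 8888, 2]
# after pre-expansion:       [1, 9999, 9999, 9999, 8888, 2]
# super()._process_tokens() then expands 8888 in place and records every
# placeholder offset against this final sequence, so the prompt_embeds span
# at offset 1 and the image span already share one coordinate space.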

Source code in vllm/renderers/hf.py
@override
def _process_tokens(
    self,
    prompt: TokensPrompt,
    *,
    skip_mm_cache: bool = False,
) -> TokensInput | MultiModalInput:
    """Pre-expand `prompt_embeds` sentinels before delegating to the MM
    processor, then attach `prompt_embeds` modality data to the result.

    Mixed mode only: the `_prompt_embeds` stash is set by
    `render_messages` when `prompt_embeds` co-exist with other MM data
    (images, audio, …).  We expand each 1-token sentinel to an N-token
    span *before* calling `super()._process_tokens()` so the MM
    processor records all placeholder offsets in the final (post-expansion)
    coordinate space, so no offset shifting is needed afterwards.
    """
    prompt_embeds_info = cast(dict, prompt).pop("_prompt_embeds", None)
    if prompt_embeds_info is not None:
        tensors, placeholder_token_id = prompt_embeds_info
        mm_updates = _build_prompt_embeds_updates(tensors, placeholder_token_id)
        cast(dict, prompt)["prompt_token_ids"] = _expand_prompt_embeds_placeholders(
            list(prompt["prompt_token_ids"]), mm_updates
        )
    engine_input = super()._process_tokens(prompt, skip_mm_cache=skip_mm_cache)
    if prompt_embeds_info is not None:
        tensors, _ = prompt_embeds_info
        self._apply_prompt_embeds_to_engine_input(
            cast(MultiModalInput, engine_input),
            tensors,
            mm_updates,
        )
    return engine_input

_process_tokens_async async

_process_tokens_async(
    prompt: TokensPrompt, *, skip_mm_cache: bool = False
) -> TokensInput | MultiModalInput

Async equivalent of _process_tokens.

Source code in vllm/renderers/hf.py
@override
async def _process_tokens_async(
    self,
    prompt: TokensPrompt,
    *,
    skip_mm_cache: bool = False,
) -> TokensInput | MultiModalInput:
    """Async equivalent of `_process_tokens`."""
    prompt_embeds_info = cast(dict, prompt).pop("_prompt_embeds", None)
    if prompt_embeds_info is not None:
        tensors, placeholder_token_id = prompt_embeds_info
        mm_updates = _build_prompt_embeds_updates(tensors, placeholder_token_id)
        cast(dict, prompt)["prompt_token_ids"] = _expand_prompt_embeds_placeholders(
            list(prompt["prompt_token_ids"]), mm_updates
        )
    engine_input = await super()._process_tokens_async(
        prompt, skip_mm_cache=skip_mm_cache
    )
    if prompt_embeds_info is not None:
        tensors, _ = prompt_embeds_info
        self._apply_prompt_embeds_to_engine_input(
            cast(MultiModalInput, engine_input),
            tensors,
            mm_updates,
        )
    return engine_input

_build_mixed_prompt_embeds

_build_mixed_prompt_embeds(
    token_ids: list[int],
    prompt_embeds_tensors: Sequence[Tensor],
    positions: list[tuple[int, int]],
) -> tuple[Tensor, list[bool]]

Build the full-length prompt_embeds tensor and the is_token_ids mask aligned to token_ids.
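
For example (shapes and values are illustrative), splicing one 2-row tensor at position 1 of a 6-token prompt:

import torch

full_embeds, is_token_ids = _build_mixed_prompt_embeds(
    token_ids=[1, 0, 0, 2, 3, 4],
    prompt_embeds_tensors=[torch.randn(2, 4096)],
    positions=[(1, 2)],
)
# full_embeds.shape == (6, 4096); rows 1:3 hold the tensor, the rest are zeros
# is_token_ids == [True, False, False, True, True, True]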

Source code in vllm/renderers/hf.py
def _build_mixed_prompt_embeds(
    token_ids: list[int],
    prompt_embeds_tensors: Sequence[torch.Tensor],
    positions: list[tuple[int, int]],
) -> tuple[torch.Tensor, list[bool]]:
    """Build the full-length `prompt_embeds` tensor and the `is_token_ids`
    mask aligned to `token_ids`."""
    total_len = len(token_ids)
    hidden_size = prompt_embeds_tensors[0].shape[1]
    dtype = prompt_embeds_tensors[0].dtype

    full_embeds = torch.zeros(total_len, hidden_size, dtype=dtype)
    is_token_ids = torch.ones(total_len, dtype=torch.bool)

    for (start, length), tensor in zip(positions, prompt_embeds_tensors, strict=True):
        full_embeds[start : start + length] = tensor
        is_token_ids[start : start + length] = False

    return full_embeds, is_token_ids.tolist()

_build_prompt_embeds_positions

_build_prompt_embeds_positions(
    token_ids: list[int],
    num_tensors: int,
    mm_prompt_updates: MultiModalPromptUpdates,
) -> list[tuple[int, int]]

Locate each prompt_embeds placeholder span in token_ids.

Expects token_ids to already contain expanded N-token spans. Returns [(start_idx, length), ...] aligned with the tensors.
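
For instance (hypothetical placeholder ID 9999), after two sentinels have been expanded into spans of 2 and 3 tokens, the expected result per this docstring is:

updates = _build_prompt_embeds_updates(
    [torch.randn(2, 4096), torch.randn(3, 4096)], placeholder_token_id=9999
)
_build_prompt_embeds_positions([1, 9999, 9999, 2, 9999, 9999, 9999], 2, updates)
# -> [(1, 2), (4, 3)]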

Source code in vllm/renderers/hf.py
def _build_prompt_embeds_positions(
    token_ids: list[int],
    num_tensors: int,
    mm_prompt_updates: MultiModalPromptUpdates,
) -> list[tuple[int, int]]:
    """Locate each prompt_embeds placeholder span in `token_ids`.

    Expects `token_ids` to already contain expanded N-token spans.
    Returns `[(start_idx, length), ...]` aligned with the tensors.
    """
    placeholders = find_mm_placeholders(
        prompt=token_ids,
        mm_prompt_updates=mm_prompt_updates,
        tokenizer=None,
    )
    features = placeholders.get("prompt_embeds", [])

    if len(features) != num_tensors:
        raise ValueError(
            _PROMPT_EMBEDS_PLACEHOLDER_SPAN_MISMATCH_ERROR.format(
                expected=num_tensors,
                actual=len(features),
            )
        )

    return [(f.start_idx, f.length) for f in features]

_build_prompt_embeds_updates

_build_prompt_embeds_updates(
    prompt_embeds_tensors: Sequence[Tensor],
    placeholder_token_id: int,
) -> MultiModalPromptUpdates

Build MultiModalPromptUpdates for prompt_embeds expansion.

Each tensor produces a PromptReplacement that maps [placeholder_token_id] -> [placeholder_token_id] x N (where N = tensor.shape[0]).
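
For two tensors of 2 and 3 rows (placeholder ID 9999 is hypothetical), the resulting replacements read:

# item 0: [9999] -> [9999, 9999]
# item 1: [9999] -> [9999, 9999, 9999]
updates = _build_prompt_embeds_updates(
    [torch.randn(2, 4096), torch.randn(3, 4096)], placeholder_token_id=9999
)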

Source code in vllm/renderers/hf.py
def _build_prompt_embeds_updates(
    prompt_embeds_tensors: Sequence[torch.Tensor],
    placeholder_token_id: int,
) -> MultiModalPromptUpdates:
    """Build `MultiModalPromptUpdates` for `prompt_embeds` expansion.

    Each tensor produces a `PromptReplacement` that maps
    `[placeholder_token_id]` -> `[placeholder_token_id] x N`
    (where `N = tensor.shape[0]`).
    """
    updates: list[Sequence[ResolvedPromptUpdate]] = []
    for i, tensor in enumerate(prompt_embeds_tensors):
        update = PromptReplacement(
            modality="prompt_embeds",
            target=[placeholder_token_id],
            replacement=[placeholder_token_id] * tensor.shape[0],
        )
        updates.append([update.resolve(item_idx=i)])
    return {"prompt_embeds": updates}

_ensure_prompt_embeds_placeholder_token

_ensure_prompt_embeds_placeholder_token(
    tokenizer: HfTokenizer,
) -> int

Register PROMPT_EMBEDS_PLACEHOLDER_TOKEN as a special token and return its token ID.
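
Usage sketch (tokenizer is any HfTokenizer instance; results are cached per tokenizer, so repeat calls are idempotent):

token_id = _ensure_prompt_embeds_placeholder_token(tokenizer)
assert _ensure_prompt_embeds_placeholder_token(tokenizer) == token_id
# `token_id` is the single ID the renderer later expands into N-token spans.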

Source code in vllm/renderers/hf.py
def _ensure_prompt_embeds_placeholder_token(tokenizer: HfTokenizer) -> int:
    """Register `PROMPT_EMBEDS_PLACEHOLDER_TOKEN` as a special token and return
    its token ID."""
    cached = _PROMPT_EMBEDS_PLACEHOLDER_TOKEN_ID_CACHE.get(tokenizer)
    if cached is not None:
        return cached

    tokenizer.add_special_tokens(
        {"additional_special_tokens": [PROMPT_EMBEDS_PLACEHOLDER_TOKEN]}
    )

    ids = tokenizer.encode(PROMPT_EMBEDS_PLACEHOLDER_TOKEN, add_special_tokens=False)
    if len(ids) != 1:
        raise RuntimeError(
            _PROMPT_EMBEDS_PLACEHOLDER_TOKEN_ID_ERROR.format(
                token=PROMPT_EMBEDS_PLACEHOLDER_TOKEN,
                num_ids=len(ids),
                ids=ids,
            )
        )

    token_id = ids[0]
    _PROMPT_EMBEDS_PLACEHOLDER_TOKEN_ID_CACHE[tokenizer] = token_id
    return token_id

_expand_prompt_embeds_placeholders

_expand_prompt_embeds_placeholders(
    token_ids: list[int],
    mm_prompt_updates: MultiModalPromptUpdates,
) -> list[int]

Expand each 1-token prompt_embeds sentinel into an N-token span.

Uses apply_token_matches. Each single placeholder token in token_ids is replaced with a consecutive span of tensor.shape[0] copies, following tensors in order.
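
Continuing the hypothetical updates from _build_prompt_embeds_updates above (placeholder ID 9999, tensors of 2 and 3 rows):

_expand_prompt_embeds_placeholders([1, 9999, 2, 9999], updates)
# -> [1, 9999, 9999, 2, 9999, 9999, 9999]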

Source code in vllm/renderers/hf.py
def _expand_prompt_embeds_placeholders(
    token_ids: list[int],
    mm_prompt_updates: MultiModalPromptUpdates,
) -> list[int]:
    """Expand each 1-token `prompt_embeds` sentinel into an N-token span.

    Uses `apply_token_matches`.  Each single placeholder token in
    `token_ids` is replaced with a consecutive span of
    `tensor.shape[0]` copies, following tensors in order.
    """
    expanded, _ = apply_token_matches(token_ids, mm_prompt_updates, tokenizer=None)
    return expanded

build_video_prompts_from_mm_data

build_video_prompts_from_mm_data(
    mm_data: MultiModalDataDict,
) -> list[str]

Build video prompts from vision_chunk data.

Collects prompts from video chunks and groups them by video_idx.

Parameters:

    mm_data (MultiModalDataDict, required):
        Processed multimodal data with vision_chunk items.

Returns:

    list[str]: List of video prompts, one per video.
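
A usage sketch; the vision_chunk item keys (type, video_idx, prompt) follow the source below, while the values are made up:

mm_data = {
    "vision_chunk": [
        {"type": "video_chunk", "video_idx": 0, "prompt": "<chunk0>"},
        {"type": "video_chunk", "video_idx": 0, "prompt": "<chunk1>"},
        {"type": "video_chunk", "video_idx": 1, "prompt": "<chunkA>"},
    ]
}
build_video_prompts_from_mm_data(mm_data)
# -> ["<chunk0><chunk1>", "<chunkA>"]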

Source code in vllm/renderers/hf.py
def build_video_prompts_from_mm_data(
    mm_data: MultiModalDataDict,
) -> list[str]:
    """Build video prompts from vision_chunk data.

    Collects prompts from video chunks and groups them by video_idx.

    Args:
        mm_data: Processed multimodal data with vision_chunk items

    Returns:
        List of video prompts, one per video.
    """
    vision_chunks = mm_data.get("vision_chunk")
    if vision_chunks is None:
        return []

    # Group chunks by video_idx
    video_prompts_dict: dict[int, list[str]] = defaultdict(list)

    for item in vision_chunks:
        # vision_chunk items are always dicts (VisionChunkImage/VisionChunkVideo)
        assert isinstance(item, dict)
        if item.get("type") == "video_chunk":
            video_idx = item.get("video_idx", 0)
            prompt = item.get("prompt", "")
            video_prompts_dict[video_idx].append(prompt)

    # Build prompts in video order
    video_prompts = [
        "".join(video_prompts_dict[video_idx])
        for video_idx in sorted(video_prompts_dict.keys())
    ]

    return video_prompts

rebuild_mm_uuids_from_mm_data

rebuild_mm_uuids_from_mm_data(
    mm_uuids: MultiModalUUIDDict,
    mm_data: MultiModalDataDict,
) -> MultiModalUUIDDict

Rebuild mm_uuids after vision_chunk processing.

When videos are split into chunks, the original UUIDs need to be updated to reflect the new UUIDs generated for each chunk.

Parameters:

    mm_uuids (MultiModalUUIDDict, required):
        Original UUIDs dictionary.
    mm_data (MultiModalDataDict, required):
        Processed multimodal data with vision_chunk items.

Returns:

    MultiModalUUIDDict: Updated UUIDs dictionary with chunk UUIDs.
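
A usage sketch with made-up UUIDs; per the source below, chunk UUIDs are collected in order and stored under the vision_chunk key without touching other entries:

mm_uuids = {"video": ["orig-video-uuid"]}
mm_data = {
    "vision_chunk": [
        {"type": "video_chunk", "uuid": "chunk-uuid-0"},
        {"type": "video_chunk", "uuid": "chunk-uuid-1"},
    ]
}
rebuild_mm_uuids_from_mm_data(mm_uuids, mm_data)
# -> {"video": ["orig-video-uuid"], "vision_chunk": ["chunk-uuid-0", "chunk-uuid-1"]}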

Source code in vllm/renderers/hf.py
def rebuild_mm_uuids_from_mm_data(
    mm_uuids: MultiModalUUIDDict,
    mm_data: MultiModalDataDict,
) -> MultiModalUUIDDict:
    """Rebuild mm_uuids after vision_chunk processing.

    When videos are split into chunks, the original UUIDs need to be updated
    to reflect the new UUIDs generated for each chunk.

    Args:
        mm_uuids: Original UUIDs dictionary
        mm_data: Processed multimodal data with vision_chunk items

    Returns:
        Updated UUIDs dictionary with chunk UUIDs
    """
    vision_chunks = mm_data.get("vision_chunk")
    if vision_chunks is None:
        return mm_uuids

    assert all(isinstance(item, dict) for item in vision_chunks), (
        "Expected all vision_chunk items to be dicts"
    )
    vision_chunks = cast(list[dict[str, Any]], vision_chunks)
    vision_chunk_uuids = [
        uuid_val for item in vision_chunks if (uuid_val := item.get("uuid")) is not None
    ]

    if vision_chunk_uuids:
        mm_uuids = dict(mm_uuids)
        mm_uuids["vision_chunk"] = vision_chunk_uuids

    return mm_uuids